{"id":"https://openalex.org/W4392902860","doi":"https://doi.org/10.1109/icassp48485.2024.10448121","title":"Mapache: Masked Parallel Transformer for Advanced Speech Editing and Synthesis","display_name":"Mapache: Masked Parallel Transformer for Advanced Speech Editing and Synthesis","publication_year":2024,"publication_date":"2024-03-18","ids":{"openalex":"https://openalex.org/W4392902860","doi":"https://doi.org/10.1109/icassp48485.2024.10448121"},"language":"en","primary_location":{"id":"doi:10.1109/icassp48485.2024.10448121","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp48485.2024.10448121","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5090819045","display_name":"Guillermo C\u00e1mbara","orcid":"https://orcid.org/0000-0002-8047-7700"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Guillermo C\u00e1mbara","raw_affiliation_strings":["Amazon Science"],"affiliations":[{"raw_affiliation_string":"Amazon Science","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5050897938","display_name":"Patrick Lumban Tobing","orcid":"https://orcid.org/0000-0003-2792-8418"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Patrick Lumban Tobing","raw_affiliation_strings":["Amazon Science"],"affiliations":[{"raw_affiliation_string":"Amazon Science","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5047648376","display_name":"Mikolaj Babianski","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Mikolaj Babianski","raw_affiliation_strings":["Amazon Science"],"affiliations":[{"raw_affiliation_string":"Amazon Science","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5069973475","display_name":"Ravichander Vipperla","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ravichander Vipperla","raw_affiliation_strings":["Amazon Science"],"affiliations":[{"raw_affiliation_string":"Amazon Science","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100334977","display_name":"Duo Wang","orcid":"https://orcid.org/0000-0003-1435-4961"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Duo Wang","raw_affiliation_strings":["Amazon Science"],"affiliations":[{"raw_affiliation_string":"Amazon Science","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5038367875","display_name":"Ron Shmelkin","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ron Shmelkin","raw_affiliation_strings":["Amazon Science"],"affiliations":[{"raw_affiliation_string":"Amazon Science","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5068933954","display_name":"Giuseppe Coccia","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Giuseppe Coccia","raw_affiliation_strings":["Amazon Science"],"affiliations":[{"raw_affiliation_string":"Amazon Science","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5083300352","display_name":"Orazio Angelini","orcid":"https://orcid.org/0000-0002-5057-7866"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Orazio Angelini","raw_affiliation_strings":["Amazon Science"],"affiliations":[{"raw_affiliation_string":"Amazon Science","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5083043476","display_name":"Arnaud Joly","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Arnaud Joly","raw_affiliation_strings":["Amazon Science"],"affiliations":[{"raw_affiliation_string":"Amazon Science","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5032468476","display_name":"Mateusz \u0141ajszczak","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Mateusz Lajszczak","raw_affiliation_strings":["Amazon Science"],"affiliations":[{"raw_affiliation_string":"Amazon Science","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5001028439","display_name":"Vincent Pollet","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Vincent Pollet","raw_affiliation_strings":["Amazon Science"],"affiliations":[{"raw_affiliation_string":"Amazon Science","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":11,"corresponding_author_ids":["https://openalex.org/A5090819045"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.7471,"has_fulltext":false,"cited_by_count":2,"citation_normalized_percentile":{"value":0.65074691,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":95,"max":96},"biblio":{"volume":null,"issue":null,"first_page":"10691","last_page":"10695"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9988999962806702,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9988999962806702,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9986000061035156,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.9879000186920166,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8132039308547974},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.6451781988143921},{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.6429305076599121},{"id":"https://openalex.org/keywords/autoregressive-model","display_name":"Autoregressive model","score":0.6042655110359192},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.5372609496116638},{"id":"https://openalex.org/keywords/speech-synthesis","display_name":"Speech synthesis","score":0.5106208920478821},{"id":"https://openalex.org/keywords/pipeline","display_name":"Pipeline (software)","score":0.48664984107017517},{"id":"https://openalex.org/keywords/speech-processing","display_name":"Speech processing","score":0.4496206045150757},{"id":"https://openalex.org/keywords/speech-coding","display_name":"Speech coding","score":0.41680964827537537},{"id":"https://openalex.org/keywords/acoustic-model","display_name":"Acoustic model","score":0.41657713055610657},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.40047985315322876},{"id":"https://openalex.org/keywords/programming-language","display_name":"Programming language","score":0.08771267533302307}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8132039308547974},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6451781988143921},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.6429305076599121},{"id":"https://openalex.org/C159877910","wikidata":"https://www.wikidata.org/wiki/Q2202883","display_name":"Autoregressive model","level":2,"score":0.6042655110359192},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.5372609496116638},{"id":"https://openalex.org/C14999030","wikidata":"https://www.wikidata.org/wiki/Q16346","display_name":"Speech synthesis","level":2,"score":0.5106208920478821},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.48664984107017517},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.4496206045150757},{"id":"https://openalex.org/C13895895","wikidata":"https://www.wikidata.org/wiki/Q3270773","display_name":"Speech coding","level":2,"score":0.41680964827537537},{"id":"https://openalex.org/C155635449","wikidata":"https://www.wikidata.org/wiki/Q4674699","display_name":"Acoustic model","level":3,"score":0.41657713055610657},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.40047985315322876},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.08771267533302307},{"id":"https://openalex.org/C149782125","wikidata":"https://www.wikidata.org/wiki/Q160039","display_name":"Econometrics","level":1,"score":0.0},{"id":"https://openalex.org/C165801399","wikidata":"https://www.wikidata.org/wiki/Q25428","display_name":"Voltage","level":2,"score":0.0},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C162324750","wikidata":"https://www.wikidata.org/wiki/Q8134","display_name":"Economics","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icassp48485.2024.10448121","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp48485.2024.10448121","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Quality Education","score":0.5400000214576721,"id":"https://metadata.un.org/sdg/4"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":29,"referenced_works":["https://openalex.org/W1597121597","https://openalex.org/W2962834855","https://openalex.org/W2963799213","https://openalex.org/W2981087920","https://openalex.org/W3036167779","https://openalex.org/W3197273793","https://openalex.org/W3215615641","https://openalex.org/W4224035735","https://openalex.org/W4252812408","https://openalex.org/W4288089799","https://openalex.org/W4301581299","https://openalex.org/W4303647933","https://openalex.org/W4307323391","https://openalex.org/W4312933868","https://openalex.org/W4313021454","https://openalex.org/W4313484371","https://openalex.org/W4313679638","https://openalex.org/W4366460484","https://openalex.org/W4376632512","https://openalex.org/W4377010126","https://openalex.org/W4381786045","https://openalex.org/W4390872297","https://openalex.org/W4400111385","https://openalex.org/W6769627184","https://openalex.org/W6771467084","https://openalex.org/W6779192484","https://openalex.org/W6779823529","https://openalex.org/W6799174933","https://openalex.org/W6805710207"],"related_works":["https://openalex.org/W1546240199","https://openalex.org/W193702574","https://openalex.org/W2126322296","https://openalex.org/W2063862874","https://openalex.org/W2164147372","https://openalex.org/W2053531689","https://openalex.org/W2154415461","https://openalex.org/W2550171623","https://openalex.org/W1842536210","https://openalex.org/W642007152"],"abstract_inverted_index":{"Recent":[0],"advancements":[1],"in":[2,89,117],"Generative":[3],"AI,":[4],"such":[5,134],"as":[6],"scaled":[7],"Transformer":[8],"large":[9],"language":[10,27,75],"models":[11,33,54,114],"(LLM)":[12],"and":[13,28,101],"diffusion":[14,79],"decoders,":[15],"have":[16,34],"revolutionized":[17],"speech":[18,21,41,47,57,74,91],"synthesis.":[19],"With":[20],"encompassing":[22],"the":[23,130],"complexities":[24],"of":[25,39,111,120],"natural":[26],"audio":[29,61],"dimensionality,":[30],"many":[31],"recent":[32],"relied":[35],"on":[36,128],"autoregressive":[37],"modeling":[38],"quantized":[40],"tokens.":[42],"Such":[43],"an":[44],"approach":[45],"limits":[46],"synthesis":[48,103],"to":[49,96],"left-to-right":[50],"generation,":[51],"making":[52],"these":[53,121],"unsuitable":[55],"for":[56,133],"edits":[58],"free":[59],"from":[60],"discontinuities.":[62],"We":[63],"introduce":[64],"Mapache,":[65],"a":[66,71,82],"novel":[67],"architecture":[68],"that":[69,93,105,115],"combines":[70],"non-autoregressive":[72,135],"masked":[73],"model":[76],"with":[77],"acoustic":[78],"modeling,":[80],"offering":[81],"unique,":[83],"fully":[84],"parallel":[85],"pipeline.":[86],"Mapache":[87],"excels":[88],"precise":[90],"editing":[92],"is":[94],"indiscernible":[95],"human":[97],"listeners,":[98],"exhibiting":[99],"inpainting":[100],"zero-shot":[102],"capabilities":[104],"either":[106],"surpass":[107],"or":[108],"rival":[109],"those":[110],"other":[112],"state-of-the-art":[113],"specialize":[116],"just":[118],"one":[119],"tasks.":[122],"This":[123],"paper":[124],"also":[125],"sheds":[126],"light":[127],"optimizing":[129],"decoding":[131],"process":[132],"models.":[136]},"counts_by_year":[{"year":2025,"cited_by_count":2}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
