{"id":"https://openalex.org/W4403780667","doi":"https://doi.org/10.1145/3664647.3681539","title":"Convert and Speak: Zero-shot Accent Conversion with Minimum Supervision","display_name":"Convert and Speak: Zero-shot Accent Conversion with Minimum Supervision","publication_year":2024,"publication_date":"2024-10-26","ids":{"openalex":"https://openalex.org/W4403780667","doi":"https://doi.org/10.1145/3664647.3681539"},"language":"en","primary_location":{"id":"doi:10.1145/3664647.3681539","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3664647.3681539","pdf_url":null,"source":null,"license":"public-domain","license_id":"https://openalex.org/licenses/public-domain","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 32nd ACM International Conference on Multimedia","raw_type":"proceedings-article"},"type":"preprint","indexed_in":["arxiv","crossref"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2408.10096","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Zhijun Jia","orcid":"https://orcid.org/0009-0000-8842-2088"},"institutions":[{"id":"https://openalex.org/I881766915","display_name":"Nanjing University","ror":"https://ror.org/01rxvg760","country_code":"CN","type":"education","lineage":["https://openalex.org/I881766915"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Zhijun Jia","raw_affiliation_strings":["Nanjing University, Nanjing, China"],"raw_orcid":"https://orcid.org/0009-0000-8842-2088","affiliations":[{"raw_affiliation_string":"Nanjing University, Nanjing, China","institution_ids":["https://openalex.org/I881766915"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5031445620","display_name":"Huaying Xue","orcid":"https://orcid.org/0009-0002-7110-5574"},"institutions":[{"id":"https://openalex.org/I4210113369","display_name":"Microsoft Research Asia (China)","ror":"https://ror.org/0300m5276","country_code":"CN","type":"company","lineage":["https://openalex.org/I1290206253","https://openalex.org/I4210113369"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Huaying Xue","raw_affiliation_strings":["Microsoft Research Asia, Beijing, China"],"raw_orcid":"https://orcid.org/0009-0002-7110-5574","affiliations":[{"raw_affiliation_string":"Microsoft Research Asia, Beijing, China","institution_ids":["https://openalex.org/I4210113369"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5024977056","display_name":"Xiulian Peng","orcid":"https://orcid.org/0000-0001-8213-4878"},"institutions":[{"id":"https://openalex.org/I4210113369","display_name":"Microsoft Research Asia (China)","ror":"https://ror.org/0300m5276","country_code":"CN","type":"company","lineage":["https://openalex.org/I1290206253","https://openalex.org/I4210113369"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xiulian Peng","raw_affiliation_strings":["Microsoft Research Asia, Beijing, China"],"raw_orcid":"https://orcid.org/0000-0001-8213-4878","affiliations":[{"raw_affiliation_string":"Microsoft Research Asia, Beijing, China","institution_ids":["https://openalex.org/I4210113369"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100756584","display_name":"Yan Lu","orcid":"https://orcid.org/0000-0001-5383-6424"},"institutions":[{"id":"https://openalex.org/I4210113369","display_name":"Microsoft Research Asia (China)","ror":"https://ror.org/0300m5276","country_code":"CN","type":"company","lineage":["https://openalex.org/I1290206253","https://openalex.org/I4210113369"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yan Lu","raw_affiliation_strings":["Microsoft Research Asia, Beijing, China"],"raw_orcid":"https://orcid.org/0000-0001-5383-6424","affiliations":[{"raw_affiliation_string":"Microsoft Research Asia, Beijing, China","institution_ids":["https://openalex.org/I4210113369"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":[],"corresponding_institution_ids":["https://openalex.org/I881766915"],"apc_list":null,"apc_paid":null,"fwci":0.6623,"has_fulltext":true,"cited_by_count":2,"citation_normalized_percentile":{"value":0.75813366,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":98,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"4446","last_page":"4454"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9695000052452087,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9695000052452087,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/stress","display_name":"Stress (linguistics)","score":0.790013313293457},{"id":"https://openalex.org/keywords/zero","display_name":"Zero (linguistics)","score":0.7297881841659546},{"id":"https://openalex.org/keywords/shot","display_name":"Shot (pellet)","score":0.6030381917953491},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.5897865295410156},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.4054446220397949},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.3808457553386688},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.37950077652931213},{"id":"https://openalex.org/keywords/linguistics","display_name":"Linguistics","score":0.2837095260620117},{"id":"https://openalex.org/keywords/philosophy","display_name":"Philosophy","score":0.08531343936920166},{"id":"https://openalex.org/keywords/materials-science","display_name":"Materials science","score":0.08203563094139099}],"concepts":[{"id":"https://openalex.org/C2776756274","wikidata":"https://www.wikidata.org/wiki/Q181767","display_name":"Stress (linguistics)","level":2,"score":0.790013313293457},{"id":"https://openalex.org/C2780813799","wikidata":"https://www.wikidata.org/wiki/Q3274237","display_name":"Zero (linguistics)","level":2,"score":0.7297881841659546},{"id":"https://openalex.org/C2778344882","wikidata":"https://www.wikidata.org/wiki/Q278938","display_name":"Shot (pellet)","level":2,"score":0.6030381917953491},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5897865295410156},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.4054446220397949},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3808457553386688},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.37950077652931213},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.2837095260620117},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.08531343936920166},{"id":"https://openalex.org/C192562407","wikidata":"https://www.wikidata.org/wiki/Q228736","display_name":"Materials science","level":0,"score":0.08203563094139099},{"id":"https://openalex.org/C191897082","wikidata":"https://www.wikidata.org/wiki/Q11467","display_name":"Metallurgy","level":1,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1145/3664647.3681539","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3664647.3681539","pdf_url":null,"source":null,"license":"public-domain","license_id":"https://openalex.org/licenses/public-domain","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 32nd ACM International Conference on Multimedia","raw_type":"proceedings-article"},{"id":"pmh:oai:arXiv.org:2408.10096","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2408.10096","pdf_url":"https://arxiv.org/pdf/2408.10096","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2408.10096","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2408.10096","pdf_url":"https://arxiv.org/pdf/2408.10096","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[{"display_name":"Reduced inequalities","score":0.44999998807907104,"id":"https://metadata.un.org/sdg/10"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4403780667.pdf"},"referenced_works_count":18,"referenced_works":["https://openalex.org/W1494198834","https://openalex.org/W2743984732","https://openalex.org/W2888954148","https://openalex.org/W2890402938","https://openalex.org/W2964243274","https://openalex.org/W2972359262","https://openalex.org/W2973142754","https://openalex.org/W3015430779","https://openalex.org/W3161627112","https://openalex.org/W3178546316","https://openalex.org/W3204009030","https://openalex.org/W3209059054","https://openalex.org/W3209984917","https://openalex.org/W3215615641","https://openalex.org/W4296069266","https://openalex.org/W4367032484","https://openalex.org/W4377231659","https://openalex.org/W4381786045"],"related_works":["https://openalex.org/W2074502265","https://openalex.org/W4214877189","https://openalex.org/W2773965352","https://openalex.org/W2381179799","https://openalex.org/W4367680763","https://openalex.org/W2980279061","https://openalex.org/W2334685461","https://openalex.org/W2739335048","https://openalex.org/W1506224037","https://openalex.org/W4213177143"],"abstract_inverted_index":{"Low":[0],"resource":[1],"of":[2,9,76,94,111,121,132,178,204],"parallel":[3,83,122,180],"data":[4,84,103,181],"is":[5,37,48,139,183],"the":[6,16,35,41,46,52,69,82,87,92,99,102,109,119,128,160,187],"key":[7],"challenge":[8],"accent":[10,63,78,123,167,194],"conversion(AC)":[11],"problem":[12],"in":[13,33,61,166],"which":[14,34,182],"both":[15],"pronunciation":[17],"units":[18],"and":[19,45,80,107,130,171],"prosody":[20],"pattern":[21],"need":[22,120],"to":[23,72,115,141,154,186,210],"be":[24],"converted.":[25],"We":[26],"propose":[27],"a":[28,57,134,201],"two-stage":[29],"generative":[30,59,137],"framework":[31,162,199],"\"convert-and-speak\"":[32],"conversion":[36,157],"only":[38,175],"operated":[39],"on":[40,51,152],"semantic":[42,54,95],"token":[43,55,96],"level":[44],"speech":[47,58,79,124,169],"synthesized":[49],"conditioned":[50],"converted":[53],"with":[56,91,104,174,192,214],"model":[60,138],"target":[62,77],"domain.":[64],"The":[65],"decoupling":[66],"design":[67],"enables":[68],"\"speaking\"":[70],"module":[71],"use":[73],"massive":[74],"amount":[75],"relieves":[81,98],"required":[85],"for":[86,101],"\"conversion\"":[88],"module.":[89],"Conversion":[90],"bridge":[93],"also":[97],"requirement":[100],"text":[105],"transcriptions":[106],"unlocks":[108],"usage":[110],"language":[112],"pre-training":[113],"technology":[114],"further":[116],"efficiently":[117],"reduce":[118,127],"data.":[125,216],"To":[126],"complexity":[129],"latency":[131],"\"speaking\",":[133],"single-stage":[135],"AR":[136],"designed":[140],"achieve":[142],"good":[143],"quality":[144],"as":[145,147],"well":[146],"lower":[148],"computation":[149],"cost.":[150],"Experiments":[151],"Indian-English":[153],"general":[155],"American-English":[156],"show":[158],"that":[159,197],"proposed":[161],"achieves":[163],"state-of-the-art":[164],"performance":[165],"similarity,":[168],"quality,":[170],"speaker":[172],"maintenance":[173],"15":[176],"minutes":[177],"weakly":[179],"not":[184],"constrained":[185],"same":[188],"speaker.":[189],"Extensive":[190],"experimentation":[191],"diverse":[193],"types":[195],"suggests":[196],"this":[198],"possesses":[200],"high":[202],"degree":[203],"adaptability,":[205],"making":[206],"it":[207],"readily":[208],"scalable":[209],"accommodate":[211],"other":[212],"accents":[213],"low-resource":[215],"Audio":[217],"samples":[218],"are":[219],"available":[220],"at":[221],"https://www.microsoft.com/en-us/research/project/convert-and-speak-zero-shot-accent-conversion-with-minimumsupervision/.":[222]},"counts_by_year":[{"year":2026,"cited_by_count":2}],"updated_date":"2025-12-27T23:08:20.325037","created_date":"2025-10-10T00:00:00"}
