{"id":"https://openalex.org/W4388642341","doi":"https://doi.org/10.1109/taslp.2023.3332542","title":"Decoupling and Interacting Multi-Task Learning Network for Joint Speech and Accent Recognition","display_name":"Decoupling and Interacting Multi-Task Learning Network for Joint Speech and Accent Recognition","publication_year":2023,"publication_date":"2023-11-13","ids":{"openalex":"https://openalex.org/W4388642341","doi":"https://doi.org/10.1109/taslp.2023.3332542"},"language":"en","primary_location":{"id":"doi:10.1109/taslp.2023.3332542","is_oa":false,"landing_page_url":"https://doi.org/10.1109/taslp.2023.3332542","pdf_url":null,"source":{"id":"https://openalex.org/S4210169297","display_name":"IEEE/ACM Transactions on Audio Speech and Language Processing","issn_l":"2329-9290","issn":["2329-9290","2329-9304"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE/ACM Transactions on Audio, Speech, and Language Processing","raw_type":"journal-article"},"type":"article","indexed_in":["arxiv","crossref"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2311.07062","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5018773485","display_name":"Qijie Shao","orcid":"https://orcid.org/0009-0000-2145-4077"},"institutions":[{"id":"https://openalex.org/I17145004","display_name":"Northwestern Polytechnical University","ror":"https://ror.org/01y0j0j86","country_code":"CN","type":"education","lineage":["https://openalex.org/I17145004"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Qijie Shao","raw_affiliation_strings":["Audio, Speech and Language Processing Group (ASLP), School of Computer Science, Northwestern Polytechnical University, Xi&#x0027;an, China"],"affiliations":[{"raw_affiliation_string":"Audio, Speech and Language Processing Group (ASLP), School of Computer Science, Northwestern Polytechnical University, Xi&#x0027;an, China","institution_ids":["https://openalex.org/I17145004"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101783173","display_name":"Pengcheng Guo","orcid":"https://orcid.org/0009-0001-2388-5935"},"institutions":[{"id":"https://openalex.org/I17145004","display_name":"Northwestern Polytechnical University","ror":"https://ror.org/01y0j0j86","country_code":"CN","type":"education","lineage":["https://openalex.org/I17145004"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Pengcheng Guo","raw_affiliation_strings":["Audio, Speech and Language Processing Group (ASLP), School of Computer Science, Northwestern Polytechnical University, Xi&#x0027;an, China"],"affiliations":[{"raw_affiliation_string":"Audio, Speech and Language Processing Group (ASLP), School of Computer Science, Northwestern Polytechnical University, Xi&#x0027;an, China","institution_ids":["https://openalex.org/I17145004"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5062247066","display_name":"Jinghao Yan","orcid":"https://orcid.org/0009-0008-6845-9201"},"institutions":[{"id":"https://openalex.org/I2250653659","display_name":"Tencent (China)","ror":"https://ror.org/00hhjss72","country_code":"CN","type":"company","lineage":["https://openalex.org/I2250653659"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jinghao Yan","raw_affiliation_strings":["Tencent Research, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Tencent Research, Beijing, China","institution_ids":["https://openalex.org/I2250653659"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100703624","display_name":"Pengfei Hu","orcid":"https://orcid.org/0009-0000-4537-6288"},"institutions":[{"id":"https://openalex.org/I2250653659","display_name":"Tencent (China)","ror":"https://ror.org/00hhjss72","country_code":"CN","type":"company","lineage":["https://openalex.org/I2250653659"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Pengfei Hu","raw_affiliation_strings":["Tencent Research, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Tencent Research, Beijing, China","institution_ids":["https://openalex.org/I2250653659"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100668966","display_name":"Lei Xie","orcid":"https://orcid.org/0000-0001-8234-0823"},"institutions":[{"id":"https://openalex.org/I17145004","display_name":"Northwestern Polytechnical University","ror":"https://ror.org/01y0j0j86","country_code":"CN","type":"education","lineage":["https://openalex.org/I17145004"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Lei Xie","raw_affiliation_strings":["Audio, Speech and Language Processing Group (ASLP), School of Computer Science, Northwestern Polytechnical University, Xi&#x0027;an, China"],"affiliations":[{"raw_affiliation_string":"Audio, Speech and Language Processing Group (ASLP), School of Computer Science, Northwestern Polytechnical University, Xi&#x0027;an, China","institution_ids":["https://openalex.org/I17145004"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5018773485"],"corresponding_institution_ids":["https://openalex.org/I17145004"],"apc_list":null,"apc_paid":null,"fwci":2.3365,"has_fulltext":true,"cited_by_count":13,"citation_normalized_percentile":{"value":0.90769641,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":98,"max":99},"biblio":{"volume":"32","issue":null,"first_page":"459","last_page":"470"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.998199999332428,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10403","display_name":"Phonetics and Phonology Research","score":0.9970999956130981,"subfield":{"id":"https://openalex.org/subfields/3205","display_name":"Experimental and Cognitive Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8050535917282104},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.70845627784729},{"id":"https://openalex.org/keywords/pronunciation","display_name":"Pronunciation","score":0.5936545729637146},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.48512253165245056},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.48246365785598755},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.4031457006931305},{"id":"https://openalex.org/keywords/linguistics","display_name":"Linguistics","score":0.08053851127624512}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8050535917282104},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.70845627784729},{"id":"https://openalex.org/C2780844864","wikidata":"https://www.wikidata.org/wiki/Q184377","display_name":"Pronunciation","level":2,"score":0.5936545729637146},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.48512253165245056},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.48246365785598755},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.4031457006931305},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.08053851127624512},{"id":"https://openalex.org/C187736073","wikidata":"https://www.wikidata.org/wiki/Q2920921","display_name":"Management","level":1,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C162324750","wikidata":"https://www.wikidata.org/wiki/Q8134","display_name":"Economics","level":0,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1109/taslp.2023.3332542","is_oa":false,"landing_page_url":"https://doi.org/10.1109/taslp.2023.3332542","pdf_url":null,"source":{"id":"https://openalex.org/S4210169297","display_name":"IEEE/ACM Transactions on Audio Speech and Language Processing","issn_l":"2329-9290","issn":["2329-9290","2329-9304"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE/ACM Transactions on Audio, Speech, and Language Processing","raw_type":"journal-article"},{"id":"pmh:oai:arXiv.org:2311.07062","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2311.07062","pdf_url":"https://arxiv.org/pdf/2311.07062","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2311.07062","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2311.07062","pdf_url":"https://arxiv.org/pdf/2311.07062","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","score":0.7200000286102295,"display_name":"Quality Education"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4388642341.pdf"},"referenced_works_count":58,"referenced_works":["https://openalex.org/W1494198834","https://openalex.org/W2101066392","https://openalex.org/W2117671523","https://openalex.org/W2127141656","https://openalex.org/W2194775991","https://openalex.org/W2294933947","https://openalex.org/W2514969556","https://openalex.org/W2566925314","https://openalex.org/W2603679025","https://openalex.org/W2766219058","https://openalex.org/W2794448670","https://openalex.org/W2889494795","https://openalex.org/W2950488390","https://openalex.org/W2962684181","https://openalex.org/W2962778134","https://openalex.org/W2962784628","https://openalex.org/W2962961016","https://openalex.org/W2963850025","https://openalex.org/W2964099675","https://openalex.org/W2964309797","https://openalex.org/W2973094925","https://openalex.org/W3012168345","https://openalex.org/W3023019055","https://openalex.org/W3023579893","https://openalex.org/W3024869864","https://openalex.org/W3035422681","https://openalex.org/W3036601975","https://openalex.org/W3044481399","https://openalex.org/W3095311338","https://openalex.org/W3096758241","https://openalex.org/W3097777922","https://openalex.org/W3130092735","https://openalex.org/W3151526698","https://openalex.org/W3160475509","https://openalex.org/W3161674998","https://openalex.org/W3162061711","https://openalex.org/W3162508345","https://openalex.org/W3163487528","https://openalex.org/W3197242296","https://openalex.org/W3197478142","https://openalex.org/W3197530164","https://openalex.org/W3197561413","https://openalex.org/W3200084433","https://openalex.org/W3209519721","https://openalex.org/W4210423654","https://openalex.org/W4224861836","https://openalex.org/W4292387508","https://openalex.org/W4296070193","https://openalex.org/W4301523212","https://openalex.org/W4319586049","https://openalex.org/W4319586290","https://openalex.org/W4372266510","https://openalex.org/W4385245566","https://openalex.org/W6739901393","https://openalex.org/W6749761576","https://openalex.org/W6780218876","https://openalex.org/W6790518089","https://openalex.org/W6803401863"],"related_works":["https://openalex.org/W2183593636","https://openalex.org/W2350724007","https://openalex.org/W2355751417","https://openalex.org/W2423284978","https://openalex.org/W2083922162","https://openalex.org/W2000075989","https://openalex.org/W4220683390","https://openalex.org/W2776838583","https://openalex.org/W2359469050","https://openalex.org/W3204019825"],"abstract_inverted_index":{"Accents":[0],"pose":[1],"significant":[2],"challenges":[3],"for":[4,48,89,157,167],"speech":[5,11,91],"recognition":[6,12,16],"systems.":[7],"Although":[8],"joint":[9,90],"automatic":[10],"(ASR)":[13],"and":[14,63,84,92,111,118,126,146,186,206,216,240],"accent":[15,41,93,172],"(AR)":[17],"training":[18],"has":[19],"been":[20],"proven":[21],"effective":[22],"in":[23],"handling":[24],"multi-accent":[25],"scenarios,":[26],"current":[27],"multi-task":[28],"ASR-AR":[29],"approaches":[30],"overlook":[31],"the":[32,65,147,158,161,168,182,200,204,210,220,223],"granularity":[33],"differences":[34],"between":[35],"tasks.":[36],"Fine-grained":[37],"units":[38,45,129],"capture":[39],"pronunciation-related":[40],"characteristics,":[42],"while":[43,171],"coarse-grained":[44],"are":[46,120,179],"better":[47],"learning":[49],"linguistic":[50],"information.":[51],"Moreover,":[52],"an":[53,105,108,151],"explicit":[54],"interaction":[55],"of":[56,98,222],"two":[57],"tasks":[58],"can":[59],"provide":[60],"complementary":[61,201],"information":[62,202],"improve":[64],"other's":[66],"performance,":[67],"but":[68],"it":[69],"is":[70,96,137,150,196],"rarely":[71],"used":[72],"by":[73,123],"existing":[74],"approaches.":[75],"In":[76],"this":[77],"paper,":[78],"we":[79],"propose":[80],"a":[81,99,112,192,256],"novel":[82],"Decoupling":[83],"Interacting":[85],"Multi-task":[86],"Network":[87],"(DIMNet)":[88],"recognition,":[94],"which":[95,225],"comprised":[97],"connectionist":[100],"temporal":[101],"classification":[102],"(CTC)":[103],"branch,":[104,107,110],"AR":[106,117,135,144,169,177,236],"ASR":[109,119,148,183,190,250],"bottom":[113],"feature":[114],"encoder.":[115],"Specifically,":[116],"first":[121],"decoupled":[122],"separated":[124],"branches":[125],"two-granular":[127],"modeling":[128],"to":[130,198],"learn":[131],"task-specific":[132],"representations.":[133],"The":[134],"branch":[136,149,163],"from":[138,175,203],"our":[139,176],"previously":[140],"proposed":[141],"linguistic-acoustic":[142],"bimodal":[143],"model":[145,178],"encoder-decoder":[152],"based":[153],"Conformer":[154],"model.":[155],"Then,":[156],"task":[159],"interaction,":[160],"CTC":[162,205],"provides":[164],"aligned":[165],"text":[166],"task,":[170],"embeddings":[173],"extracted":[174],"incorporated":[180],"into":[181],"branch's":[184],"encoder":[185],"decoder.":[187],"Finally,":[188],"during":[189],"inference,":[191],"cross-granular":[193],"rescoring":[194],"method":[195],"introduced":[197],"fuse":[199],"attention":[207],"decoder":[208],"after":[209],"decoupling.":[211],"Our":[212],"experiments":[213],"on":[214],"English":[215],"Chinese":[217],"datasets":[218],"demonstrate":[219],"effectiveness":[221],"DIMNet,":[224],"achieves":[226],"<inline-formula":[227,232,241,246],"xmlns:mml=\"http://www.w3.org/1998/Math/MathML\"":[228,233,242,247],"xmlns:xlink=\"http://www.w3.org/1999/xlink\"><tex-math":[229,234,243,248],"notation=\"LaTeX\">${21.45\\%}$</tex-math></inline-formula>":[230],"/":[231,245],"notation=\"LaTeX\">${28.53\\%}$</tex-math></inline-formula>":[235],"accuracy":[237],"relative":[238,253],"improvement":[239],"notation=\"LaTeX\">${32.33\\%}$</tex-math></inline-formula>":[244],"notation=\"LaTeX\">${14.55\\%}$</tex-math></inline-formula>":[249],"error":[251],"rate":[252],"reduction":[254],"over":[255],"published":[257],"standard":[258],"baseline,":[259],"respectively.":[260]},"counts_by_year":[{"year":2025,"cited_by_count":8},{"year":2024,"cited_by_count":5}],"updated_date":"2026-03-11T14:59:36.786465","created_date":"2023-11-14T00:00:00"}
