{"id":"https://openalex.org/W4395471007","doi":"https://doi.org/10.1109/taslp.2024.3393732","title":"Optimizing Audio-Visual Speech Enhancement Using Multi-Level Distortion Measures for Audio-Visual Speech Recognition","display_name":"Optimizing Audio-Visual Speech Enhancement Using Multi-Level Distortion Measures for Audio-Visual Speech Recognition","publication_year":2024,"publication_date":"2024-01-01","ids":{"openalex":"https://openalex.org/W4395471007","doi":"https://doi.org/10.1109/taslp.2024.3393732"},"language":"en","primary_location":{"id":"doi:10.1109/taslp.2024.3393732","is_oa":false,"landing_page_url":"https://doi.org/10.1109/taslp.2024.3393732","pdf_url":null,"source":{"id":"https://openalex.org/S4210169297","display_name":"IEEE/ACM Transactions on Audio Speech and Language Processing","issn_l":"2329-9290","issn":["2329-9290","2329-9304"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE/ACM Transactions on Audio, Speech, and Language Processing","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5029478820","display_name":"Hang Chen","orcid":"https://orcid.org/0000-0002-0904-8946"},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Hang Chen","raw_affiliation_strings":["National Engineering Research Center of Speech and Language Information Processing, University of Science and Technology of China, Hefei, China"],"affiliations":[{"raw_affiliation_string":"National Engineering Research Center of Speech and Language Information Processing, University of Science and Technology of China, Hefei, China","institution_ids":["https://openalex.org/I126520041"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100434900","display_name":"Qing Wang","orcid":"https://orcid.org/0000-0003-3843-3920"},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Qing Wang","raw_affiliation_strings":["National Engineering Research Center of Speech and Language Information Processing, University of Science and Technology of China, Hefei, China"],"affiliations":[{"raw_affiliation_string":"National Engineering Research Center of Speech and Language Information Processing, University of Science and Technology of China, Hefei, China","institution_ids":["https://openalex.org/I126520041"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5066595711","display_name":"Jun Du","orcid":"https://orcid.org/0000-0002-2387-0389"},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jun Du","raw_affiliation_strings":["National Engineering Research Center of Speech and Language Information Processing, University of Science and Technology of China, Hefei, China"],"affiliations":[{"raw_affiliation_string":"National Engineering Research Center of Speech and Language Information Processing, University of Science and Technology of China, Hefei, China","institution_ids":["https://openalex.org/I126520041"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5059859891","display_name":"Baocai Yin","orcid":"https://orcid.org/0000-0002-4164-6647"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bao-Cai Yin","raw_affiliation_strings":["iFLYTEK Research, Hefei, China"],"affiliations":[{"raw_affiliation_string":"iFLYTEK Research, Hefei, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101924451","display_name":"Jia Pan","orcid":"https://orcid.org/0000-0002-7073-1744"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jia Pan","raw_affiliation_strings":["iFLYTEK Research, Hefei, China"],"affiliations":[{"raw_affiliation_string":"iFLYTEK Research, Hefei, China","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5066868860","display_name":"Chin\u2010Hui Lee","orcid":"https://orcid.org/0000-0002-1892-2551"},"institutions":[{"id":"https://openalex.org/I130701444","display_name":"Georgia Institute of Technology","ror":"https://ror.org/01zkghx44","country_code":"US","type":"education","lineage":["https://openalex.org/I130701444"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Chin-Hui Lee","raw_affiliation_strings":["School of Electrical and Computer Engineering, Georgia Institute of Technology, Atlanta, GA, USA"],"affiliations":[{"raw_affiliation_string":"School of Electrical and Computer Engineering, Georgia Institute of Technology, Atlanta, GA, USA","institution_ids":["https://openalex.org/I130701444"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5029478820"],"corresponding_institution_ids":["https://openalex.org/I126520041"],"apc_list":null,"apc_paid":null,"fwci":2.6252,"has_fulltext":false,"cited_by_count":7,"citation_normalized_percentile":{"value":0.90389763,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":90,"max":99},"biblio":{"volume":"32","issue":null,"first_page":"2508","last_page":"2521"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10283","display_name":"Hearing Loss and Rehabilitation","score":0.9977999925613403,"subfield":{"id":"https://openalex.org/subfields/2805","display_name":"Cognitive Neuroscience"},"field":{"id":"https://openalex.org/fields/28","display_name":"Neuroscience"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},{"id":"https://openalex.org/T11233","display_name":"Advanced Adaptive Filtering Techniques","score":0.9961000084877014,"subfield":{"id":"https://openalex.org/subfields/2206","display_name":"Computational Mechanics"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7108110189437866},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.691737174987793},{"id":"https://openalex.org/keywords/intelligibility","display_name":"Intelligibility (philosophy)","score":0.5980758666992188},{"id":"https://openalex.org/keywords/correlation","display_name":"Correlation","score":0.550075888633728},{"id":"https://openalex.org/keywords/generalizability-theory","display_name":"Generalizability theory","score":0.4763524532318115},{"id":"https://openalex.org/keywords/logarithm","display_name":"Logarithm","score":0.4509177505970001},{"id":"https://openalex.org/keywords/leverage","display_name":"Leverage (statistics)","score":0.41923877596855164},{"id":"https://openalex.org/keywords/entropy","display_name":"Entropy (arrow of time)","score":0.41514983773231506},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.39406251907348633},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.37996912002563477},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.17292195558547974},{"id":"https://openalex.org/keywords/statistics","display_name":"Statistics","score":0.153550922870636}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7108110189437866},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.691737174987793},{"id":"https://openalex.org/C60048801","wikidata":"https://www.wikidata.org/wiki/Q1433889","display_name":"Intelligibility (philosophy)","level":2,"score":0.5980758666992188},{"id":"https://openalex.org/C117220453","wikidata":"https://www.wikidata.org/wiki/Q5172842","display_name":"Correlation","level":2,"score":0.550075888633728},{"id":"https://openalex.org/C27158222","wikidata":"https://www.wikidata.org/wiki/Q5532422","display_name":"Generalizability theory","level":2,"score":0.4763524532318115},{"id":"https://openalex.org/C39927690","wikidata":"https://www.wikidata.org/wiki/Q11197","display_name":"Logarithm","level":2,"score":0.4509177505970001},{"id":"https://openalex.org/C153083717","wikidata":"https://www.wikidata.org/wiki/Q6535263","display_name":"Leverage (statistics)","level":2,"score":0.41923877596855164},{"id":"https://openalex.org/C106301342","wikidata":"https://www.wikidata.org/wiki/Q4117933","display_name":"Entropy (arrow of time)","level":2,"score":0.41514983773231506},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.39406251907348633},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.37996912002563477},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.17292195558547974},{"id":"https://openalex.org/C105795698","wikidata":"https://www.wikidata.org/wiki/Q12483","display_name":"Statistics","level":1,"score":0.153550922870636},{"id":"https://openalex.org/C2524010","wikidata":"https://www.wikidata.org/wiki/Q8087","display_name":"Geometry","level":1,"score":0.0},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C111472728","wikidata":"https://www.wikidata.org/wiki/Q9471","display_name":"Epistemology","level":1,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C134306372","wikidata":"https://www.wikidata.org/wiki/Q7754","display_name":"Mathematical analysis","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/taslp.2024.3393732","is_oa":false,"landing_page_url":"https://doi.org/10.1109/taslp.2024.3393732","pdf_url":null,"source":{"id":"https://openalex.org/S4210169297","display_name":"IEEE/ACM Transactions on Audio Speech and Language Processing","issn_l":"2329-9290","issn":["2329-9290","2329-9304"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE/ACM Transactions on Audio, Speech, and Language Processing","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[{"score":0.41999998688697815,"id":"https://metadata.un.org/sdg/10","display_name":"Reduced inequalities"}],"awards":[{"id":"https://openalex.org/G1713836045","display_name":null,"funder_award_id":"62171427","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"}],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":83,"referenced_works":["https://openalex.org/W160800111","https://openalex.org/W1488856050","https://openalex.org/W1498436455","https://openalex.org/W1552314771","https://openalex.org/W1573024203","https://openalex.org/W1902954812","https://openalex.org/W1968518983","https://openalex.org/W1969975883","https://openalex.org/W1989034586","https://openalex.org/W2015394094","https://openalex.org/W2029199293","https://openalex.org/W2038010270","https://openalex.org/W2044893557","https://openalex.org/W2062164080","https://openalex.org/W2069681747","https://openalex.org/W2070316439","https://openalex.org/W2099731107","https://openalex.org/W2100495367","https://openalex.org/W2106488367","https://openalex.org/W2115086266","https://openalex.org/W2128653836","https://openalex.org/W2141998673","https://openalex.org/W2146324387","https://openalex.org/W2153038597","https://openalex.org/W2158501878","https://openalex.org/W2168379380","https://openalex.org/W2194775991","https://openalex.org/W2239141610","https://openalex.org/W2278884322","https://openalex.org/W2302255633","https://openalex.org/W2398042854","https://openalex.org/W2400339399","https://openalex.org/W2517616541","https://openalex.org/W2551572271","https://openalex.org/W2584390378","https://openalex.org/W2749337587","https://openalex.org/W2788241093","https://openalex.org/W2805233667","https://openalex.org/W2892129657","https://openalex.org/W2899829828","https://openalex.org/W2937998856","https://openalex.org/W2952218014","https://openalex.org/W2959214850","https://openalex.org/W2962785568","https://openalex.org/W2962892438","https://openalex.org/W2962960500","https://openalex.org/W2963045393","https://openalex.org/W2963082324","https://openalex.org/W2963218389","https://openalex.org/W2964171275","https://openalex.org/W2964207404","https://openalex.org/W2973049979","https://openalex.org/W2973133192","https://openalex.org/W2987989623","https://openalex.org/W3004146833","https://openalex.org/W3008400075","https://openalex.org/W3011424113","https://openalex.org/W3015785290","https://openalex.org/W3097934054","https://openalex.org/W3099330747","https://openalex.org/W3103003256","https://openalex.org/W3103619082","https://openalex.org/W3103902067","https://openalex.org/W3116298410","https://openalex.org/W3147539069","https://openalex.org/W3161480375","https://openalex.org/W3168662520","https://openalex.org/W3197284240","https://openalex.org/W3197912330","https://openalex.org/W3205012673","https://openalex.org/W4214698081","https://openalex.org/W4237532635","https://openalex.org/W4237742002","https://openalex.org/W4245919820","https://openalex.org/W4289665794","https://openalex.org/W4291920479","https://openalex.org/W4297841387","https://openalex.org/W4372260342","https://openalex.org/W4372347394","https://openalex.org/W6631190155","https://openalex.org/W6675117242","https://openalex.org/W6762114000","https://openalex.org/W6762503783"],"related_works":["https://openalex.org/W2118717649","https://openalex.org/W2413243053","https://openalex.org/W410723623","https://openalex.org/W2015341305","https://openalex.org/W2035068594","https://openalex.org/W4225593417","https://openalex.org/W2573498121","https://openalex.org/W3022298670","https://openalex.org/W3160494304","https://openalex.org/W2388888344"],"abstract_inverted_index":{"A":[0],"multi-level":[1,99],"distortion":[2,100],"measure":[3,101,114],"(MLDM)":[4],"is":[5,26,175],"proposed":[6],"as":[7,104],"an":[8,111],"objective":[9,138],"to":[10,27],"optimize":[11],"deep":[12],"neural":[13],"network-based":[14],"speech":[15,33,152],"enhancement":[16],"(SE)":[17],"in":[18,32,139,148,163],"both":[19,140,171],"audio-only":[20,143],"and":[21,36,72,79,95,110,142,155,173,186],"audio-visual":[22,141],"scenarios.":[23],"The":[24,189],"aim":[25],"achieve":[28],"simultaneous":[29],"performance":[30],"improvements":[31],"quality,":[34,153],"intelligibility,":[35,154],"recognition":[37,156],"error":[38,67],"reductions.":[39],"Moreover,":[40],"a":[41,105],"comprehensive":[42],"correlation":[43,54,113],"analysis":[44],"shows":[45],"that":[46,131],"these":[47],"three":[48,59,93,118,150],"evaluation":[49],"metrics":[50],"exhibit":[51],"high":[52],"Pearson":[53],"coefficient":[55],"(PCC)":[56],"values":[57],"with":[58,136],"commonly":[60],"used":[61],"optimization":[62],"objectives:":[63],"the":[64,69,85,89,92,117,123,168],"mean":[65],"squared":[66],"between":[68],"ideal":[70],"ratio":[71],"estimated":[73],"magnitude":[74],"masks,":[75],"scale-invariant":[76],"signal-to-noise":[77],"ratio,":[78],"cross-entropy-guided":[80],"measure.":[81],"To":[82],"further":[83],"improve":[84],"performance,":[86],"we":[87],"leverage":[88],"complementarities":[90],"of":[91,108,170],"objectives":[94],"propose":[96],"another":[97],"correlated":[98],"(C-MLDM)":[102],"defined":[103],"weighted":[106],"combination":[107],"MLDM":[109,132,162,172],"average":[112],"based":[115],"on":[116,122],"PCCs.":[119],"Experimental":[120],"results":[121],"TCD-TIMIT":[124],"corpus":[125],"corrupted":[126],"by":[127],"additive":[128],"noise":[129],"demonstrate":[130],"outperforms":[133,161],"systems":[134],"optimized":[135],"each":[137],"scenarios,":[144],"offering":[145],"improved":[146],"performances":[147],"all":[149,164],"metrics:":[151],"performance.":[157],"C-MLDM":[158,174],"also":[159],"consistently":[160],"test":[165],"cases.":[166],"Finally,":[167],"generalizability":[169],"confirmed":[176],"through":[177],"extensive":[178],"testing":[179],"across":[180],"diverse":[181],"datasets,":[182],"SE":[183],"model":[184],"architectures,":[185],"linguistic":[187],"conditions.":[188],"source":[190],"codes":[191],"are":[192],"publicly":[193],"available.":[194],"<sup":[195],"xmlns:mml=\"http://www.w3.org/1998/Math/MathML\"":[196],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">1</sup>":[197]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":5},{"year":2024,"cited_by_count":1}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
