{"id":"https://openalex.org/W4372270165","doi":"https://doi.org/10.1109/icassp49357.2023.10096565","title":"A Multi-Scale Feature Aggregation Based Lightweight Network for Audio-Visual Speech Enhancement","display_name":"A Multi-Scale Feature Aggregation Based Lightweight Network for Audio-Visual Speech Enhancement","publication_year":2023,"publication_date":"2023-05-05","ids":{"openalex":"https://openalex.org/W4372270165","doi":"https://doi.org/10.1109/icassp49357.2023.10096565"},"language":"en","primary_location":{"id":"doi:10.1109/icassp49357.2023.10096565","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp49357.2023.10096565","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5008599015","display_name":"Haitao Xu","orcid":"https://orcid.org/0000-0001-9242-262X"},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Haitao Xu","raw_affiliation_strings":["University of Science and Technology of China (USTC),NERC-SLIP,Hefei,China","NERC-SLIP, University of Science and Technology of China (USTC), Hefei, China"],"affiliations":[{"raw_affiliation_string":"University of Science and Technology of China (USTC),NERC-SLIP,Hefei,China","institution_ids":["https://openalex.org/I126520041"]},{"raw_affiliation_string":"NERC-SLIP, University of Science and Technology of China (USTC), Hefei, China","institution_ids":["https://openalex.org/I126520041"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5111503656","display_name":"Liangfa Wei","orcid":null},"institutions":[{"id":"https://openalex.org/I4210121368","display_name":"Machine Science","ror":"https://ror.org/02hrr9v50","country_code":"US","type":"nonprofit","lineage":["https://openalex.org/I4210121368"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Liangfa Wei","raw_affiliation_strings":["Machine Learning Group,Ethereal Audio Lab,Shenzhen,China","Ethereal Audio Lab, Machine Learning Group, Shenzhen, China"],"affiliations":[{"raw_affiliation_string":"Machine Learning Group,Ethereal Audio Lab,Shenzhen,China","institution_ids":["https://openalex.org/I4210121368"]},{"raw_affiliation_string":"Ethereal Audio Lab, Machine Learning Group, Shenzhen, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100436848","display_name":"Jie Zhang","orcid":"https://orcid.org/0000-0003-1124-0854"},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jie Zhang","raw_affiliation_strings":["University of Science and Technology of China (USTC),NERC-SLIP,Hefei,China","NERC-SLIP, University of Science and Technology of China (USTC), Hefei, China"],"affiliations":[{"raw_affiliation_string":"University of Science and Technology of China (USTC),NERC-SLIP,Hefei,China","institution_ids":["https://openalex.org/I126520041"]},{"raw_affiliation_string":"NERC-SLIP, University of Science and Technology of China (USTC), Hefei, China","institution_ids":["https://openalex.org/I126520041"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100990693","display_name":"Jianming Yang","orcid":null},"institutions":[{"id":"https://openalex.org/I4210114105","display_name":"Tsinghua\u2013Berkeley Shenzhen Institute","ror":"https://ror.org/02hhwwz98","country_code":"CN","type":"facility","lineage":["https://openalex.org/I4210114105","https://openalex.org/I95457486","https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jianming Yang","raw_affiliation_strings":["Tsinghua Shenzhen International Graduate School,Shenzhen,China","Tsinghua Shenzhen International Graduate School, Shenzhen, China"],"affiliations":[{"raw_affiliation_string":"Tsinghua Shenzhen International Graduate School,Shenzhen,China","institution_ids":["https://openalex.org/I4210114105"]},{"raw_affiliation_string":"Tsinghua Shenzhen International Graduate School, Shenzhen, China","institution_ids":["https://openalex.org/I4210114105"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5084128157","display_name":"Yannan Wang","orcid":"https://orcid.org/0000-0001-7248-4954"},"institutions":[{"id":"https://openalex.org/I4210121368","display_name":"Machine Science","ror":"https://ror.org/02hrr9v50","country_code":"US","type":"nonprofit","lineage":["https://openalex.org/I4210121368"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Yannan Wang","raw_affiliation_strings":["Machine Learning Group,Ethereal Audio Lab,Shenzhen,China","Ethereal Audio Lab, Machine Learning Group, Shenzhen, China"],"affiliations":[{"raw_affiliation_string":"Machine Learning Group,Ethereal Audio Lab,Shenzhen,China","institution_ids":["https://openalex.org/I4210121368"]},{"raw_affiliation_string":"Ethereal Audio Lab, Machine Learning Group, Shenzhen, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101751664","display_name":"Tian Gao","orcid":"https://orcid.org/0000-0002-6987-5046"},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Tian Gao","raw_affiliation_strings":["University of Science and Technology of China (USTC),NERC-SLIP,Hefei,China","NERC-SLIP, University of Science and Technology of China (USTC), Hefei, China"],"affiliations":[{"raw_affiliation_string":"University of Science and Technology of China (USTC),NERC-SLIP,Hefei,China","institution_ids":["https://openalex.org/I126520041"]},{"raw_affiliation_string":"NERC-SLIP, University of Science and Technology of China (USTC), Hefei, China","institution_ids":["https://openalex.org/I126520041"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101716884","display_name":"Xin Fang","orcid":"https://orcid.org/0000-0003-4796-9444"},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xin Fang","raw_affiliation_strings":["University of Science and Technology of China (USTC),NERC-SLIP,Hefei,China","NERC-SLIP, University of Science and Technology of China (USTC), Hefei, China"],"affiliations":[{"raw_affiliation_string":"University of Science and Technology of China (USTC),NERC-SLIP,Hefei,China","institution_ids":["https://openalex.org/I126520041"]},{"raw_affiliation_string":"NERC-SLIP, University of Science and Technology of China (USTC), Hefei, China","institution_ids":["https://openalex.org/I126520041"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5110935514","display_name":"Lirong Dai","orcid":null},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Lirong Dai","raw_affiliation_strings":["University of Science and Technology of China (USTC),NERC-SLIP,Hefei,China","NERC-SLIP, University of Science and Technology of China (USTC), Hefei, China"],"affiliations":[{"raw_affiliation_string":"University of Science and Technology of China (USTC),NERC-SLIP,Hefei,China","institution_ids":["https://openalex.org/I126520041"]},{"raw_affiliation_string":"NERC-SLIP, University of Science and Technology of China (USTC), Hefei, China","institution_ids":["https://openalex.org/I126520041"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":8,"corresponding_author_ids":["https://openalex.org/A5008599015"],"corresponding_institution_ids":["https://openalex.org/I126520041"],"apc_list":null,"apc_paid":null,"fwci":0.8151,"has_fulltext":false,"cited_by_count":4,"citation_normalized_percentile":{"value":0.70134585,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":90,"max":97},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"5"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11233","display_name":"Advanced Adaptive Filtering Techniques","score":0.9980000257492065,"subfield":{"id":"https://openalex.org/subfields/2206","display_name":"Computational Mechanics"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10283","display_name":"Hearing Loss and Rehabilitation","score":0.994700014591217,"subfield":{"id":"https://openalex.org/subfields/2805","display_name":"Cognitive Neuroscience"},"field":{"id":"https://openalex.org/fields/28","display_name":"Neuroscience"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8167287707328796},{"id":"https://openalex.org/keywords/pooling","display_name":"Pooling","score":0.729189932346344},{"id":"https://openalex.org/keywords/scale","display_name":"Scale (ratio)","score":0.6107543706893921},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.5552998185157776},{"id":"https://openalex.org/keywords/software-deployment","display_name":"Software deployment","score":0.5532417893409729},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.5417592525482178},{"id":"https://openalex.org/keywords/audio-visual","display_name":"Audio visual","score":0.4620342552661896},{"id":"https://openalex.org/keywords/speech-enhancement","display_name":"Speech enhancement","score":0.45424920320510864},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.3834494352340698},{"id":"https://openalex.org/keywords/multimedia","display_name":"Multimedia","score":0.12796705961227417}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8167287707328796},{"id":"https://openalex.org/C70437156","wikidata":"https://www.wikidata.org/wiki/Q7228652","display_name":"Pooling","level":2,"score":0.729189932346344},{"id":"https://openalex.org/C2778755073","wikidata":"https://www.wikidata.org/wiki/Q10858537","display_name":"Scale (ratio)","level":2,"score":0.6107543706893921},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.5552998185157776},{"id":"https://openalex.org/C105339364","wikidata":"https://www.wikidata.org/wiki/Q2297740","display_name":"Software deployment","level":2,"score":0.5532417893409729},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5417592525482178},{"id":"https://openalex.org/C3017588708","wikidata":"https://www.wikidata.org/wiki/Q758901","display_name":"Audio visual","level":2,"score":0.4620342552661896},{"id":"https://openalex.org/C2776182073","wikidata":"https://www.wikidata.org/wiki/Q7575395","display_name":"Speech enhancement","level":3,"score":0.45424920320510864},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3834494352340698},{"id":"https://openalex.org/C49774154","wikidata":"https://www.wikidata.org/wiki/Q131765","display_name":"Multimedia","level":1,"score":0.12796705961227417},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0},{"id":"https://openalex.org/C163294075","wikidata":"https://www.wikidata.org/wiki/Q581861","display_name":"Noise reduction","level":2,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icassp49357.2023.10096565","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp49357.2023.10096565","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"},{"id":"https://openalex.org/F4320335787","display_name":"Fundamental Research Funds for the Central Universities","ror":null}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":40,"referenced_works":["https://openalex.org/W1522301498","https://openalex.org/W1552314771","https://openalex.org/W1974387177","https://openalex.org/W2015143272","https://openalex.org/W2044893557","https://openalex.org/W2067295501","https://openalex.org/W2069681747","https://openalex.org/W2115252128","https://openalex.org/W2123921160","https://openalex.org/W2159202424","https://openalex.org/W2194775991","https://openalex.org/W2770119437","https://openalex.org/W2795280371","https://openalex.org/W2900292050","https://openalex.org/W2935794029","https://openalex.org/W2963082324","https://openalex.org/W2964171275","https://openalex.org/W2964207404","https://openalex.org/W3011424113","https://openalex.org/W3015785290","https://openalex.org/W3024147341","https://openalex.org/W3028392891","https://openalex.org/W3047388929","https://openalex.org/W3097653961","https://openalex.org/W3097817799","https://openalex.org/W3136499730","https://openalex.org/W3157352387","https://openalex.org/W3160310387","https://openalex.org/W4214567298","https://openalex.org/W4221149546","https://openalex.org/W4245919820","https://openalex.org/W4253928870","https://openalex.org/W4283782204","https://openalex.org/W4289665794","https://openalex.org/W4394640090","https://openalex.org/W6631190155","https://openalex.org/W6677618333","https://openalex.org/W6746278845","https://openalex.org/W6761392404","https://openalex.org/W6781364056"],"related_works":["https://openalex.org/W2953234277","https://openalex.org/W2626256601","https://openalex.org/W147410782","https://openalex.org/W2900413183","https://openalex.org/W4390975304","https://openalex.org/W3022252430","https://openalex.org/W4287804464","https://openalex.org/W3103989898","https://openalex.org/W3211292372","https://openalex.org/W803346624"],"abstract_inverted_index":{"Audio-visual":[0],"speech":[1,16],"enhancement":[2],"(AVSE)":[3],"was":[4],"shown":[5,88],"to":[6,95,146],"be":[7],"superior":[8],"over":[9],"conventional":[10],"audio-only":[11],"counterpart":[12],"for":[13,34,65,104],"improving":[14],"the":[15,26,35,66,96,105,116,147],"quality.":[17],"However,":[18],"most":[19],"existing":[20,131],"AVSE":[21,48],"models":[22],"are":[23,63,126],"heavyweight":[24,148],"in":[25,128],"sense":[27],"of":[28],"parameter":[29],"count,":[30],"which":[31,109],"is":[32,87],"inappropriate":[33],"deployment":[36],"and":[37,57,68,79,133],"practical":[38],"applications.":[39],"In":[40],"this":[41],"paper,":[42],"we":[43,99],"therefore":[44],"present":[45],"a":[46,111,137,141],"lightweight":[47],"approach":[49],"(called":[50],"M3Net)":[51],"by":[52],"incorporating":[53],"several":[54],"multi-modality,":[55],"multi-scale":[56,61,72,76,80,91,118],"multi-branch":[58],"strategies.":[59],"Three":[60],"techniques":[62,125],"designed":[64,117],"visual":[67],"audio":[69],"streams,":[70],"including":[71],"average":[73],"pooling":[74],"(MSAP),":[75],"ResNet":[77],"(MSResNet)":[78],"short":[81],"time":[82],"Fourier":[83],"transform":[84],"(MSSTFT).":[85],"It":[86],"that":[89,123],"each":[90],"module":[92],"positively":[93],"contributes":[94],"performance.":[97],"Also,":[98],"consider":[100],"four":[101],"skip":[102],"connections":[103],"audio-visual":[106],"feature":[107],"aggregation,":[108],"have":[110],"great":[112],"complementary":[113],"effect":[114],"on":[115],"techniques.":[119],"Experimental":[120],"results":[121],"show":[122],"these":[124],"flexible":[127],"combination":[129],"with":[130,140],"approaches,":[132],"more":[134],"importantly":[135],"obtain":[136],"comparable":[138],"performance":[139],"smaller":[142],"model":[143],"size":[144],"compared":[145],"networks.":[149]},"counts_by_year":[{"year":2025,"cited_by_count":3},{"year":2024,"cited_by_count":1}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
