{"id":"https://openalex.org/W7123908307","doi":"https://doi.org/10.1109/mmsp64401.2025.11324371","title":"Frequency-Weighted Training Losses for Phoneme-Level DNN-based Speech Enhancement","display_name":"Frequency-Weighted Training Losses for Phoneme-Level DNN-based Speech Enhancement","publication_year":2025,"publication_date":"2025-09-21","ids":{"openalex":"https://openalex.org/W7123908307","doi":"https://doi.org/10.1109/mmsp64401.2025.11324371"},"language":null,"primary_location":{"id":"doi:10.1109/mmsp64401.2025.11324371","is_oa":false,"landing_page_url":"https://doi.org/10.1109/mmsp64401.2025.11324371","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE International Workshop on Multimedia Signal Processing (MMSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5093793090","display_name":"Nasser-Eddine Monir","orcid":"https://orcid.org/0009-0006-7531-2051"},"institutions":[{"id":"https://openalex.org/I4210121838","display_name":"Laboratoire Lorrain de Recherche en Informatique et ses Applications","ror":"https://ror.org/02vnf0c38","country_code":"FR","type":"facility","lineage":["https://openalex.org/I1294671590","https://openalex.org/I1294671590","https://openalex.org/I1326498283","https://openalex.org/I277688954","https://openalex.org/I4210107720","https://openalex.org/I4210121838","https://openalex.org/I4210159245","https://openalex.org/I90183372"]},{"id":"https://openalex.org/I1294671590","display_name":"Centre National de la Recherche Scientifique","ror":"https://ror.org/02feahw73","country_code":"FR","type":"funder","lineage":["https://openalex.org/I1294671590"]}],"countries":["FR"],"is_corresponding":true,"raw_author_name":"Nasser-Eddine Monir","raw_affiliation_strings":["Universit&#x00E9; de Lorraine,CNRS, Inria, Loria,Nancy,France"],"affiliations":[{"raw_affiliation_string":"Universit&#x00E9; de Lorraine,CNRS, Inria, Loria,Nancy,France","institution_ids":["https://openalex.org/I4210121838","https://openalex.org/I1294671590"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5067843269","display_name":"Paul Magron","orcid":"https://orcid.org/0000-0002-8561-0961"},"institutions":[{"id":"https://openalex.org/I4210121838","display_name":"Laboratoire Lorrain de Recherche en Informatique et ses Applications","ror":"https://ror.org/02vnf0c38","country_code":"FR","type":"facility","lineage":["https://openalex.org/I1294671590","https://openalex.org/I1294671590","https://openalex.org/I1326498283","https://openalex.org/I277688954","https://openalex.org/I4210107720","https://openalex.org/I4210121838","https://openalex.org/I4210159245","https://openalex.org/I90183372"]},{"id":"https://openalex.org/I1294671590","display_name":"Centre National de la Recherche Scientifique","ror":"https://ror.org/02feahw73","country_code":"FR","type":"funder","lineage":["https://openalex.org/I1294671590"]}],"countries":["FR"],"is_corresponding":false,"raw_author_name":"Paul Magron","raw_affiliation_strings":["Universit&#x00E9; de Lorraine,CNRS, Inria, Loria,Nancy,France"],"affiliations":[{"raw_affiliation_string":"Universit&#x00E9; de Lorraine,CNRS, Inria, Loria,Nancy,France","institution_ids":["https://openalex.org/I4210121838","https://openalex.org/I1294671590"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5122938031","display_name":"Romain Serizel","orcid":null},"institutions":[{"id":"https://openalex.org/I1294671590","display_name":"Centre National de la Recherche Scientifique","ror":"https://ror.org/02feahw73","country_code":"FR","type":"funder","lineage":["https://openalex.org/I1294671590"]},{"id":"https://openalex.org/I4210121838","display_name":"Laboratoire Lorrain de Recherche en Informatique et ses Applications","ror":"https://ror.org/02vnf0c38","country_code":"FR","type":"facility","lineage":["https://openalex.org/I1294671590","https://openalex.org/I1294671590","https://openalex.org/I1326498283","https://openalex.org/I277688954","https://openalex.org/I4210107720","https://openalex.org/I4210121838","https://openalex.org/I4210159245","https://openalex.org/I90183372"]}],"countries":["FR"],"is_corresponding":false,"raw_author_name":"Romain Serizel","raw_affiliation_strings":["Universit&#x00E9; de Lorraine,CNRS, Inria, Loria,Nancy,France"],"affiliations":[{"raw_affiliation_string":"Universit&#x00E9; de Lorraine,CNRS, Inria, Loria,Nancy,France","institution_ids":["https://openalex.org/I4210121838","https://openalex.org/I1294671590"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5093793090"],"corresponding_institution_ids":["https://openalex.org/I1294671590","https://openalex.org/I4210121838"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.6992706,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"310","last_page":"315"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9776999950408936,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9776999950408936,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10283","display_name":"Hearing Loss and Rehabilitation","score":0.00839999970048666,"subfield":{"id":"https://openalex.org/subfields/2805","display_name":"Cognitive Neuroscience"},"field":{"id":"https://openalex.org/fields/28","display_name":"Neuroscience"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.0038999998942017555,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/speech-enhancement","display_name":"Speech enhancement","score":0.847000002861023},{"id":"https://openalex.org/keywords/weighting","display_name":"Weighting","score":0.659600019454956},{"id":"https://openalex.org/keywords/voice-activity-detection","display_name":"Voice activity detection","score":0.48350000381469727},{"id":"https://openalex.org/keywords/noise","display_name":"Noise (video)","score":0.47209998965263367},{"id":"https://openalex.org/keywords/consonant","display_name":"Consonant","score":0.4530999958515167},{"id":"https://openalex.org/keywords/speech-processing","display_name":"Speech processing","score":0.387800008058548},{"id":"https://openalex.org/keywords/domain","display_name":"Domain (mathematical analysis)","score":0.3806999921798706},{"id":"https://openalex.org/keywords/background-noise","display_name":"Background noise","score":0.3677999973297119},{"id":"https://openalex.org/keywords/training","display_name":"Training (meteorology)","score":0.353300005197525}],"concepts":[{"id":"https://openalex.org/C2776182073","wikidata":"https://www.wikidata.org/wiki/Q7575395","display_name":"Speech enhancement","level":3,"score":0.847000002861023},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.7192999720573425},{"id":"https://openalex.org/C183115368","wikidata":"https://www.wikidata.org/wiki/Q856577","display_name":"Weighting","level":2,"score":0.659600019454956},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.628000020980835},{"id":"https://openalex.org/C204201278","wikidata":"https://www.wikidata.org/wiki/Q1332614","display_name":"Voice activity detection","level":3,"score":0.48350000381469727},{"id":"https://openalex.org/C99498987","wikidata":"https://www.wikidata.org/wiki/Q2210247","display_name":"Noise (video)","level":3,"score":0.47209998965263367},{"id":"https://openalex.org/C2778203577","wikidata":"https://www.wikidata.org/wiki/Q38035","display_name":"Consonant","level":3,"score":0.4530999958515167},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.387800008058548},{"id":"https://openalex.org/C36503486","wikidata":"https://www.wikidata.org/wiki/Q11235244","display_name":"Domain (mathematical analysis)","level":2,"score":0.3806999921798706},{"id":"https://openalex.org/C100675267","wikidata":"https://www.wikidata.org/wiki/Q1371624","display_name":"Background noise","level":2,"score":0.3677999973297119},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3546999990940094},{"id":"https://openalex.org/C2777211547","wikidata":"https://www.wikidata.org/wiki/Q17141490","display_name":"Training (meteorology)","level":2,"score":0.353300005197525},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.34529998898506165},{"id":"https://openalex.org/C19118579","wikidata":"https://www.wikidata.org/wiki/Q786423","display_name":"Frequency domain","level":2,"score":0.34310001134872437},{"id":"https://openalex.org/C13895895","wikidata":"https://www.wikidata.org/wiki/Q3270773","display_name":"Speech coding","level":2,"score":0.33899998664855957},{"id":"https://openalex.org/C13944312","wikidata":"https://www.wikidata.org/wiki/Q7512748","display_name":"Signal-to-noise ratio (imaging)","level":2,"score":0.32339999079704285},{"id":"https://openalex.org/C99209842","wikidata":"https://www.wikidata.org/wiki/Q643696","display_name":"Speech perception","level":3,"score":0.31940001249313354},{"id":"https://openalex.org/C29265498","wikidata":"https://www.wikidata.org/wiki/Q7047719","display_name":"Noise measurement","level":3,"score":0.310699999332428},{"id":"https://openalex.org/C103824480","wikidata":"https://www.wikidata.org/wiki/Q185889","display_name":"Time domain","level":2,"score":0.30730000138282776},{"id":"https://openalex.org/C61797465","wikidata":"https://www.wikidata.org/wiki/Q1188986","display_name":"Term (time)","level":2,"score":0.3057999908924103},{"id":"https://openalex.org/C163294075","wikidata":"https://www.wikidata.org/wiki/Q581861","display_name":"Noise reduction","level":2,"score":0.29319998621940613},{"id":"https://openalex.org/C155635449","wikidata":"https://www.wikidata.org/wiki/Q4674699","display_name":"Acoustic model","level":3,"score":0.29190000891685486},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.262800008058548},{"id":"https://openalex.org/C60048801","wikidata":"https://www.wikidata.org/wiki/Q1433889","display_name":"Intelligibility (philosophy)","level":2,"score":0.25839999318122864},{"id":"https://openalex.org/C168110828","wikidata":"https://www.wikidata.org/wiki/Q1331626","display_name":"Spectral density","level":2,"score":0.2581999897956848},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.25049999356269836}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/mmsp64401.2025.11324371","is_oa":false,"landing_page_url":"https://doi.org/10.1109/mmsp64401.2025.11324371","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE International Workshop on Multimedia Signal Processing (MMSP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":28,"referenced_works":["https://openalex.org/W1494198834","https://openalex.org/W1998648683","https://openalex.org/W2026829282","https://openalex.org/W2050758723","https://openalex.org/W2058079016","https://openalex.org/W2066218102","https://openalex.org/W2103869314","https://openalex.org/W2127851351","https://openalex.org/W2139916917","https://openalex.org/W2141998673","https://openalex.org/W2398042854","https://openalex.org/W2763188033","https://openalex.org/W2799963040","https://openalex.org/W2964058413","https://openalex.org/W2990078986","https://openalex.org/W3004309045","https://openalex.org/W3015372568","https://openalex.org/W3095717210","https://openalex.org/W3173852682","https://openalex.org/W3209472002","https://openalex.org/W4210849719","https://openalex.org/W4226071457","https://openalex.org/W4297841603","https://openalex.org/W4312701366","https://openalex.org/W4386102192","https://openalex.org/W4401609190","https://openalex.org/W4405324209","https://openalex.org/W4411119792"],"related_works":[],"abstract_inverted_index":{"Recent":[0],"advances":[1],"in":[2,47],"deep":[3],"learning":[4],"have":[5],"significantly":[6],"improved":[7],"multichannel":[8,107],"speech":[9,66,100,108],"enhancement":[10,109],"algorithms,":[11],"yet":[12],"conventional":[13],"training":[14],"loss":[15],"functions":[16],"such":[17,122],"as":[18,123],"the":[19,43,48,71,96,105,124],"scale-invariant":[20],"signal-to-distortion":[21],"ratio":[22],"(SDR)":[23],"may":[24],"fail":[25],"to":[26,61,150],"preserve":[27],"fine-grained":[28],"spectral":[29,88,140],"cues":[30],"essential":[31],"for":[32],"phoneme":[33],"intelligibility.":[34],"In":[35],"this":[36],"work,":[37],"we":[38],"propose":[39],"perceptually-informed":[40],"variants":[41],"of":[42,99,154],"SDR":[44,125],"loss,":[45],"formulated":[46],"time-frequency":[49,63],"domain":[50],"and":[51,81,91,101,141],"modulated":[52],"by":[53],"frequency-dependent":[54],"weighting":[55,93],"schemes.":[56],"These":[57],"weights":[58],"are":[59,126],"designed":[60],"emphasize":[62],"regions":[64],"where":[65,70],"is":[67,74],"prominent":[68],"or":[69],"interfering":[72],"noise":[73],"particularly":[75],"strong.":[76],"We":[77,103],"investigate":[78],"both":[79],"fixed":[80],"adaptive":[82],"strategies,":[83],"including":[84],"ANSI":[85],"band-importance":[86],"weights,":[87],"magnitude-based":[89],"weighting,":[90],"dynamic":[92],"based":[94],"on":[95],"relative":[97],"amount":[98],"noise.":[102],"train":[104],"FaSNet":[106],"model":[110],"using":[111],"these":[112],"various":[113],"losses.":[114],"Experimental":[115],"results":[116],"show":[117],"that":[118],"while":[119],"standard":[120],"metrics":[121],"only":[127],"marginally":[128],"improved,":[129],"their":[130],"perceptual":[131],"frequency-weighted":[132],"counterparts":[133],"exhibit":[134],"a":[135,151],"more":[136],"substantial":[137],"improvement.":[138],"Besides,":[139],"phoneme-level":[142],"analysis":[143],"indicates":[144],"better":[145,152],"consonant":[146],"reconstruction,":[147],"which":[148],"points":[149],"preservation":[153],"certain":[155],"acoustic":[156],"cues.":[157]},"counts_by_year":[],"updated_date":"2026-01-14T23:44:37.837170","created_date":"2026-01-14T00:00:00"}
