{"id":"https://openalex.org/W4415002978","doi":"https://doi.org/10.1109/access.2025.3619782","title":"Foundation Models for Speech Enhancement Leveraging Consistency Constraints and Contrast Stretching","display_name":"Foundation Models for Speech Enhancement Leveraging Consistency Constraints and Contrast Stretching","publication_year":2025,"publication_date":"2025-01-01","ids":{"openalex":"https://openalex.org/W4415002978","doi":"https://doi.org/10.1109/access.2025.3619782"},"language":"en","primary_location":{"id":"doi:10.1109/access.2025.3619782","is_oa":true,"landing_page_url":"https://doi.org/10.1109/access.2025.3619782","pdf_url":null,"source":{"id":"https://openalex.org/S2485537415","display_name":"IEEE Access","issn_l":"2169-3536","issn":["2169-3536"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Access","raw_type":"journal-article"},"type":"article","indexed_in":["crossref","doaj"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://doi.org/10.1109/access.2025.3619782","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Muhammad Salman Khan","orcid":"https://orcid.org/0009-0006-7801-798X"},"institutions":[{"id":"https://openalex.org/I246010334","display_name":"Universit\u00e0 degli Studi di Enna Kore","ror":"https://ror.org/04vd28p53","country_code":"IT","type":"education","lineage":["https://openalex.org/I246010334"]}],"countries":["IT"],"is_corresponding":false,"raw_author_name":"Muhammad Salman Khan","raw_affiliation_strings":["University of Enna &#x201C;Kore,&#x201D;, Enna, Italy","University of Enna &#x201C;Kore&#x201D;, Enna, Italy"],"raw_orcid":"https://orcid.org/0009-0006-7801-798X","affiliations":[{"raw_affiliation_string":"University of Enna &#x201C;Kore,&#x201D;, Enna, Italy","institution_ids":["https://openalex.org/I246010334"]},{"raw_affiliation_string":"University of Enna &#x201C;Kore&#x201D;, Enna, Italy","institution_ids":["https://openalex.org/I246010334"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5068293208","display_name":"Valerio Mario Salerno","orcid":"https://orcid.org/0000-0002-1048-7380"},"institutions":[{"id":"https://openalex.org/I246010334","display_name":"Universit\u00e0 degli Studi di Enna Kore","ror":"https://ror.org/04vd28p53","country_code":"IT","type":"education","lineage":["https://openalex.org/I246010334"]}],"countries":["IT"],"is_corresponding":false,"raw_author_name":"Valerio Mario Salerno","raw_affiliation_strings":["University of Enna &#x201C;Kore,&#x201D;, Enna, Italy","University of Enna &#x201C;Kore&#x201D;, Enna, Italy"],"raw_orcid":"https://orcid.org/0000-0002-1048-7380","affiliations":[{"raw_affiliation_string":"University of Enna &#x201C;Kore,&#x201D;, Enna, Italy","institution_ids":["https://openalex.org/I246010334"]},{"raw_affiliation_string":"University of Enna &#x201C;Kore&#x201D;, Enna, Italy","institution_ids":["https://openalex.org/I246010334"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5078641012","display_name":"Moreno La Quatra","orcid":"https://orcid.org/0000-0001-8838-064X"},"institutions":[{"id":"https://openalex.org/I246010334","display_name":"Universit\u00e0 degli Studi di Enna Kore","ror":"https://ror.org/04vd28p53","country_code":"IT","type":"education","lineage":["https://openalex.org/I246010334"]}],"countries":["IT"],"is_corresponding":false,"raw_author_name":"Moreno la Quatra","raw_affiliation_strings":["University of Enna &#x201C;Kore,&#x201D;, Enna, Italy","University of Enna &#x201C;Kore&#x201D;, Enna, Italy"],"raw_orcid":"https://orcid.org/0000-0001-8838-064X","affiliations":[{"raw_affiliation_string":"University of Enna &#x201C;Kore,&#x201D;, Enna, Italy","institution_ids":["https://openalex.org/I246010334"]},{"raw_affiliation_string":"University of Enna &#x201C;Kore&#x201D;, Enna, Italy","institution_ids":["https://openalex.org/I246010334"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5071855826","display_name":"Kuo-Hsuan Hung","orcid":"https://orcid.org/0009-0007-0974-4873"},"institutions":[{"id":"https://openalex.org/I84653119","display_name":"Academia Sinica","ror":"https://ror.org/05bxb3784","country_code":"TW","type":"facility","lineage":["https://openalex.org/I84653119"]}],"countries":["TW"],"is_corresponding":false,"raw_author_name":"Kuo-Hsuan Hung","raw_affiliation_strings":["Academia Sinica, Taipei, Taiwan"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Academia Sinica, Taipei, Taiwan","institution_ids":["https://openalex.org/I84653119"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5071471469","display_name":"Szu\u2010Wei Fu","orcid":"https://orcid.org/0000-0002-3487-8212"},"institutions":[{"id":"https://openalex.org/I1304085615","display_name":"Nvidia (United Kingdom)","ror":"https://ror.org/02kr42612","country_code":"GB","type":"company","lineage":["https://openalex.org/I1304085615","https://openalex.org/I4210127875"]},{"id":"https://openalex.org/I4210127875","display_name":"Nvidia (United States)","ror":"https://ror.org/03jdj4y14","country_code":"US","type":"company","lineage":["https://openalex.org/I4210127875"]}],"countries":["GB","US"],"is_corresponding":false,"raw_author_name":"Szu-Wei Fu","raw_affiliation_strings":["Nvidia Corporation, Taipei, Taiwan"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Nvidia Corporation, Taipei, Taiwan","institution_ids":["https://openalex.org/I4210127875","https://openalex.org/I1304085615"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5044008055","display_name":"Yu Tsao","orcid":"https://orcid.org/0000-0001-6956-0418"},"institutions":[{"id":"https://openalex.org/I84653119","display_name":"Academia Sinica","ror":"https://ror.org/05bxb3784","country_code":"TW","type":"facility","lineage":["https://openalex.org/I84653119"]}],"countries":["TW"],"is_corresponding":false,"raw_author_name":"Yu Tsao","raw_affiliation_strings":["Academia Sinica, Taipei, Taiwan"],"raw_orcid":"https://orcid.org/0000-0001-6956-0418","affiliations":[{"raw_affiliation_string":"Academia Sinica, Taipei, Taiwan","institution_ids":["https://openalex.org/I84653119"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5079659476","display_name":"Sabato Marco Siniscalchi","orcid":"https://orcid.org/0000-0002-0770-0507"},"institutions":[{"id":"https://openalex.org/I900890020","display_name":"University of Palermo","ror":"https://ror.org/044k9ta02","country_code":"IT","type":"education","lineage":["https://openalex.org/I900890020"]}],"countries":["IT"],"is_corresponding":false,"raw_author_name":"Sabato Marco Siniscalchi","raw_affiliation_strings":["University of Palermo, Palermo, Italy"],"raw_orcid":"https://orcid.org/0000-0002-0770-0507","affiliations":[{"raw_affiliation_string":"University of Palermo, Palermo, Italy","institution_ids":["https://openalex.org/I900890020"]}]}],"institutions":[],"countries_distinct_count":4,"institutions_distinct_count":7,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":{"value":1850,"currency":"USD","value_usd":1850},"apc_paid":{"value":1850,"currency":"USD","value_usd":1850},"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.30106603,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"13","issue":null,"first_page":"175718","last_page":"175732"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9839000105857849,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9839000105857849,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/speech-enhancement","display_name":"Speech enhancement","score":0.8167999982833862},{"id":"https://openalex.org/keywords/spectrogram","display_name":"Spectrogram","score":0.7484999895095825},{"id":"https://openalex.org/keywords/leverage","display_name":"Leverage (statistics)","score":0.6317999958992004},{"id":"https://openalex.org/keywords/preprocessor","display_name":"Preprocessor","score":0.505299985408783},{"id":"https://openalex.org/keywords/consistency","display_name":"Consistency (knowledge bases)","score":0.4943000078201294},{"id":"https://openalex.org/keywords/fidelity","display_name":"Fidelity","score":0.47049999237060547},{"id":"https://openalex.org/keywords/contrast","display_name":"Contrast (vision)","score":0.43880000710487366},{"id":"https://openalex.org/keywords/perception","display_name":"Perception","score":0.4185999929904938},{"id":"https://openalex.org/keywords/noise","display_name":"Noise (video)","score":0.4041999876499176}],"concepts":[{"id":"https://openalex.org/C2776182073","wikidata":"https://www.wikidata.org/wiki/Q7575395","display_name":"Speech enhancement","level":3,"score":0.8167999982833862},{"id":"https://openalex.org/C45273575","wikidata":"https://www.wikidata.org/wiki/Q578970","display_name":"Spectrogram","level":2,"score":0.7484999895095825},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7300999760627747},{"id":"https://openalex.org/C153083717","wikidata":"https://www.wikidata.org/wiki/Q6535263","display_name":"Leverage (statistics)","level":2,"score":0.6317999958992004},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5665000081062317},{"id":"https://openalex.org/C34736171","wikidata":"https://www.wikidata.org/wiki/Q918333","display_name":"Preprocessor","level":2,"score":0.505299985408783},{"id":"https://openalex.org/C2776436953","wikidata":"https://www.wikidata.org/wiki/Q5163215","display_name":"Consistency (knowledge bases)","level":2,"score":0.4943000078201294},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.47380000352859497},{"id":"https://openalex.org/C2776459999","wikidata":"https://www.wikidata.org/wiki/Q2119376","display_name":"Fidelity","level":2,"score":0.47049999237060547},{"id":"https://openalex.org/C2776502983","wikidata":"https://www.wikidata.org/wiki/Q690182","display_name":"Contrast (vision)","level":2,"score":0.43880000710487366},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.4185999929904938},{"id":"https://openalex.org/C99498987","wikidata":"https://www.wikidata.org/wiki/Q2210247","display_name":"Noise (video)","level":3,"score":0.4041999876499176},{"id":"https://openalex.org/C29265498","wikidata":"https://www.wikidata.org/wiki/Q7047719","display_name":"Noise measurement","level":3,"score":0.3799999952316284},{"id":"https://openalex.org/C73000952","wikidata":"https://www.wikidata.org/wiki/Q17007827","display_name":"Discretization","level":2,"score":0.37369999289512634},{"id":"https://openalex.org/C103734657","wikidata":"https://www.wikidata.org/wiki/Q2739975","display_name":"PESQ","level":4,"score":0.35690000653266907},{"id":"https://openalex.org/C113364801","wikidata":"https://www.wikidata.org/wiki/Q26674","display_name":"High fidelity","level":2,"score":0.3465000092983246},{"id":"https://openalex.org/C60048801","wikidata":"https://www.wikidata.org/wiki/Q1433889","display_name":"Intelligibility (philosophy)","level":2,"score":0.3449000120162964},{"id":"https://openalex.org/C28855332","wikidata":"https://www.wikidata.org/wiki/Q198099","display_name":"Quantization (signal processing)","level":2,"score":0.33739998936653137},{"id":"https://openalex.org/C2779843651","wikidata":"https://www.wikidata.org/wiki/Q7390335","display_name":"SIGNAL (programming language)","level":2,"score":0.33489999175071716},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.33090001344680786},{"id":"https://openalex.org/C3018181011","wikidata":"https://www.wikidata.org/wiki/Q6849688","display_name":"Contrast enhancement","level":3,"score":0.3273000121116638},{"id":"https://openalex.org/C13944312","wikidata":"https://www.wikidata.org/wiki/Q7512748","display_name":"Signal-to-noise ratio (imaging)","level":2,"score":0.31540000438690186},{"id":"https://openalex.org/C55439883","wikidata":"https://www.wikidata.org/wiki/Q360812","display_name":"Correctness","level":2,"score":0.3127000033855438},{"id":"https://openalex.org/C99209842","wikidata":"https://www.wikidata.org/wiki/Q643696","display_name":"Speech perception","level":3,"score":0.3093000054359436},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.2953999936580658},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.2946000099182129},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.27129998803138733},{"id":"https://openalex.org/C9940772","wikidata":"https://www.wikidata.org/wiki/Q557399","display_name":"Psychoacoustics","level":3,"score":0.2703000009059906},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.2565999925136566},{"id":"https://openalex.org/C75553542","wikidata":"https://www.wikidata.org/wiki/Q178161","display_name":"A priori and a posteriori","level":2,"score":0.2556999921798706}],"mesh":[],"locations_count":3,"locations":[{"id":"doi:10.1109/access.2025.3619782","is_oa":true,"landing_page_url":"https://doi.org/10.1109/access.2025.3619782","pdf_url":null,"source":{"id":"https://openalex.org/S2485537415","display_name":"IEEE Access","issn_l":"2169-3536","issn":["2169-3536"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Access","raw_type":"journal-article"},{"id":"pmh:oai:doaj.org/article:1b5cc4af614646c58b4c92e0f8493561","is_oa":true,"landing_page_url":"https://doaj.org/article/1b5cc4af614646c58b4c92e0f8493561","pdf_url":null,"source":{"id":"https://openalex.org/S4306401280","display_name":"DOAJ (DOAJ: Directory of Open Access Journals)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by-sa","license_id":"https://openalex.org/licenses/cc-by-sa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"IEEE Access, Vol 13, Pp 175718-175732 (2025)","raw_type":"article"},{"id":"pmh:oai:iris.unipa.it:10447/701132","is_oa":true,"landing_page_url":"https://hdl.handle.net/10447/701132","pdf_url":null,"source":{"id":"https://openalex.org/S4377196289","display_name":"IRIS UniPA (University of Palermo)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I900890020","host_organization_name":"University of Palermo","host_organization_lineage":["https://openalex.org/I900890020"],"host_organization_lineage_names":[],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"info:eu-repo/semantics/article"}],"best_oa_location":{"id":"doi:10.1109/access.2025.3619782","is_oa":true,"landing_page_url":"https://doi.org/10.1109/access.2025.3619782","pdf_url":null,"source":{"id":"https://openalex.org/S2485537415","display_name":"IEEE Access","issn_l":"2169-3536","issn":["2169-3536"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Access","raw_type":"journal-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Foundation":[0],"models":[1,137],"(FM)":[2],"have":[3],"proven":[4],"effective":[5],"in":[6,114,205],"many":[7],"speech":[8,12,149],"applications":[9],"except":[10],"for":[11],"enhancement":[13],"(SE),":[14],"where":[15],"FM-based":[16,70,136,167,180],"SE":[17,48,71,170,181],"solutions":[18],"still":[19],"fall":[20],"short":[21],"with":[22,185,202],"respect":[23],"to":[24,31,68,122,132],"specialized":[25],"deep":[26],"architectures.":[27],"This":[28],"work":[29],"seeks":[30],"close":[32,163],"this":[33],"gap":[34,165],"by":[35,111],"systematically":[36],"assessing":[37],"and":[38,52,81,128,168],"contrasting":[39],"leading":[40],"pre-trained":[41],"FM":[42],"architectures":[43],"on":[44,53,154,174,189],"a":[45,203],"commonly":[46],"used":[47],"task,":[49],"namely":[50],"VoiceBank-Demand,":[51],"the":[54,107,115,124,146,155,164,175,178,197],"complex":[55],"Deep":[56],"Noise":[57],"Suppression":[58],"(DNS)":[59],"challenge.":[60],"Furthermore,":[61],"three":[62],"main":[63],"ideas":[64],"will":[65],"be":[66],"leveraged":[67],"boost":[69],"models,":[72],"namely:":[73],"(i)":[74],"Attention-based":[75],"mask":[76],"generation,":[77],"(ii)":[78],"consistency-preserving":[79],"loss,":[80],"(iii)":[82],"perceptual":[83,133,190],"contrast":[84,125],"stretching":[85],"(PCS).":[86],"Specifically,":[87],"frame-level":[88],"representations":[89],"are":[90,109],"effectively":[91],"modeled":[92],"using":[93,193],"conformer":[94],"layers,":[95],"which":[96,145],"leverage":[97],"an":[98,139],"attention":[99],"mechanism.":[100],"Inconsistency":[101],"effects":[102],"of":[103,126,196],"signal":[104,206],"reconstruction":[105],"from":[106,144],"spectrogram":[108],"mitigated":[110],"incorporating":[112],"consistency":[113],"loss":[116],"function.":[117],"Finally,":[118],"PCS":[119,210],"is":[120,150,212],"employed":[121],"improve":[123],"input":[127],"target":[129],"features":[130],"according":[131],"importance.":[134],"All":[135],"generate":[138],"Ideal":[140],"Ratio":[141],"Mask":[142],"(IRM)":[143],"estimated":[147],"clean":[148],"obtained.":[151],"Experimental":[152],"results":[153],"VoiceBank-DEMAND":[156],"task":[157],"demonstrate":[158],"that":[159],"our":[160],"approach":[161],"helps":[162],"between":[166],"SOTA":[169],"solutions.":[171],"When":[172],"tested":[173],"DNS":[176],"challenge,":[177],"proposed":[179,187],"solution":[182],"compares":[183],"favorably":[184],"previously":[186],"approaches":[188],"quality":[191],"metrics,":[192],"only":[194],"10%":[195],"available":[198],"training":[199],"material,":[200],"though":[201],"trade-off":[204],"fidelity":[207],"(SI-SDR)":[208],"when":[209],"preprocessing":[211],"applied.":[213]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-10T00:00:00"}
