{"id":"https://openalex.org/W7127984325","doi":"https://doi.org/10.48550/arxiv.2602.04307","title":"Universal Robust Speech Adaptation for Cross-Domain Speech Recognition and Enhancement","display_name":"Universal Robust Speech Adaptation for Cross-Domain Speech Recognition and Enhancement","publication_year":2026,"publication_date":"2026-02-04","ids":{"openalex":"https://openalex.org/W7127984325","doi":"https://doi.org/10.48550/arxiv.2602.04307"},"language":null,"primary_location":{"id":"pmh:doi:10.48550/arxiv.2602.04307","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":null,"any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5006997733","display_name":"Chien-Chun Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Wang, Chien-Chun","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5125201859","display_name":"Hung-Shin Lee","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lee, Hung-Shin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5033417091","display_name":"H. L. Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Hsin-Min","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5125193491","display_name":"Berlin Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Berlin","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5006997733"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.5016000270843506,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.5016000270843506,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.46480000019073486,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11448","display_name":"Face recognition and analysis","score":0.008700000122189522,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/robustness","display_name":"Robustness (evolution)","score":0.5927000045776367},{"id":"https://openalex.org/keywords/domain-adaptation","display_name":"Domain adaptation","score":0.5121999979019165},{"id":"https://openalex.org/keywords/noise","display_name":"Noise (video)","score":0.5120999813079834},{"id":"https://openalex.org/keywords/channel","display_name":"Channel (broadcasting)","score":0.49549999833106995},{"id":"https://openalex.org/keywords/generalization","display_name":"Generalization","score":0.49000000953674316},{"id":"https://openalex.org/keywords/speech-enhancement","display_name":"Speech enhancement","score":0.4090999960899353},{"id":"https://openalex.org/keywords/encoder","display_name":"Encoder","score":0.4049000144004822},{"id":"https://openalex.org/keywords/voice-activity-detection","display_name":"Voice activity detection","score":0.39169999957084656},{"id":"https://openalex.org/keywords/regularization","display_name":"Regularization (linguistics)","score":0.38089999556541443}],"concepts":[{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.732200026512146},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7293000221252441},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.5927000045776367},{"id":"https://openalex.org/C2776434776","wikidata":"https://www.wikidata.org/wiki/Q19246213","display_name":"Domain adaptation","level":3,"score":0.5121999979019165},{"id":"https://openalex.org/C99498987","wikidata":"https://www.wikidata.org/wiki/Q2210247","display_name":"Noise (video)","level":3,"score":0.5120999813079834},{"id":"https://openalex.org/C127162648","wikidata":"https://www.wikidata.org/wiki/Q16858953","display_name":"Channel (broadcasting)","level":2,"score":0.49549999833106995},{"id":"https://openalex.org/C177148314","wikidata":"https://www.wikidata.org/wiki/Q170084","display_name":"Generalization","level":2,"score":0.49000000953674316},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4120999872684479},{"id":"https://openalex.org/C2776182073","wikidata":"https://www.wikidata.org/wiki/Q7575395","display_name":"Speech enhancement","level":3,"score":0.4090999960899353},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.4049000144004822},{"id":"https://openalex.org/C204201278","wikidata":"https://www.wikidata.org/wiki/Q1332614","display_name":"Voice activity detection","level":3,"score":0.39169999957084656},{"id":"https://openalex.org/C2776135515","wikidata":"https://www.wikidata.org/wiki/Q17143721","display_name":"Regularization (linguistics)","level":2,"score":0.38089999556541443},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.35499998927116394},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.3497999906539917},{"id":"https://openalex.org/C29265498","wikidata":"https://www.wikidata.org/wiki/Q7047719","display_name":"Noise measurement","level":3,"score":0.3395000100135803},{"id":"https://openalex.org/C99209842","wikidata":"https://www.wikidata.org/wiki/Q643696","display_name":"Speech perception","level":3,"score":0.335099995136261},{"id":"https://openalex.org/C100675267","wikidata":"https://www.wikidata.org/wiki/Q1371624","display_name":"Background noise","level":2,"score":0.33239999413490295},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.31769999861717224},{"id":"https://openalex.org/C167966045","wikidata":"https://www.wikidata.org/wiki/Q5532625","display_name":"Generative model","level":3,"score":0.2953000068664551},{"id":"https://openalex.org/C36503486","wikidata":"https://www.wikidata.org/wiki/Q11235244","display_name":"Domain (mathematical analysis)","level":2,"score":0.2946000099182129},{"id":"https://openalex.org/C13895895","wikidata":"https://www.wikidata.org/wiki/Q3270773","display_name":"Speech coding","level":2,"score":0.2922999858856201},{"id":"https://openalex.org/C2984842247","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep neural networks","level":3,"score":0.2847999930381775},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.2777000069618225},{"id":"https://openalex.org/C60048801","wikidata":"https://www.wikidata.org/wiki/Q1433889","display_name":"Intelligibility (philosophy)","level":2,"score":0.27300000190734863},{"id":"https://openalex.org/C103824480","wikidata":"https://www.wikidata.org/wiki/Q185889","display_name":"Time domain","level":2,"score":0.2572000026702881},{"id":"https://openalex.org/C40969351","wikidata":"https://www.wikidata.org/wiki/Q3516228","display_name":"Word error rate","level":2,"score":0.25529998540878296},{"id":"https://openalex.org/C139807058","wikidata":"https://www.wikidata.org/wiki/Q352374","display_name":"Adaptation (eye)","level":2,"score":0.25209999084472656},{"id":"https://openalex.org/C155635449","wikidata":"https://www.wikidata.org/wiki/Q4674699","display_name":"Acoustic model","level":3,"score":0.2500999867916107}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:doi:10.48550/arxiv.2602.04307","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},{"id":"doi:10.48550/arxiv.2602.04307","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2602.04307","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:doi:10.48550/arxiv.2602.04307","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"sustainable_development_goals":[{"display_name":"Peace, Justice and strong institutions","score":0.5840879678726196,"id":"https://metadata.un.org/sdg/16"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Pre-trained":[0],"models":[1,23],"for":[2],"automatic":[3],"speech":[4,8,102,108],"recognition":[5],"(ASR)":[6],"and":[7,18,42,57,69,83,160,169,182,199],"enhancement":[9],"(SE)":[10],"have":[11],"exhibited":[12],"remarkable":[13],"capabilities":[14],"under":[15],"matched":[16],"noise":[17,41,68,81,183],"channel":[19,43,70,85,171,181],"conditions.":[20,71],"However,":[21],"these":[22],"often":[24],"suffer":[25],"from":[26],"severe":[27],"performance":[28,198],"degradation":[29],"when":[30],"confronted":[31],"with":[32,89,113,179],"domain":[33,116],"shifts,":[34],"particularly":[35],"in":[36,50,66,158,164,196,201],"the":[37,105,114,139,186],"presence":[38],"of":[39,47,79,107,189,194],"unseen":[40,146],"distortions.":[44],"In":[45],"view":[46],"this,":[48],"we":[49,125],"this":[51],"paper":[52],"present":[53],"URSA-GAN,":[54,190],"a":[55,74,80,84,100,130],"unified":[56],"domain-aware":[58],"generative":[59],"framework":[60],"specifically":[61],"designed":[62],"to":[63,93,145],"mitigate":[64],"mismatches":[65],"both":[67,180],"URSA-GAN":[72,152],"leverages":[73],"dual-embedding":[75],"architecture":[76],"that":[77,109,134,151],"consists":[78],"encoder":[82],"encoder,":[86],"each":[87],"pre-trained":[88],"limited":[90],"in-domain":[91],"data":[92],"capture":[94],"domain-relevant":[95],"representations.":[96],"These":[97],"embeddings":[98,140],"condition":[99],"GAN-based":[101],"generator,":[103],"facilitating":[104],"synthesis":[106],"is":[110],"acoustically":[111],"aligned":[112],"target":[115],"while":[117],"preserving":[118],"phonetic":[119],"content.":[120],"To":[121],"enhance":[122],"generalization":[123,187],"further,":[124],"propose":[126],"dynamic":[127],"stochastic":[128],"perturbation,":[129],"novel":[131],"regularization":[132],"technique":[133],"introduces":[135],"controlled":[136],"variability":[137],"into":[138],"during":[141],"generation,":[142],"promoting":[143],"robustness":[144],"domains.":[147],"Empirical":[148],"results":[149],"demonstrate":[150],"effectively":[153],"reduces":[154],"character":[155],"error":[156],"rates":[157],"ASR":[159,197],"improves":[161],"perceptual":[162],"metrics":[163],"SE":[165,202],"across":[166],"diverse":[167],"noisy":[168],"mismatched":[170],"scenarios.":[172],"Notably,":[173],"evaluations":[174],"on":[175],"compound":[176],"test":[177],"conditions":[178],"degradations":[184],"confirm":[185],"ability":[188],"yielding":[191],"relative":[192],"improvements":[193],"16.16%":[195],"15.58%":[200],"metrics.":[203]},"counts_by_year":[],"updated_date":"2026-04-04T16:13:02.066488","created_date":"2026-02-07T00:00:00"}
