{"id":"https://openalex.org/W7148410209","doi":"https://doi.org/10.1109/asru65441.2025.11434619","title":"ZO-ASR: Zeroth-Order Fine-Tuning of Speech Foundation Models without Back-Propagation","display_name":"ZO-ASR: Zeroth-Order Fine-Tuning of Speech Foundation Models without Back-Propagation","publication_year":2025,"publication_date":"2025-12-06","ids":{"openalex":"https://openalex.org/W7148410209","doi":"https://doi.org/10.1109/asru65441.2025.11434619"},"language":null,"primary_location":{"id":"doi:10.1109/asru65441.2025.11434619","is_oa":false,"landing_page_url":"https://doi.org/10.1109/asru65441.2025.11434619","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5120794135","display_name":"Y\u00fcn Peng","orcid":"https://orcid.org/0000-0001-6358-2333"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Yuezhang Peng","raw_affiliation_strings":["Shanghai Jiao Tong University,X-LANCE Lab, School of Computer Science, MoE Key Lab of Artificial Intelligence"],"affiliations":[{"raw_affiliation_string":"Shanghai Jiao Tong University,X-LANCE Lab, School of Computer Science, MoE Key Lab of Artificial Intelligence","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132800037","display_name":"Yuxin Liu","orcid":null},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yuxin Liu","raw_affiliation_strings":["Shanghai Jiao Tong University,X-LANCE Lab, School of Computer Science, MoE Key Lab of Artificial Intelligence"],"affiliations":[{"raw_affiliation_string":"Shanghai Jiao Tong University,X-LANCE Lab, School of Computer Science, MoE Key Lab of Artificial Intelligence","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132827717","display_name":"Yao Li","orcid":null},"institutions":[{"id":"https://openalex.org/I4210141264","display_name":"Shanghai Civil Aviation College","ror":"https://ror.org/03kkh3m26","country_code":"CN","type":"education","lineage":["https://openalex.org/I4210141264"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yao Li","raw_affiliation_strings":["Shanghai Aviation Electric Co., Ltd"],"affiliations":[{"raw_affiliation_string":"Shanghai Aviation Electric Co., Ltd","institution_ids":["https://openalex.org/I4210141264"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132796473","display_name":"Sheng Wang","orcid":null},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Sheng Wang","raw_affiliation_strings":["Shanghai Jiao Tong University,X-LANCE Lab, School of Computer Science, MoE Key Lab of Artificial Intelligence"],"affiliations":[{"raw_affiliation_string":"Shanghai Jiao Tong University,X-LANCE Lab, School of Computer Science, MoE Key Lab of Artificial Intelligence","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5034251794","display_name":"Fei Wen","orcid":"https://orcid.org/0000-0002-3083-9611"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Fei Wen","raw_affiliation_strings":["Shanghai Jiao Tong University,School of Information Science and Electronic Engineering/School of Integrated Circuits"],"affiliations":[{"raw_affiliation_string":"Shanghai Jiao Tong University,School of Information Science and Electronic Engineering/School of Integrated Circuits","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5132816935","display_name":"Xie Chen","orcid":null},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xie Chen","raw_affiliation_strings":["Shanghai Jiao Tong University,X-LANCE Lab, School of Computer Science, MoE Key Lab of Artificial Intelligence"],"affiliations":[{"raw_affiliation_string":"Shanghai Jiao Tong University,X-LANCE Lab, School of Computer Science, MoE Key Lab of Artificial Intelligence","institution_ids":["https://openalex.org/I183067930"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5120794135"],"corresponding_institution_ids":["https://openalex.org/I183067930"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.8735149,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"7"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9332000017166138,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9332000017166138,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.017000000923871994,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.009700000286102295,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/robustness","display_name":"Robustness (evolution)","score":0.6850000023841858},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.5626999735832214},{"id":"https://openalex.org/keywords/domain-adaptation","display_name":"Domain adaptation","score":0.5217999815940857},{"id":"https://openalex.org/keywords/word-error-rate","display_name":"Word error rate","score":0.5134999752044678},{"id":"https://openalex.org/keywords/reduction","display_name":"Reduction (mathematics)","score":0.38370001316070557},{"id":"https://openalex.org/keywords/adaptation","display_name":"Adaptation (eye)","score":0.3734999895095825},{"id":"https://openalex.org/keywords/speech-processing","display_name":"Speech processing","score":0.37130001187324524}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.781000018119812},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.6850000023841858},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6001999974250793},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5683000087738037},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.5626999735832214},{"id":"https://openalex.org/C2776434776","wikidata":"https://www.wikidata.org/wiki/Q19246213","display_name":"Domain adaptation","level":3,"score":0.5217999815940857},{"id":"https://openalex.org/C40969351","wikidata":"https://www.wikidata.org/wiki/Q3516228","display_name":"Word error rate","level":2,"score":0.5134999752044678},{"id":"https://openalex.org/C111335779","wikidata":"https://www.wikidata.org/wiki/Q3454686","display_name":"Reduction (mathematics)","level":2,"score":0.38370001316070557},{"id":"https://openalex.org/C139807058","wikidata":"https://www.wikidata.org/wiki/Q352374","display_name":"Adaptation (eye)","level":2,"score":0.3734999895095825},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.37130001187324524},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.35600000619888306},{"id":"https://openalex.org/C90805587","wikidata":"https://www.wikidata.org/wiki/Q10944557","display_name":"Word (group theory)","level":2,"score":0.35420000553131104},{"id":"https://openalex.org/C8038995","wikidata":"https://www.wikidata.org/wiki/Q1152135","display_name":"Unsupervised learning","level":2,"score":0.3231000006198883},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.3158000111579895},{"id":"https://openalex.org/C36503486","wikidata":"https://www.wikidata.org/wiki/Q11235244","display_name":"Domain (mathematical analysis)","level":2,"score":0.31540000438690186},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.2915000021457672},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.28859999775886536},{"id":"https://openalex.org/C23224414","wikidata":"https://www.wikidata.org/wiki/Q176769","display_name":"Hidden Markov model","level":2,"score":0.2712000012397766},{"id":"https://openalex.org/C103088060","wikidata":"https://www.wikidata.org/wiki/Q1062839","display_name":"Error detection and correction","level":2,"score":0.2694999873638153}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/asru65441.2025.11434619","is_oa":false,"landing_page_url":"https://doi.org/10.1109/asru65441.2025.11434619","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[{"id":"https://openalex.org/F4320307110","display_name":"Delta","ror":"https://ror.org/03g9c1e75"},{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":23,"referenced_works":["https://openalex.org/W1494198834","https://openalex.org/W2034996255","https://openalex.org/W2130158090","https://openalex.org/W2149479912","https://openalex.org/W2171830216","https://openalex.org/W2251939518","https://openalex.org/W2289394825","https://openalex.org/W2799473636","https://openalex.org/W2962684181","https://openalex.org/W2963364041","https://openalex.org/W3095410713","https://openalex.org/W3174770825","https://openalex.org/W3209059054","https://openalex.org/W4205991051","https://openalex.org/W4221153728","https://openalex.org/W4319862635","https://openalex.org/W4385807419","https://openalex.org/W4385822587","https://openalex.org/W4385823503","https://openalex.org/W4387321091","https://openalex.org/W4391021666","https://openalex.org/W4402112032","https://openalex.org/W4404782863"],"related_works":[],"abstract_inverted_index":{"Fine-tuning":[0],"pre-trained":[1],"speech":[2],"foundation":[3],"models":[4,48,116],"for":[5,113],"Automatic":[6],"Speech":[7],"Recognition":[8],"(ASR)":[9],"is":[10],"prevalent,":[11],"yet":[12],"constrained":[13],"by":[14,34],"substantial":[15],"GPU":[16],"memory":[17,33],"requirements.":[18],"We":[19],"introduce":[20],"ZO-ASR,":[21],"a":[22,110],"memory-efficient":[23],"Zeroth-Order":[24],"(ZO)":[25],"method":[26],"that":[27],"avoids":[28],"Back-Propagation":[29],"(BP)":[30],"and":[31,57,72],"activation":[32],"estimating":[35],"gradients":[36],"via":[37],"forward":[38],"passes.":[39],"When":[40],"combined":[41],"with":[42],"SGD":[43],"optimizer,":[44],"ZO-ASR-SGD":[45],"fine-tunes":[46],"ASR":[47,115],"using":[49],"only":[50],"inference":[51],"memory.":[52],"Our":[53,106],"evaluation":[54],"spans":[55],"supervised":[56],"unsupervised":[58,91],"tasks.":[59],"For":[60,90],"Supervised":[61],"Domain":[62],"Adaptation":[63,93],"on":[64,94],"Whisper-Large-V3,":[65],"ZO-ASR\u2019s":[66],"multiple":[67],"query":[68],"mechanism":[69],"enhances":[70],"robustness":[71],"achieves":[73],"up":[74],"to":[75,102],"an":[76],"18.9%":[77],"relative":[78],"Word":[79],"Error":[80],"Rate":[81],"reduction":[82],"over":[83],"zero-shot":[84],"baselines,":[85],"outperforming":[86],"existing":[87],"ZO":[88],"methods.":[89],"Test-Time":[92],"Wav2Vec2-Base,":[95],"ZO-ASR":[96],"exhibits":[97],"moderately":[98],"lower":[99],"performance":[100],"compared":[101],"first-order":[103],"optimizer":[104],"Adam.":[105],"BP-free":[107],"approach":[108],"provides":[109],"viable":[111],"solution":[112],"fine-tuning":[114],"in":[117],"computationally":[118],"resource-constrained":[119],"or":[120],"gradient-inaccessible":[121],"scenarios.":[122]},"counts_by_year":[],"updated_date":"2026-04-09T08:11:56.329763","created_date":"2026-04-03T00:00:00"}
