{"id":"https://openalex.org/W7130586922","doi":"https://doi.org/10.1109/fllm67465.2025.11391055","title":"Training Strategies for Speech Large Language Models: A Comprehensive Survey","display_name":"Training Strategies for Speech Large Language Models: A Comprehensive Survey","publication_year":2025,"publication_date":"2025-11-25","ids":{"openalex":"https://openalex.org/W7130586922","doi":"https://doi.org/10.1109/fllm67465.2025.11391055"},"language":null,"primary_location":{"id":"doi:10.1109/fllm67465.2025.11391055","is_oa":false,"landing_page_url":"https://doi.org/10.1109/fllm67465.2025.11391055","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 3rd International Conference on Foundation and Large Language Models (FLLM)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5126433493","display_name":"Shiqi Yang","orcid":null},"institutions":[{"id":"https://openalex.org/I57206974","display_name":"New York University","ror":"https://ror.org/0190ak572","country_code":"US","type":"education","lineage":["https://openalex.org/I57206974"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Shiqi Yang","raw_affiliation_strings":["New York University,New York,USA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"New York University,New York,USA","institution_ids":["https://openalex.org/I57206974"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126374752","display_name":"Ziyi Huang","orcid":null},"institutions":[{"id":"https://openalex.org/I58610484","display_name":"Seattle University","ror":"https://ror.org/02jqc0m91","country_code":"US","type":"education","lineage":["https://openalex.org/I58610484"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Ziyi Huang","raw_affiliation_strings":["Independent Researcher,Seattle,USA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Independent Researcher,Seattle,USA","institution_ids":["https://openalex.org/I58610484"]}]},{"author_position":"last","author":{"id":null,"display_name":"Wengran Xiao","orcid":null},"institutions":[{"id":"https://openalex.org/I27837315","display_name":"University of Michigan","ror":"https://ror.org/00jmfr291","country_code":"US","type":"education","lineage":["https://openalex.org/I27837315"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Wengran Xiao","raw_affiliation_strings":["University of Michigan,New York,USA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"University of Michigan,New York,USA","institution_ids":["https://openalex.org/I27837315"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.81716756,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"161","last_page":"171"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.5569999814033508,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.5569999814033508,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.07900000363588333,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12031","display_name":"Speech and dialogue systems","score":0.045899998396635056,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/generative-grammar","display_name":"Generative grammar","score":0.5931000113487244},{"id":"https://openalex.org/keywords/natural-language","display_name":"Natural language","score":0.515999972820282},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.49129998683929443},{"id":"https://openalex.org/keywords/taxonomy","display_name":"Taxonomy (biology)","score":0.4311999976634979},{"id":"https://openalex.org/keywords/training","display_name":"Training (meteorology)","score":0.4187000095844269},{"id":"https://openalex.org/keywords/modular-design","display_name":"Modular design","score":0.38909998536109924},{"id":"https://openalex.org/keywords/natural","display_name":"Natural (archaeology)","score":0.3889000117778778},{"id":"https://openalex.org/keywords/speech-processing","display_name":"Speech processing","score":0.3206000030040741}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.671500027179718},{"id":"https://openalex.org/C39890363","wikidata":"https://www.wikidata.org/wiki/Q36108","display_name":"Generative grammar","level":2,"score":0.5931000113487244},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.515999972820282},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.49129998683929443},{"id":"https://openalex.org/C58642233","wikidata":"https://www.wikidata.org/wiki/Q8269924","display_name":"Taxonomy (biology)","level":2,"score":0.4311999976634979},{"id":"https://openalex.org/C2777211547","wikidata":"https://www.wikidata.org/wiki/Q17141490","display_name":"Training (meteorology)","level":2,"score":0.4187000095844269},{"id":"https://openalex.org/C101468663","wikidata":"https://www.wikidata.org/wiki/Q1620158","display_name":"Modular design","level":2,"score":0.38909998536109924},{"id":"https://openalex.org/C2776608160","wikidata":"https://www.wikidata.org/wiki/Q4785462","display_name":"Natural (archaeology)","level":2,"score":0.3889000117778778},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.3206000030040741},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3091000020503998},{"id":"https://openalex.org/C504749915","wikidata":"https://www.wikidata.org/wiki/Q9010971","display_name":"Speech technology","level":3,"score":0.29829999804496765},{"id":"https://openalex.org/C14919245","wikidata":"https://www.wikidata.org/wiki/Q1976109","display_name":"Language technology","level":4,"score":0.29760000109672546},{"id":"https://openalex.org/C14999030","wikidata":"https://www.wikidata.org/wiki/Q16346","display_name":"Speech synthesis","level":2,"score":0.2865999937057495},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.28600001335144043},{"id":"https://openalex.org/C2776650193","wikidata":"https://www.wikidata.org/wiki/Q264661","display_name":"Obstacle","level":2,"score":0.2799000144004822},{"id":"https://openalex.org/C74672266","wikidata":"https://www.wikidata.org/wiki/Q815859","display_name":"Language acquisition","level":2,"score":0.2703000009059906},{"id":"https://openalex.org/C2779439875","wikidata":"https://www.wikidata.org/wiki/Q1078276","display_name":"Natural language understanding","level":3,"score":0.2630999982357025},{"id":"https://openalex.org/C2993724205","wikidata":"https://www.wikidata.org/wiki/Q315","display_name":"Human language","level":2,"score":0.2621999979019165},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.2563999891281128},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.25519999861717224},{"id":"https://openalex.org/C167966045","wikidata":"https://www.wikidata.org/wiki/Q5532625","display_name":"Generative model","level":3,"score":0.25270000100135803}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/fllm67465.2025.11391055","is_oa":false,"landing_page_url":"https://doi.org/10.1109/fllm67465.2025.11391055","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 3rd International Conference on Foundation and Large Language Models (FLLM)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"score":0.5163899064064026,"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":29,"referenced_works":["https://openalex.org/W2066452495","https://openalex.org/W2919290281","https://openalex.org/W3200318528","https://openalex.org/W3206252155","https://openalex.org/W3209059054","https://openalex.org/W4225596771","https://openalex.org/W4385245566","https://openalex.org/W4389524500","https://openalex.org/W4392903872","https://openalex.org/W4399027223","https://openalex.org/W4401016696","https://openalex.org/W4401042284","https://openalex.org/W4402111222","https://openalex.org/W4402112288","https://openalex.org/W4402118935","https://openalex.org/W4405706082","https://openalex.org/W4408305013","https://openalex.org/W4408345589","https://openalex.org/W4408401477","https://openalex.org/W4409098775","https://openalex.org/W4409462404","https://openalex.org/W4412171133","https://openalex.org/W4412944325","https://openalex.org/W4412944429","https://openalex.org/W4412944867","https://openalex.org/W4412945298","https://openalex.org/W4413147297","https://openalex.org/W4415432669","https://openalex.org/W4415433259"],"related_works":[],"abstract_inverted_index":{"Generative":[0],"large":[1],"language":[2,15],"models":[3,108],"(LLMs)":[4],"have":[5],"demonstrated":[6],"remarkable":[7],"capabilities":[8,29],"across":[9],"a":[10,113],"wide":[11],"range":[12],"of":[13,37,65,106,116,132],"natural":[14,35],"processing":[16],"tasks.":[17],"Building":[18],"on":[19],"this":[20,81],"progress,":[21],"there":[22],"is":[23],"growing":[24],"interest":[25],"in":[26],"extending":[27],"such":[28,69],"to":[30,51,128],"speech-based":[31],"interaction\u2014an":[32],"essential":[33],"and":[34,76,100,109,125,135],"form":[36],"human":[38],"communication":[39],"with":[40],"broad":[41],"real-world":[42,140],"applications.":[43],"Speech":[44,90,137],"LLMs":[45,138],"require":[46],"more":[47],"than":[48],"modular":[49],"extensions":[50],"existing":[52],"text-based":[53],"architectures;":[54],"they":[55],"demand":[56],"specialized,":[57],"unified":[58],"designs":[59],"that":[60],"address":[61],"the":[62,66,130],"unique":[63],"challenges":[64],"speech":[67],"modality,":[68],"as":[70],"acoustic":[71],"variability,":[72],"audio":[73],"data":[74],"scarcity,":[75],"cross-modal":[77],"alignment":[78],"challenges.":[79],"In":[80],"survey,":[82],"we":[83,111],"systematically":[84],"review":[85],"training":[86],"strategies":[87],"for":[88,139],"developing":[89],"LLMs,":[91],"organizing":[92],"them":[93],"into":[94],"three":[95],"key":[96],"stages:":[97],"pretraining,":[98],"fine-tuning,":[99],"post-training":[101],"alignment.":[102],"Through":[103],"an":[104],"analysis":[105],"recent":[107],"methods,":[110],"present":[112],"structured":[114],"taxonomy":[115],"these":[117],"strategies,":[118],"highlighting":[119],"current":[120],"limitations,":[121],"open":[122],"research":[123],"questions,":[124],"future":[126],"directions":[127],"guide":[129],"development":[131],"robust,":[133],"efficient,":[134],"well-aligned":[136],"deployment.":[141]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-02-20T00:00:00"}
