{"id":"https://openalex.org/W7123336686","doi":"https://doi.org/10.1109/jstsp.2026.3653157","title":"SLAM-LLM: A Modular, Open-Source Multimodal Large Language Model Framework and Best Practice for Speech, Language, Audio and Music Processing","display_name":"SLAM-LLM: A Modular, Open-Source Multimodal Large Language Model Framework and Best Practice for Speech, Language, Audio and Music Processing","publication_year":2026,"publication_date":"2026-01-01","ids":{"openalex":"https://openalex.org/W7123336686","doi":"https://doi.org/10.1109/jstsp.2026.3653157"},"language":"en","primary_location":{"id":"doi:10.1109/jstsp.2026.3653157","is_oa":true,"landing_page_url":"https://doi.org/10.1109/jstsp.2026.3653157","pdf_url":null,"source":{"id":"https://openalex.org/S42167783","display_name":"IEEE Journal of Selected Topics in Signal Processing","issn_l":"1932-4553","issn":["1932-4553","1941-0484"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Journal of Selected Topics in Signal Processing","raw_type":"journal-article"},"type":"article","indexed_in":["arxiv","crossref"],"open_access":{"is_oa":true,"oa_status":"hybrid","oa_url":"https://doi.org/10.1109/jstsp.2026.3653157","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5101838736","display_name":"Ziyang Ma","orcid":"https://orcid.org/0000-0002-8195-3262"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Ziyang Ma","raw_affiliation_strings":["X-LANCE Lab, School of Computer Science, MoE Key Lab of Artificial Intelligence Shanghai Jiao Tong University, Shanghai, China"],"raw_orcid":"https://orcid.org/0000-0002-8195-3262","affiliations":[{"raw_affiliation_string":"X-LANCE Lab, School of Computer Science, MoE Key Lab of Artificial Intelligence Shanghai Jiao Tong University, Shanghai, China","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5111310847","display_name":"Guanrou Yang","orcid":null},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Guanrou Yang","raw_affiliation_strings":["X-LANCE Lab, School of Computer Science, MoE Key Lab of Artificial Intelligence Shanghai Jiao Tong University, Shanghai, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"X-LANCE Lab, School of Computer Science, MoE Key Lab of Artificial Intelligence Shanghai Jiao Tong University, Shanghai, China","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5034677845","display_name":"Wenxi Chen","orcid":"https://orcid.org/0009-0005-8303-930X"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Wenxi Chen","raw_affiliation_strings":["X-LANCE Lab, School of Computer Science, MoE Key Lab of Artificial Intelligence Shanghai Jiao Tong University, Shanghai, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"X-LANCE Lab, School of Computer Science, MoE Key Lab of Artificial Intelligence Shanghai Jiao Tong University, Shanghai, China","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5122900751","display_name":"Zhifu Gao","orcid":null},"institutions":[{"id":"https://openalex.org/I45928872","display_name":"Alibaba Group (China)","ror":"https://ror.org/00k642b80","country_code":"CN","type":"company","lineage":["https://openalex.org/I45928872"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhifu Gao","raw_affiliation_strings":["Tongyi Lab, Alibaba Group, Hangzhou, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Tongyi Lab, Alibaba Group, Hangzhou, China","institution_ids":["https://openalex.org/I45928872"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101230916","display_name":"Yexing Du","orcid":null},"institutions":[{"id":"https://openalex.org/I4210136793","display_name":"Peng Cheng Laboratory","ror":"https://ror.org/03qdqbt06","country_code":"CN","type":"facility","lineage":["https://openalex.org/I4210136793"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yexing Du","raw_affiliation_strings":["Peng Cheng Laboratory, Guangdong, China"],"raw_orcid":"https://orcid.org/0009-0003-0513-2635","affiliations":[{"raw_affiliation_string":"Peng Cheng Laboratory, Guangdong, China","institution_ids":["https://openalex.org/I4210136793"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5104326367","display_name":"Xiquan Li","orcid":null},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xiquan Li","raw_affiliation_strings":["X-LANCE Lab, School of Computer Science, MoE Key Lab of Artificial Intelligence Shanghai Jiao Tong University, Shanghai, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"X-LANCE Lab, School of Computer Science, MoE Key Lab of Artificial Intelligence Shanghai Jiao Tong University, Shanghai, China","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5044538070","display_name":"Zhisheng Zhen","orcid":null},"institutions":[{"id":"https://openalex.org/I86519309","display_name":"The University of Texas at Austin","ror":"https://ror.org/00hj54h04","country_code":"US","type":"education","lineage":["https://openalex.org/I86519309"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Zhisheng Zheng","raw_affiliation_strings":["The University of Texas at Austin, Austin, TX, USA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"The University of Texas at Austin, Austin, TX, USA","institution_ids":["https://openalex.org/I86519309"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100514505","display_name":"Haina Zhu","orcid":null},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Haina Zhu","raw_affiliation_strings":["X-LANCE Lab, School of Computer Science, MoE Key Lab of Artificial Intelligence Shanghai Jiao Tong University, Shanghai, China"],"raw_orcid":"https://orcid.org/0009-0005-6286-5530","affiliations":[{"raw_affiliation_string":"X-LANCE Lab, School of Computer Science, MoE Key Lab of Artificial Intelligence Shanghai Jiao Tong University, Shanghai, China","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5110937314","display_name":"Jianheng Zhuo","orcid":null},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jianheng Zhuo","raw_affiliation_strings":["X-LANCE Lab, School of Computer Science, MoE Key Lab of Artificial Intelligence Shanghai Jiao Tong University, Shanghai, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"X-LANCE Lab, School of Computer Science, MoE Key Lab of Artificial Intelligence Shanghai Jiao Tong University, Shanghai, China","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5104331000","display_name":"Zheshu Song","orcid":null},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zheshu Song","raw_affiliation_strings":["X-LANCE Lab, School of Computer Science, MoE Key Lab of Artificial Intelligence Shanghai Jiao Tong University, Shanghai, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"X-LANCE Lab, School of Computer Science, MoE Key Lab of Artificial Intelligence Shanghai Jiao Tong University, Shanghai, China","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5089618727","display_name":"Ruiyang Xu","orcid":"https://orcid.org/0000-0002-8521-3181"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Ruiyang Xu","raw_affiliation_strings":["X-LANCE Lab, School of Computer Science, MoE Key Lab of Artificial Intelligence Shanghai Jiao Tong University, Shanghai, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"X-LANCE Lab, School of Computer Science, MoE Key Lab of Artificial Intelligence Shanghai Jiao Tong University, Shanghai, China","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5122844411","display_name":"Tiranrui Wang","orcid":null},"institutions":[{"id":"https://openalex.org/I162868743","display_name":"Tianjin University","ror":"https://ror.org/012tb2g32","country_code":"CN","type":"education","lineage":["https://openalex.org/I162868743"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Tiranrui Wang","raw_affiliation_strings":["Tianjin University, Tianjin, China"],"raw_orcid":"https://orcid.org/0000-0002-2765-5889","affiliations":[{"raw_affiliation_string":"Tianjin University, Tianjin, China","institution_ids":["https://openalex.org/I162868743"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Yifan Yang","orcid":"https://orcid.org/0009-0003-0588-1812"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yifan Yang","raw_affiliation_strings":["X-LANCE Lab, School of Computer Science, MoE Key Lab of Artificial Intelligence Shanghai Jiao Tong University, Shanghai, China"],"raw_orcid":"https://orcid.org/0009-0003-0588-1812","affiliations":[{"raw_affiliation_string":"X-LANCE Lab, School of Computer Science, MoE Key Lab of Artificial Intelligence Shanghai Jiao Tong University, Shanghai, China","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Yanqiao Zhu","orcid":"https://orcid.org/0009-0001-4066-2039"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yanqiao Zhu","raw_affiliation_strings":["X-LANCE Lab, School of Computer Science, MoE Key Lab of Artificial Intelligence Shanghai Jiao Tong University, Shanghai, China"],"raw_orcid":"https://orcid.org/0009-0001-4066-2039","affiliations":[{"raw_affiliation_string":"X-LANCE Lab, School of Computer Science, MoE Key Lab of Artificial Intelligence Shanghai Jiao Tong University, Shanghai, China","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Zhikang Niu","orcid":"https://orcid.org/0009-0007-1880-7434"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhikang Niu","raw_affiliation_strings":["X-LANCE Lab, School of Computer Science, MoE Key Lab of Artificial Intelligence Shanghai Jiao Tong University, Shanghai, China"],"raw_orcid":"https://orcid.org/0009-0007-1880-7434","affiliations":[{"raw_affiliation_string":"X-LANCE Lab, School of Computer Science, MoE Key Lab of Artificial Intelligence Shanghai Jiao Tong University, Shanghai, China","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5113065048","display_name":"L. Xue","orcid":null},"institutions":[{"id":"https://openalex.org/I200769079","display_name":"Hong Kong University of Science and Technology","ror":"https://ror.org/00q4vv597","country_code":"HK","type":"education","lineage":["https://openalex.org/I200769079"]}],"countries":["HK"],"is_corresponding":false,"raw_author_name":"Liumeng Xue","raw_affiliation_strings":["The Hong Kong University of Science and Technology, Hong Kong, SAR, China"],"raw_orcid":"https://orcid.org/0000-0003-2815-8494","affiliations":[{"raw_affiliation_string":"The Hong Kong University of Science and Technology, Hong Kong, SAR, China","institution_ids":["https://openalex.org/I200769079"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5027384378","display_name":"Yinghao Ma","orcid":"https://orcid.org/0000-0002-9984-2229"},"institutions":[{"id":"https://openalex.org/I166337079","display_name":"Queen Mary University of London","ror":"https://ror.org/026zzn846","country_code":"GB","type":"education","lineage":["https://openalex.org/I124357947","https://openalex.org/I166337079"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Yinghao Ma","raw_affiliation_strings":["Queen Mary University of London, London, U.K"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Queen Mary University of London, London, U.K","institution_ids":["https://openalex.org/I166337079"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5122885308","display_name":"Ruibin Yuan","orcid":null},"institutions":[{"id":"https://openalex.org/I200769079","display_name":"Hong Kong University of Science and Technology","ror":"https://ror.org/00q4vv597","country_code":"HK","type":"education","lineage":["https://openalex.org/I200769079"]}],"countries":["HK"],"is_corresponding":false,"raw_author_name":"Ruibin Yuan","raw_affiliation_strings":["The Hong Kong University of Science and Technology, Hong Kong, SAR, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"The Hong Kong University of Science and Technology, Hong Kong, SAR, China","institution_ids":["https://openalex.org/I200769079"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5122848586","display_name":"Shiliang Zhang","orcid":null},"institutions":[{"id":"https://openalex.org/I45928872","display_name":"Alibaba Group (China)","ror":"https://ror.org/00k642b80","country_code":"CN","type":"company","lineage":["https://openalex.org/I45928872"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Shiliang Zhang","raw_affiliation_strings":["Tongyi Lab, Alibaba Group, Hangzhou, China"],"raw_orcid":"https://orcid.org/0000-0003-1718-3686","affiliations":[{"raw_affiliation_string":"Tongyi Lab, Alibaba Group, Hangzhou, China","institution_ids":["https://openalex.org/I45928872"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5122871926","display_name":"Kai Yu","orcid":null},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Kai Yu","raw_affiliation_strings":["X-LANCE Lab, School of Computer Science, MoE Key Lab of Artificial Intelligence Shanghai Jiao Tong University, Shanghai, China"],"raw_orcid":"https://orcid.org/0000-0002-7102-9826","affiliations":[{"raw_affiliation_string":"X-LANCE Lab, School of Computer Science, MoE Key Lab of Artificial Intelligence Shanghai Jiao Tong University, Shanghai, China","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5120309277","display_name":"Eng Siong Chng","orcid":null},"institutions":[{"id":"https://openalex.org/I172675005","display_name":"Nanyang Technological University","ror":"https://ror.org/02e7b5302","country_code":"SG","type":"education","lineage":["https://openalex.org/I172675005"]}],"countries":["SG"],"is_corresponding":false,"raw_author_name":"Eng Siong Chng","raw_affiliation_strings":["Nanyang Technological University, Singapore"],"raw_orcid":"https://orcid.org/0000-0001-6257-7399","affiliations":[{"raw_affiliation_string":"Nanyang Technological University, Singapore","institution_ids":["https://openalex.org/I172675005"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5122900899","display_name":"Xie Chen","orcid":null},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xie Chen","raw_affiliation_strings":["X-LANCE Lab, School of Computer Science, MoE Key Lab of Artificial Intelligence Shanghai Jiao Tong University, Shanghai, China"],"raw_orcid":"https://orcid.org/0000-0001-7423-617X","affiliations":[{"raw_affiliation_string":"X-LANCE Lab, School of Computer Science, MoE Key Lab of Artificial Intelligence Shanghai Jiao Tong University, Shanghai, China","institution_ids":["https://openalex.org/I183067930"]}]}],"institutions":[],"countries_distinct_count":5,"institutions_distinct_count":22,"corresponding_author_ids":["https://openalex.org/A5101838736"],"corresponding_institution_ids":["https://openalex.org/I183067930"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":{"value":0.08619281,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"20","issue":"1","first_page":"63","last_page":"76"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.3262999951839447,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.3262999951839447,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.3181999921798706,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.03799999877810478,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/closed-captioning","display_name":"Closed captioning","score":0.8220999836921692},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.5171999931335449},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.4887999892234802},{"id":"https://openalex.org/keywords/best-practice","display_name":"Best practice","score":0.4747999906539917},{"id":"https://openalex.org/keywords/audio-signal-processing","display_name":"Audio signal processing","score":0.4348999857902527},{"id":"https://openalex.org/keywords/digital-audio","display_name":"Digital audio","score":0.41100001335144043},{"id":"https://openalex.org/keywords/code","display_name":"Code (set theory)","score":0.3986999988555908},{"id":"https://openalex.org/keywords/audio-analyzer","display_name":"Audio analyzer","score":0.3743000030517578}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8529000282287598},{"id":"https://openalex.org/C157657479","wikidata":"https://www.wikidata.org/wiki/Q2367247","display_name":"Closed captioning","level":3,"score":0.8220999836921692},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.5171999931335449},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.4887999892234802},{"id":"https://openalex.org/C184356942","wikidata":"https://www.wikidata.org/wiki/Q830382","display_name":"Best practice","level":2,"score":0.4747999906539917},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4657000005245209},{"id":"https://openalex.org/C127220857","wikidata":"https://www.wikidata.org/wiki/Q2719318","display_name":"Audio signal processing","level":4,"score":0.4348999857902527},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.42730000615119934},{"id":"https://openalex.org/C87687168","wikidata":"https://www.wikidata.org/wiki/Q173114","display_name":"Digital audio","level":4,"score":0.41100001335144043},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.3986999988555908},{"id":"https://openalex.org/C160372630","wikidata":"https://www.wikidata.org/wiki/Q4819855","display_name":"Audio analyzer","level":5,"score":0.3743000030517578},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.36899998784065247},{"id":"https://openalex.org/C101468663","wikidata":"https://www.wikidata.org/wiki/Q1620158","display_name":"Modular design","level":2,"score":0.3677000105381012},{"id":"https://openalex.org/C2777946086","wikidata":"https://www.wikidata.org/wiki/Q1163335","display_name":"Music information retrieval","level":3,"score":0.35899999737739563},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.3440999984741211},{"id":"https://openalex.org/C86251818","wikidata":"https://www.wikidata.org/wiki/Q816754","display_name":"Benchmarking","level":2,"score":0.3391999900341034},{"id":"https://openalex.org/C2780226545","wikidata":"https://www.wikidata.org/wiki/Q6888030","display_name":"Modality (human\u2013computer interaction)","level":2,"score":0.33739998936653137},{"id":"https://openalex.org/C49774154","wikidata":"https://www.wikidata.org/wiki/Q131765","display_name":"Multimedia","level":1,"score":0.31040000915527344},{"id":"https://openalex.org/C73520026","wikidata":"https://www.wikidata.org/wiki/Q7229091","display_name":"Pop music automation","level":4,"score":0.2831000089645386},{"id":"https://openalex.org/C2780660688","wikidata":"https://www.wikidata.org/wiki/Q25052564","display_name":"Multimodal learning","level":2,"score":0.2818000018596649},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.2800999879837036},{"id":"https://openalex.org/C67186912","wikidata":"https://www.wikidata.org/wiki/Q367664","display_name":"Data modeling","level":2,"score":0.27720001339912415},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.27219998836517334},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.2630000114440918}],"mesh":[],"locations_count":3,"locations":[{"id":"doi:10.1109/jstsp.2026.3653157","is_oa":true,"landing_page_url":"https://doi.org/10.1109/jstsp.2026.3653157","pdf_url":null,"source":{"id":"https://openalex.org/S42167783","display_name":"IEEE Journal of Selected Topics in Signal Processing","issn_l":"1932-4553","issn":["1932-4553","1941-0484"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Journal of Selected Topics in Signal Processing","raw_type":"journal-article"},{"id":"pmh:oai:arXiv.org:2601.09385","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2601.09385","pdf_url":"https://arxiv.org/pdf/2601.09385","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"public-domain","license_id":"https://openalex.org/licenses/public-domain","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"pmh:oai:dr.ntu.edu.sg:10356/212320","is_oa":false,"landing_page_url":"https://hdl.handle.net/10356/212320","pdf_url":null,"source":{"id":"https://openalex.org/S4306402609","display_name":"DR-NTU (Nanyang Technological University)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I172675005","host_organization_name":"Nanyang Technological University","host_organization_lineage":["https://openalex.org/I172675005"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Journal Article"}],"best_oa_location":{"id":"doi:10.1109/jstsp.2026.3653157","is_oa":true,"landing_page_url":"https://doi.org/10.1109/jstsp.2026.3653157","pdf_url":null,"source":{"id":"https://openalex.org/S42167783","display_name":"IEEE Journal of Selected Topics in Signal Processing","issn_l":"1932-4553","issn":["1932-4553","1941-0484"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Journal of Selected Topics in Signal Processing","raw_type":"journal-article"},"sustainable_development_goals":[{"score":0.6996793150901794,"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4"}],"awards":[{"id":"https://openalex.org/G3278014984","display_name":null,"funder_award_id":"U23B2018","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G8721642152","display_name":null,"funder_award_id":"2021SHZDZX0102","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"}],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"The":[0],"recent":[1],"surge":[2],"in":[3],"open-source":[4,77,185],"Multimodal":[5],"Large":[6],"Language":[7],"Models":[8],"(MLLM)":[9],"frameworks,":[10],"such":[11],"as":[12,32],"LLaVA,":[13],"provides":[14,95],"a":[15,63,96],"convenient":[16],"kickoff":[17],"for":[18,42,116,172],"artificial":[19],"intelligence":[20],"developers":[21],"and":[22,37,48,58,70,91,104,113,133,149,169,187,199],"researchers.":[23,173],"However,":[24],"most":[25],"of":[26,45,55,65,99,138],"the":[27,33,43,53,190,195],"MLLM":[28],"frameworks":[29],"take":[30],"vision":[31],"main":[34],"input":[35],"modality,":[36],"provide":[38],"limited":[39],"in-depth":[40],"support":[41],"modality":[44],"speech,":[46,88,197],"audio,":[47,90],"music.":[49],"This":[50],"situation":[51],"hinders":[52],"development":[54],"audio-language":[56],"models,":[57],"forces":[59],"researchers":[60],"to":[61,82,177,192,194],"spend":[62],"lot":[64],"effort":[66],"on":[67,87,189],"code":[68],"writing":[69],"hyperparameter":[71],"tuning.":[72],"We":[73,160,174],"present":[74],"SLAM-LLM,":[75],"an":[76],"deep":[78],"learning":[79],"framework":[80],"designed":[81],"train":[83],"customized":[84],"MLLMs,":[85],"focused":[86],"language,":[89],"music":[92,200],"processing.":[93,201],"SLAM-LLM":[94,108,162],"modular":[97],"configuration":[98],"different":[100],"encoders,":[101],"projectors,":[102],"LLMs,":[103],"parameter-efficient":[105],"fine-tuning":[106],"plugins.":[107],"also":[109,154],"includes":[110],"detailed":[111],"training":[112,171],"inference":[114],"recipes":[115,140],"mainstream":[117],"tasks,":[118],"along":[119],"with":[120],"high-performance":[121],"checkpoints":[122],"like":[123],"LLM-based":[124,196],"Automatic":[125],"Speech":[126],"Recognition":[127],"(ASR),":[128],"Automated":[129],"Audio":[130],"Captioning":[131,135],"(AAC),":[132],"Music":[134],"(MC).":[136],"Some":[137],"these":[139],"have":[141,153],"already":[142],"reached":[143],"or":[144],"are":[145,175],"nearing":[146],"state-of-the-art":[147],"performance,":[148],"some":[150],"relevant":[151],"techniques":[152],"been":[155],"accepted":[156],"by":[157],"academic":[158],"papers.":[159],"hope":[161],"will":[163],"accelerate":[164],"iteration,":[165],"development,":[166],"data":[167],"engineering,":[168],"model":[170],"committed":[176],"continually":[178],"pushing":[179],"forward":[180],"audio-based":[181],"MLLMs":[182],"through":[183],"this":[184],"framework,":[186],"call":[188],"community":[191],"contribute":[193],"audio":[198]},"counts_by_year":[],"updated_date":"2026-03-27T05:58:40.876381","created_date":"2026-01-14T00:00:00"}
