{"id":"https://openalex.org/W7134977886","doi":"https://doi.org/10.48550/arxiv.2603.09714","title":"MUGEN: Evaluating and Improving Multi-audio Understanding of Large Audio-Language Models","display_name":"MUGEN: Evaluating and Improving Multi-audio Understanding of Large Audio-Language Models","publication_year":2026,"publication_date":"2026-03-10","ids":{"openalex":"https://openalex.org/W7134977886","doi":"https://doi.org/10.48550/arxiv.2603.09714"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.09714","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.09714","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.09714","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5128779931","display_name":"Chih-Kai Yang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Chih-Kai","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101943312","display_name":"Yi-Ting Tsai","orcid":"https://orcid.org/0000-0001-6098-8944"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tsai, Yun-Shao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5045097607","display_name":"Yukai Guo","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Guo, Yu-Kai","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128707379","display_name":"Ping-Le Tsai","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tsai, Ping-Le","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128783803","display_name":"Yen-Ting Piao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Piao, Yen-Ting","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101539571","display_name":"Hung-Wei Chen","orcid":"https://orcid.org/0000-0002-8185-3544"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Hung-Wei","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128708040","display_name":"Ting-Lin Hsiao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hsiao, Ting-Lin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5104254656","display_name":"Yun-Man Hsu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hsu, Yun-Man","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128699884","display_name":"Ke-Han Lu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lu, Ke-Han","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5128705370","display_name":"Hung-yi Lee","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lee, Hung-yi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":10,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.730400025844574,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.730400025844574,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.0803999975323677,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.07440000027418137,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.5842999815940857},{"id":"https://openalex.org/keywords/permutation","display_name":"Permutation (music)","score":0.46129998564720154},{"id":"https://openalex.org/keywords/strengths-and-weaknesses","display_name":"Strengths and weaknesses","score":0.4388999938964844},{"id":"https://openalex.org/keywords/scaling","display_name":"Scaling","score":0.39899998903274536},{"id":"https://openalex.org/keywords/order","display_name":"Order (exchange)","score":0.3573000133037567},{"id":"https://openalex.org/keywords/identification","display_name":"Identification (biology)","score":0.32359999418258667},{"id":"https://openalex.org/keywords/noise","display_name":"Noise (video)","score":0.2791999876499176}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.75},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.5842999815940857},{"id":"https://openalex.org/C21308566","wikidata":"https://www.wikidata.org/wiki/Q7169365","display_name":"Permutation (music)","level":2,"score":0.46129998564720154},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.4528999924659729},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4456999897956848},{"id":"https://openalex.org/C63882131","wikidata":"https://www.wikidata.org/wiki/Q17122954","display_name":"Strengths and weaknesses","level":2,"score":0.4388999938964844},{"id":"https://openalex.org/C99844830","wikidata":"https://www.wikidata.org/wiki/Q102441924","display_name":"Scaling","level":2,"score":0.39899998903274536},{"id":"https://openalex.org/C182306322","wikidata":"https://www.wikidata.org/wiki/Q1779371","display_name":"Order (exchange)","level":2,"score":0.3573000133037567},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.3294999897480011},{"id":"https://openalex.org/C116834253","wikidata":"https://www.wikidata.org/wiki/Q2039217","display_name":"Identification (biology)","level":2,"score":0.32359999418258667},{"id":"https://openalex.org/C99498987","wikidata":"https://www.wikidata.org/wiki/Q2210247","display_name":"Noise (video)","level":3,"score":0.2791999876499176},{"id":"https://openalex.org/C2778755073","wikidata":"https://www.wikidata.org/wiki/Q10858537","display_name":"Scale (ratio)","level":2,"score":0.27480000257492065},{"id":"https://openalex.org/C184898388","wikidata":"https://www.wikidata.org/wiki/Q1435712","display_name":"Pairwise comparison","level":2,"score":0.2720000147819519},{"id":"https://openalex.org/C18762648","wikidata":"https://www.wikidata.org/wiki/Q42213","display_name":"Work (physics)","level":2,"score":0.2542000114917755},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.2524999976158142},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.25220000743865967},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.25130000710487366}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.09714","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.09714","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.09714","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.09714","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"While":[0],"multi-audio":[1,34],"understanding":[2],"is":[3],"critical":[4],"for":[5,108],"large":[6],"audio-language":[7],"models":[8,73],"(LALMs),":[9],"it":[10],"remains":[11],"underexplored.":[12],"We":[13,55],"introduce":[14],"MUGEN,":[15],"a":[16,52,106],"comprehensive":[17],"benchmark":[18],"evaluating":[19,109],"this":[20,86],"capability":[21],"across":[22],"speech,":[23],"general":[24],"audio,":[25],"and":[26,36,60,104],"music.":[27],"Our":[28],"experiments":[29],"reveal":[30],"consistent":[31],"weaknesses":[32],"in":[33,101],"settings,":[35],"performance":[37,93],"degrades":[38],"sharply":[39],"as":[40,51],"the":[41,67],"number":[42],"of":[43,69],"concurrent":[44],"audio":[45,70],"inputs":[46],"increases,":[47],"identifying":[48],"input":[49],"scaling":[50],"fundamental":[53],"bottleneck.":[54],"further":[56,91],"investigate":[57],"training-free":[58],"strategies":[59],"observe":[61],"that":[62],"Audio-Permutational":[63],"Self-Consistency,":[64],"which":[65],"diversifies":[66],"order":[68],"candidates,":[71],"helps":[72],"form":[74],"more":[75],"robust":[76],"aggregated":[77],"predictions,":[78],"yielding":[79],"up":[80],"to":[81,94],"6.28%":[82],"accuracy":[83],"gains.":[84],"Combining":[85],"permutation":[87],"strategy":[88],"with":[89],"Chain-of-Thought":[90],"improves":[92],"6.74%.":[95],"These":[96],"results":[97],"expose":[98],"blind":[99],"spots":[100],"current":[102],"LALMs":[103],"provide":[105],"foundation":[107],"complex":[110],"auditory":[111],"comprehension.":[112]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-03-12T00:00:00"}
