{"id":"https://openalex.org/W7148442809","doi":"https://doi.org/10.1109/asru65441.2025.11434780","title":"Omni-R1: Do You Really Need Audio to Fine-Tune Your Audio LLM?","display_name":"Omni-R1: Do You Really Need Audio to Fine-Tune Your Audio LLM?","publication_year":2025,"publication_date":"2025-12-06","ids":{"openalex":"https://openalex.org/W7148442809","doi":"https://doi.org/10.1109/asru65441.2025.11434780"},"language":null,"primary_location":{"id":"doi:10.1109/asru65441.2025.11434780","is_oa":false,"landing_page_url":"https://doi.org/10.1109/asru65441.2025.11434780","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5010091252","display_name":"Andrew Rouditchenko","orcid":"https://orcid.org/0000-0002-0063-3612"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Andrew Rouditchenko","raw_affiliation_strings":["MIT CSAIL"],"affiliations":[{"raw_affiliation_string":"MIT CSAIL","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5045247662","display_name":"Saurabhchand Bhati","orcid":"https://orcid.org/0000-0001-6477-3895"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Saurabhchand Bhati","raw_affiliation_strings":["MIT CSAIL"],"affiliations":[{"raw_affiliation_string":"MIT CSAIL","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5061048633","display_name":"Edson Araujo","orcid":"https://orcid.org/0000-0003-0585-5473"},"institutions":[{"id":"https://openalex.org/I114090438","display_name":"Goethe University Frankfurt","ror":"https://ror.org/04cvxnb49","country_code":"DE","type":"education","lineage":["https://openalex.org/I114090438"]}],"countries":["DE"],"is_corresponding":false,"raw_author_name":"Edson Araujo","raw_affiliation_strings":["Goethe University of Frankfurt"],"affiliations":[{"raw_affiliation_string":"Goethe University of Frankfurt","institution_ids":["https://openalex.org/I114090438"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132789909","display_name":"Samuel Thomas","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Samuel Thomas","raw_affiliation_strings":["IBM Research AI"],"affiliations":[{"raw_affiliation_string":"IBM Research AI","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132794447","display_name":"Hilde Kuehne","orcid":null},"institutions":[{"id":"https://openalex.org/I114090438","display_name":"Goethe University Frankfurt","ror":"https://ror.org/04cvxnb49","country_code":"DE","type":"education","lineage":["https://openalex.org/I114090438"]}],"countries":["DE"],"is_corresponding":false,"raw_author_name":"Hilde Kuehne","raw_affiliation_strings":["Goethe University of Frankfurt"],"affiliations":[{"raw_affiliation_string":"Goethe University of Frankfurt","institution_ids":["https://openalex.org/I114090438"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132792222","display_name":"Rogerio Feris","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Rogerio Feris","raw_affiliation_strings":["IBM Research AI"],"affiliations":[{"raw_affiliation_string":"IBM Research AI","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5132824681","display_name":"James R. Glass","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"James Glass","raw_affiliation_strings":["MIT CSAIL"],"affiliations":[{"raw_affiliation_string":"MIT CSAIL","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5010091252"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":4.3637,"has_fulltext":false,"cited_by_count":2,"citation_normalized_percentile":{"value":0.95468484,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":98,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"7"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.5199000239372253,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.5199000239372253,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.26249998807907104,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.022700000554323196,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/reinforcement-learning","display_name":"Reinforcement learning","score":0.4083000123500824},{"id":"https://openalex.org/keywords/audio-signal","display_name":"Audio signal","score":0.3880000114440918},{"id":"https://openalex.org/keywords/audio-signal-processing","display_name":"Audio signal processing","score":0.38749998807907104},{"id":"https://openalex.org/keywords/audio-equipment","display_name":"Audio equipment","score":0.3418999910354614},{"id":"https://openalex.org/keywords/sound-recording-and-reproduction","display_name":"Sound recording and reproduction","score":0.3221000134944916},{"id":"https://openalex.org/keywords/field","display_name":"Field (mathematics)","score":0.3165999948978424}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7275000214576721},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5062000155448914},{"id":"https://openalex.org/C97541855","wikidata":"https://www.wikidata.org/wiki/Q830687","display_name":"Reinforcement learning","level":2,"score":0.4083000123500824},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.39570000767707825},{"id":"https://openalex.org/C64922751","wikidata":"https://www.wikidata.org/wiki/Q4650799","display_name":"Audio signal","level":3,"score":0.3880000114440918},{"id":"https://openalex.org/C127220857","wikidata":"https://www.wikidata.org/wiki/Q2719318","display_name":"Audio signal processing","level":4,"score":0.38749998807907104},{"id":"https://openalex.org/C2778488704","wikidata":"https://www.wikidata.org/wiki/Q15190726","display_name":"Audio equipment","level":2,"score":0.3418999910354614},{"id":"https://openalex.org/C128422554","wikidata":"https://www.wikidata.org/wiki/Q20077126","display_name":"Sound recording and reproduction","level":2,"score":0.3221000134944916},{"id":"https://openalex.org/C49774154","wikidata":"https://www.wikidata.org/wiki/Q131765","display_name":"Multimedia","level":1,"score":0.3181999921798706},{"id":"https://openalex.org/C9652623","wikidata":"https://www.wikidata.org/wiki/Q190109","display_name":"Field (mathematics)","level":2,"score":0.3165999948978424},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3100000023841858},{"id":"https://openalex.org/C87687168","wikidata":"https://www.wikidata.org/wiki/Q173114","display_name":"Digital audio","level":4,"score":0.2953000068664551},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.2930000126361847},{"id":"https://openalex.org/C167310288","wikidata":"https://www.wikidata.org/wiki/Q7564808","display_name":"Sound quality","level":2,"score":0.28859999775886536},{"id":"https://openalex.org/C160372630","wikidata":"https://www.wikidata.org/wiki/Q4819855","display_name":"Audio analyzer","level":5,"score":0.287200003862381},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.2847000062465668},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.27790001034736633}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/asru65441.2025.11434780","is_oa":false,"landing_page_url":"https://doi.org/10.1109/asru65441.2025.11434780","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":8,"referenced_works":["https://openalex.org/W3015371781","https://openalex.org/W4304098310","https://openalex.org/W4391021627","https://openalex.org/W4392909390","https://openalex.org/W4401044042","https://openalex.org/W4404784428","https://openalex.org/W4411688496","https://openalex.org/W7133212838"],"related_works":[],"abstract_inverted_index":{"We":[0,88],"propose":[1],"Omni-R1":[2,37],"which":[3],"fine-tunes":[4],"a":[5,91,99],"recent":[6,30],"multi-modal":[7],"LLM,":[8],"Qwen2.5-Omni,":[9],"on":[10,28,42,52,98],"an":[11],"audio":[12,70,97],"question":[13],"answering":[14],"dataset":[15,101],"with":[16,67],"the":[17,29,39,43,53,60,76,106],"reinforcement":[18],"learning":[19],"method":[20],"GRPO.":[21],"This":[22],"leads":[23],"to":[24,84],"new":[25],"State-of-the-Art":[26],"performance":[27,61,77],"MMAU":[31],"and":[32,47,55,68,71],"MMAR":[33],"benchmarks.":[34],"On":[35],"MMAU,":[36],"achieves":[38],"highest":[40],"accuracies":[41],"sounds,":[44],"music,":[45],"speech,":[46],"overall":[48],"average":[49],"categories,":[50],"both":[51,66],"Test-mini":[54],"Test-full":[56],"splits.":[57],"To":[58],"understand":[59],"improvement,":[62],"we":[63],"tested":[64],"models":[65],"without":[69,96],"found":[72],"that":[73,94],"much":[74],"of":[75],"improvement":[78],"from":[79],"GRPO":[80],"could":[81],"be":[82],"attributed":[83],"better":[85],"text-based":[86],"reasoning.":[87],"also":[89],"made":[90],"surprising":[92],"discovery":[93],"fine-tuning":[95],"text-only":[100],"was":[102],"effective":[103],"at":[104],"improving":[105],"audio-based":[107],"performance.":[108]},"counts_by_year":[{"year":2026,"cited_by_count":2}],"updated_date":"2026-04-23T09:07:50.710637","created_date":"2026-04-03T00:00:00"}
