{"id":"https://openalex.org/W4412888643","doi":"https://doi.org/10.18653/v1/2025.findings-acl.389","title":"Investigating and Enhancing Vision-Audio Capability in Omnimodal Large Language Models","display_name":"Investigating and Enhancing Vision-Audio Capability in Omnimodal Large Language Models","publication_year":2025,"publication_date":"2025-01-01","ids":{"openalex":"https://openalex.org/W4412888643","doi":"https://doi.org/10.18653/v1/2025.findings-acl.389"},"language":"en","primary_location":{"id":"doi:10.18653/v1/2025.findings-acl.389","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2025.findings-acl.389","pdf_url":"https://aclanthology.org/2025.findings-acl.389.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Findings of the Association for Computational Linguistics: ACL 2025","raw_type":"proceedings-article"},"type":"article","indexed_in":["arxiv","crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://aclanthology.org/2025.findings-acl.389.pdf","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100685540","display_name":"Rui Hu","orcid":"https://orcid.org/0009-0003-5480-9251"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Rui Hu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5034561427","display_name":"De\u2010Lai Qiu","orcid":"https://orcid.org/0000-0001-8825-4707"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Delai Qiu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5113111581","display_name":"Shuyu Wei","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shuyu Wei","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102005206","display_name":"Jiaming Zhang","orcid":"https://orcid.org/0009-0007-7239-5442"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jiaming Zhang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100424577","display_name":"Yining Wang","orcid":"https://orcid.org/0000-0001-9410-0392"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yining Wang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101461641","display_name":"Shengping Liu","orcid":"https://orcid.org/0000-0002-7322-0042"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shengping Liu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5023834030","display_name":"Jitao Sang","orcid":"https://orcid.org/0000-0002-0699-3205"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jitao Sang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":7,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":{"value":0.2332684,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"7452","last_page":"7463"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T13310","display_name":"Subtitles and Audiovisual Media","score":0.9412999749183655,"subfield":{"id":"https://openalex.org/subfields/1203","display_name":"Language and Linguistics"},"field":{"id":"https://openalex.org/fields/12","display_name":"Arts and Humanities"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T13310","display_name":"Subtitles and Audiovisual Media","score":0.9412999749183655,"subfield":{"id":"https://openalex.org/subfields/1203","display_name":"Language and Linguistics"},"field":{"id":"https://openalex.org/fields/12","display_name":"Arts and Humanities"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T12031","display_name":"Speech and dialogue systems","score":0.9405999779701233,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9218999743461609,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7401591539382935},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.379727840423584},{"id":"https://openalex.org/keywords/multimedia","display_name":"Multimedia","score":0.36054447293281555},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.35160258412361145},{"id":"https://openalex.org/keywords/human\u2013computer-interaction","display_name":"Human\u2013computer interaction","score":0.34827446937561035},{"id":"https://openalex.org/keywords/computer-vision","display_name":"Computer vision","score":0.34808290004730225},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.3236203193664551}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7401591539382935},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.379727840423584},{"id":"https://openalex.org/C49774154","wikidata":"https://www.wikidata.org/wiki/Q131765","display_name":"Multimedia","level":1,"score":0.36054447293281555},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.35160258412361145},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.34827446937561035},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.34808290004730225},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3236203193664551}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.18653/v1/2025.findings-acl.389","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2025.findings-acl.389","pdf_url":"https://aclanthology.org/2025.findings-acl.389.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Findings of the Association for Computational Linguistics: ACL 2025","raw_type":"proceedings-article"},{"id":"pmh:oai:arXiv.org:2503.00059","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2503.00059","pdf_url":"https://arxiv.org/pdf/2503.00059","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"}],"best_oa_location":{"id":"doi:10.18653/v1/2025.findings-acl.389","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2025.findings-acl.389","pdf_url":"https://aclanthology.org/2025.findings-acl.389.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Findings of the Association for Computational Linguistics: ACL 2025","raw_type":"proceedings-article"},"sustainable_development_goals":[{"score":0.5400000214576721,"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4"}],"awards":[],"funders":[],"has_content":{"pdf":true,"grobid_xml":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4412888643.pdf","grobid_xml":"https://content.openalex.org/works/W4412888643.grobid-xml"},"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W2755342338","https://openalex.org/W2779427294","https://openalex.org/W2775347418","https://openalex.org/W2625805835","https://openalex.org/W2079911747","https://openalex.org/W3116076068","https://openalex.org/W3003936178","https://openalex.org/W2145652935","https://openalex.org/W2563206327","https://openalex.org/W2069885731"],"abstract_inverted_index":{"Omnimodal":[0],"Large":[1],"Language":[2],"Models":[3],"(OLLMs)":[4],"have":[5],"shown":[6],"significant":[7],"progress":[8],"in":[9,94,135],"integrating":[10,18],"vision":[11,19,42],"and":[12,20,43,81,131,133],"text,":[13],"but":[14],"still":[15],"struggle":[16],"with":[17],"audio,":[21],"often":[22],"exhibiting":[23],"suboptimal":[24],"performance":[25,137],"when":[26,55],"processing":[27],"audio":[28,44,57,93,130],"queries":[29],"compared":[30],"to":[31,38,49,52,91,98],"text":[32,100],"queries.This":[33],"disparity":[34],"is":[35,107],"primarily":[36],"due":[37],"insufficient":[39],"alignment":[40],"between":[41,129],"modalities":[45],"during":[46],"training,":[47],"leading":[48],"inadequate":[50],"attention":[51],"visual":[53],"information":[54],"using":[56],"queries.To":[58],"mitigate":[59],"this":[60],"issue,":[61],"we":[62],"propose":[63],"a":[64,95],"Self-Knowledge":[65],"Distillation":[66],"(Self-KD)":[67],"training":[68],"method":[69,110],"where":[70],"the":[71,75,79,82,86,89,113,121,127],"vision-text":[72,122],"component":[73,84],"of":[74,116],"OLLM":[76],"serves":[77],"as":[78,85],"teacher":[80],"vision-audio":[83,114],"student.This":[87],"enables":[88],"model":[90],"process":[92],"manner":[96],"analogous":[97],"its":[99],"processing.Our":[101],"experimental":[102],"results":[103,134],"demonstrate":[104],"that":[105],"Self-KD":[106],"an":[108],"effective":[109],"for":[111],"enhancing":[112],"capabilities":[115],"OLLMs":[117],"by":[118],"learning":[119],"from":[120],"components,":[123],"which":[124],"subsequently":[125],"improves":[126],"interaction":[128],"images":[132],"improved":[136],"on":[138],"multimodal":[139],"tasks":[140],"1":[141],".":[142]},"counts_by_year":[],"updated_date":"2026-06-13T07:54:00.901334","created_date":"2025-10-10T00:00:00"}
