{"id":"https://openalex.org/W7138419087","doi":"https://doi.org/10.1609/aaai.v40i14.38183","title":"When Eyes and Ears Disagree: Can MLLMs Discern Audio-Visual Confusion?","display_name":"When Eyes and Ears Disagree: Can MLLMs Discern Audio-Visual Confusion?","publication_year":2026,"publication_date":"2026-03-14","ids":{"openalex":"https://openalex.org/W7138419087","doi":"https://doi.org/10.1609/aaai.v40i14.38183"},"language":null,"primary_location":{"id":"doi:10.1609/aaai.v40i14.38183","is_oa":true,"landing_page_url":"https://doi.org/10.1609/aaai.v40i14.38183","pdf_url":null,"source":{"id":"https://openalex.org/S4210191458","display_name":"Proceedings of the AAAI Conference on Artificial Intelligence","issn_l":"2159-5399","issn":["2159-5399","2374-3468"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/P4310320058","host_organization_name":"Association for the Advancement of Artificial Intelligence","host_organization_lineage":["https://openalex.org/P4310320058"],"host_organization_lineage_names":["Association for the Advancement of Artificial Intelligence"],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the AAAI Conference on Artificial Intelligence","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"diamond","oa_url":"https://doi.org/10.1609/aaai.v40i14.38183","any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5125707546","display_name":"Qilang Ye","orcid":null},"institutions":[{"id":"https://openalex.org/I4210132990","display_name":"State Key Laboratory of Cryptology","ror":"https://ror.org/02pn5rj08","country_code":"CN","type":"government","lineage":["https://openalex.org/I4210132990"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Qilang Ye","raw_affiliation_strings":["VCIP & TMCC & DISSec, College of Computer Science & College of Cryptology and Cyber Science, Nankai University\nZhongguancun Academy"],"affiliations":[{"raw_affiliation_string":"VCIP & TMCC & DISSec, College of Computer Science & College of Cryptology and Cyber Science, Nankai University\nZhongguancun Academy","institution_ids":["https://openalex.org/I4210132990"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129685693","display_name":"Wei Zeng","orcid":null},"institutions":[{"id":"https://openalex.org/I166382143","display_name":"Moscow State University of Printing Arts","ror":"https://ror.org/02e16wz39","country_code":"RU","type":"education","lineage":["https://openalex.org/I166382143"]}],"countries":["RU"],"is_corresponding":false,"raw_author_name":"Wei Zeng","raw_affiliation_strings":["Zhongguancun Academy"],"affiliations":[{"raw_affiliation_string":"Zhongguancun Academy","institution_ids":["https://openalex.org/I166382143"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129683271","display_name":"Meng Liu","orcid":null},"institutions":[{"id":"https://openalex.org/I44445938","display_name":"Shandong Jianzhu University","ror":"https://ror.org/01gbfax37","country_code":"CN","type":"education","lineage":["https://openalex.org/I44445938"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Meng Liu","raw_affiliation_strings":["Zhongguancun Academy\nSchool of Computer Science and Technology, Shandong Jianzhu University"],"affiliations":[{"raw_affiliation_string":"Zhongguancun Academy\nSchool of Computer Science and Technology, Shandong Jianzhu University","institution_ids":["https://openalex.org/I44445938"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129733679","display_name":"Jie M. Zhang","orcid":null},"institutions":[{"id":"https://openalex.org/I2799285752","display_name":"Bay Institute","ror":"https://ror.org/02bgarq81","country_code":"US","type":"nonprofit","lineage":["https://openalex.org/I2799285752"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Jie Zhang","raw_affiliation_strings":["School of Information Science and Technology, Great Bay University"],"affiliations":[{"raw_affiliation_string":"School of Information Science and Technology, Great Bay University","institution_ids":["https://openalex.org/I2799285752"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129727013","display_name":"Yupeng Hu","orcid":null},"institutions":[{"id":"https://openalex.org/I154099455","display_name":"Shandong University","ror":"https://ror.org/0207yh398","country_code":"CN","type":"education","lineage":["https://openalex.org/I154099455"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yupeng Hu","raw_affiliation_strings":["School of Software Engineering, Shandong University"],"affiliations":[{"raw_affiliation_string":"School of Software Engineering, Shandong University","institution_ids":["https://openalex.org/I154099455"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129686365","display_name":"Zitong Yu","orcid":null},"institutions":[{"id":"https://openalex.org/I2799850029","display_name":"Dongguan University of Technology","ror":"https://ror.org/01m8p7q42","country_code":"CN","type":"education","lineage":["https://openalex.org/I2799850029"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zitong Yu","raw_affiliation_strings":["School of Information Science and Technology, Great Bay University\nDongguan Key Laboratory for Intelligence and Information Technology"],"affiliations":[{"raw_affiliation_string":"School of Information Science and Technology, Great Bay University\nDongguan Key Laboratory for Intelligence and Information Technology","institution_ids":["https://openalex.org/I2799850029"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5129727305","display_name":"Yu Zhou","orcid":null},"institutions":[{"id":"https://openalex.org/I4210132990","display_name":"State Key Laboratory of Cryptology","ror":"https://ror.org/02pn5rj08","country_code":"CN","type":"government","lineage":["https://openalex.org/I4210132990"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yu Zhou","raw_affiliation_strings":["VCIP & TMCC & DISSec, College of Computer Science & College of Cryptology and Cyber Science, Nankai University\nZhongguancun Academy"],"affiliations":[{"raw_affiliation_string":"VCIP & TMCC & DISSec, College of Computer Science & College of Cryptology and Cyber Science, Nankai University\nZhongguancun Academy","institution_ids":["https://openalex.org/I4210132990"]}]}],"institutions":[],"countries_distinct_count":3,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5125707546"],"corresponding_institution_ids":["https://openalex.org/I4210132990"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.79626866,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"40","issue":"14","first_page":"11955","last_page":"11963"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.8648999929428101,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.8648999929428101,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.0544000007212162,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.013399999588727951,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/object","display_name":"Object (grammar)","score":0.6621000170707703},{"id":"https://openalex.org/keywords/ask-price","display_name":"Ask price","score":0.5170999765396118},{"id":"https://openalex.org/keywords/function","display_name":"Function (biology)","score":0.46540001034736633},{"id":"https://openalex.org/keywords/baseline","display_name":"Baseline (sea)","score":0.4562999904155731},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.3458000123500824},{"id":"https://openalex.org/keywords/reinforcement-learning","display_name":"Reinforcement learning","score":0.287200003862381}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.751800000667572},{"id":"https://openalex.org/C2781238097","wikidata":"https://www.wikidata.org/wiki/Q175026","display_name":"Object (grammar)","level":2,"score":0.6621000170707703},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6061999797821045},{"id":"https://openalex.org/C90329073","wikidata":"https://www.wikidata.org/wiki/Q914232","display_name":"Ask price","level":2,"score":0.5170999765396118},{"id":"https://openalex.org/C14036430","wikidata":"https://www.wikidata.org/wiki/Q3736076","display_name":"Function (biology)","level":2,"score":0.46540001034736633},{"id":"https://openalex.org/C12725497","wikidata":"https://www.wikidata.org/wiki/Q810247","display_name":"Baseline (sea)","level":2,"score":0.4562999904155731},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.3815000057220459},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3741999864578247},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.35899999737739563},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.3458000123500824},{"id":"https://openalex.org/C97541855","wikidata":"https://www.wikidata.org/wiki/Q830687","display_name":"Reinforcement learning","level":2,"score":0.287200003862381},{"id":"https://openalex.org/C2983448237","wikidata":"https://www.wikidata.org/wiki/Q1078276","display_name":"Language understanding","level":2,"score":0.28679999709129333},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.28610000014305115},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.28110000491142273},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.2709999978542328},{"id":"https://openalex.org/C2776502983","wikidata":"https://www.wikidata.org/wiki/Q690182","display_name":"Contrast (vision)","level":2,"score":0.265500009059906},{"id":"https://openalex.org/C104114177","wikidata":"https://www.wikidata.org/wiki/Q79782","display_name":"Motion (physics)","level":2,"score":0.2639000117778778},{"id":"https://openalex.org/C20894473","wikidata":"https://www.wikidata.org/wiki/Q1116105","display_name":"Object model","level":3,"score":0.251800000667572}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1609/aaai.v40i14.38183","is_oa":true,"landing_page_url":"https://doi.org/10.1609/aaai.v40i14.38183","pdf_url":null,"source":{"id":"https://openalex.org/S4210191458","display_name":"Proceedings of the AAAI Conference on Artificial Intelligence","issn_l":"2159-5399","issn":["2159-5399","2374-3468"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/P4310320058","host_organization_name":"Association for the Advancement of Artificial Intelligence","host_organization_lineage":["https://openalex.org/P4310320058"],"host_organization_lineage_names":["Association for the Advancement of Artificial Intelligence"],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the AAAI Conference on Artificial Intelligence","raw_type":"journal-article"}],"best_oa_location":{"id":"doi:10.1609/aaai.v40i14.38183","is_oa":true,"landing_page_url":"https://doi.org/10.1609/aaai.v40i14.38183","pdf_url":null,"source":{"id":"https://openalex.org/S4210191458","display_name":"Proceedings of the AAAI Conference on Artificial Intelligence","issn_l":"2159-5399","issn":["2159-5399","2374-3468"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/P4310320058","host_organization_name":"Association for the Advancement of Artificial Intelligence","host_organization_lineage":["https://openalex.org/P4310320058"],"host_organization_lineage_names":["Association for the Advancement of Artificial Intelligence"],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the AAAI Conference on Artificial Intelligence","raw_type":"journal-article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/10","score":0.7485024929046631,"display_name":"Reduced inequalities"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Can":[0],"Multimodal":[1],"Large":[2,110],"Language":[3,112],"Models":[4],"(MLLMs)":[5],"discern":[6],"confused":[7],"objects":[8],"that":[9,57,87,131,173],"are":[10],"visually":[11,72,101],"present":[12],"but":[13],"audio-absent?":[14],"To":[15,99,143],"study":[16],"this,":[17],"we":[18,79,104,124,149],"introduce":[19,80,105,150],"a":[20,82,109,126],"new":[21],"benchmark,":[22],"AV-ConfuseBench,":[23],"which":[24],"simulates":[25],"an":[26,36,106,145],"\u201cAudio-Visual":[27],"Confusion\u201d":[28],"scene":[29],"by":[30,76,177],"modifying":[31],"the":[32,39,43,91,116,139,156,180],"corresponding":[33],"sound":[34],"of":[35,158],"object":[37,45],"in":[38],"video,":[40],"e.g.,":[41],"mute":[42],"sounding":[44],"and":[46,62,169],"ask":[47],"MLLMs":[48,133],"\u201cIs":[49],"there":[50],"a/an":[51],"{muted-object}":[52],"sound\u201d.":[53],"Experimental":[54],"results":[55],"reveal":[56],"MLLMs,":[58],"such":[59],"as":[60,115],"Qwen2.5-Omni":[61,92],"Gemini":[63],"2.5,":[64],"struggle":[65],"to":[66,71,119,134,154],"discriminate":[67],"non-existent":[68],"audio":[69],"due":[70],"dominated":[73,102],"reasoning.":[74,122],"Motivated":[75],"this":[77],"observation,":[78],"RL-CoMM,":[81],"Reinforcement":[83],"Learning-based":[84],"Collaborative":[85],"Multi-MLLM":[86],"is":[88],"built":[89],"upon":[90],"foundation.":[93],"RL-CoMM":[94,174],"includes":[95],"two":[96],"stages:":[97],"1)":[98],"alleviate":[100],"ambiguities,":[103],"external":[107],"model,":[108],"Audio":[111],"Model":[113],"(LALM),":[114],"reference":[117],"model":[118,182],"generate":[120],"audio-only":[121,140],"Then,":[123],"design":[125],"Step-wise":[127],"Reasoning":[128],"Reward":[129],"function":[130],"enables":[132],"self-improve":[135],"audio-visual":[136,166,170],"reasoning":[137,161],"with":[138,183],"reference.":[141],"2)":[142],"ensure":[144],"accurate":[146],"answer":[147],"prediction,":[148],"Answer-centered":[151],"Confidence":[152],"Optimization":[153],"reduce":[155],"uncertainty":[157],"potential":[159],"heterogeneous":[160],"differences.":[162],"Extensive":[163],"experiments":[164],"on":[165],"question":[167],"answering":[168],"hallucination":[171],"show":[172],"improves":[175],"accuracy":[176],"10~30%":[178],"over":[179],"baseline":[181],"limited":[184],"training":[185],"data.":[186]},"counts_by_year":[],"updated_date":"2026-03-20T20:47:17.329874","created_date":"2026-03-18T00:00:00"}
