{"id":"https://openalex.org/W7161722918","doi":"https://doi.org/10.48550/arxiv.2605.16403","title":"When Vision Speaks for Sound","display_name":"When Vision Speaks for Sound","publication_year":2026,"publication_date":"2026-05-13","ids":{"openalex":"https://openalex.org/W7161722918","doi":"https://doi.org/10.48550/arxiv.2605.16403"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.16403","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.16403","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.16403","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5136401977","display_name":"Xiaofei Wen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wen, Xiaofei","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136479568","display_name":"Wenjie Jacky Mo","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Mo, Wenjie Jacky","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136457176","display_name":"Xingyu Fu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Fu, Xingyu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136485022","display_name":"Rui Cai","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cai, Rui","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136369771","display_name":"Tinghui Zhu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhu, Tinghui","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5041790231","display_name":"Wendi Li","orcid":"https://orcid.org/0000-0001-9764-6147"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Wendi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136484063","display_name":"Yanan Xie","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xie, Yanan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136479036","display_name":"Muhao Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Muhao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5136458360","display_name":"Peng Qi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Qi, Peng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":9,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.2567000091075897,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.2567000091075897,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.22380000352859497,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.14300000667572021,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/hallucinating","display_name":"Hallucinating","score":0.8675000071525574},{"id":"https://openalex.org/keywords/counterfactual-thinking","display_name":"Counterfactual thinking","score":0.7225000262260437},{"id":"https://openalex.org/keywords/exploit","display_name":"Exploit","score":0.5285000205039978},{"id":"https://openalex.org/keywords/sound","display_name":"Sound (geography)","score":0.47200000286102295},{"id":"https://openalex.org/keywords/audio-visual","display_name":"Audio visual","score":0.44429999589920044},{"id":"https://openalex.org/keywords/preference","display_name":"Preference","score":0.42669999599456787},{"id":"https://openalex.org/keywords/mode","display_name":"Mode (computer interface)","score":0.4194999933242798},{"id":"https://openalex.org/keywords/visualization","display_name":"Visualization","score":0.41370001435279846},{"id":"https://openalex.org/keywords/audio-signal-processing","display_name":"Audio signal processing","score":0.36820000410079956}],"concepts":[{"id":"https://openalex.org/C2911011789","wikidata":"https://www.wikidata.org/wiki/Q130741","display_name":"Hallucinating","level":2,"score":0.8675000071525574},{"id":"https://openalex.org/C108650721","wikidata":"https://www.wikidata.org/wiki/Q1783253","display_name":"Counterfactual thinking","level":2,"score":0.7225000262260437},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6366000175476074},{"id":"https://openalex.org/C165696696","wikidata":"https://www.wikidata.org/wiki/Q11287","display_name":"Exploit","level":2,"score":0.5285000205039978},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5065000057220459},{"id":"https://openalex.org/C203718221","wikidata":"https://www.wikidata.org/wiki/Q491713","display_name":"Sound (geography)","level":2,"score":0.47200000286102295},{"id":"https://openalex.org/C3017588708","wikidata":"https://www.wikidata.org/wiki/Q758901","display_name":"Audio visual","level":2,"score":0.44429999589920044},{"id":"https://openalex.org/C2781249084","wikidata":"https://www.wikidata.org/wiki/Q908656","display_name":"Preference","level":2,"score":0.42669999599456787},{"id":"https://openalex.org/C48677424","wikidata":"https://www.wikidata.org/wiki/Q6888088","display_name":"Mode (computer interface)","level":2,"score":0.4194999933242798},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.41370001435279846},{"id":"https://openalex.org/C127220857","wikidata":"https://www.wikidata.org/wiki/Q2719318","display_name":"Audio signal processing","level":4,"score":0.36820000410079956},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3605000078678131},{"id":"https://openalex.org/C167310288","wikidata":"https://www.wikidata.org/wiki/Q7564808","display_name":"Sound quality","level":2,"score":0.3537999987602234},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.32350000739097595},{"id":"https://openalex.org/C75553542","wikidata":"https://www.wikidata.org/wiki/Q178161","display_name":"A priori and a posteriori","level":2,"score":0.31290000677108765},{"id":"https://openalex.org/C140631703","wikidata":"https://www.wikidata.org/wiki/Q34678","display_name":"Stereophonic sound","level":3,"score":0.3098999857902527},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.3059000074863434},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.2969000041484833},{"id":"https://openalex.org/C146044194","wikidata":"https://www.wikidata.org/wiki/Q5157334","display_name":"Computational photography","level":4,"score":0.29179999232292175},{"id":"https://openalex.org/C128422554","wikidata":"https://www.wikidata.org/wiki/Q20077126","display_name":"Sound recording and reproduction","level":2,"score":0.2879999876022339},{"id":"https://openalex.org/C2780226545","wikidata":"https://www.wikidata.org/wiki/Q6888030","display_name":"Modality (human\u2013computer interaction)","level":2,"score":0.27730000019073486},{"id":"https://openalex.org/C56461940","wikidata":"https://www.wikidata.org/wiki/Q970687","display_name":"Eye tracking","level":2,"score":0.27559998631477356},{"id":"https://openalex.org/C49774154","wikidata":"https://www.wikidata.org/wiki/Q131765","display_name":"Multimedia","level":1,"score":0.27300000190734863},{"id":"https://openalex.org/C2776436953","wikidata":"https://www.wikidata.org/wiki/Q5163215","display_name":"Consistency (knowledge bases)","level":2,"score":0.26989999413490295},{"id":"https://openalex.org/C499572226","wikidata":"https://www.wikidata.org/wiki/Q1937950","display_name":"Sound design","level":3,"score":0.2671000063419342},{"id":"https://openalex.org/C111370547","wikidata":"https://www.wikidata.org/wiki/Q7451120","display_name":"Sensory cue","level":2,"score":0.2563000023365021},{"id":"https://openalex.org/C2780665704","wikidata":"https://www.wikidata.org/wiki/Q959298","display_name":"Intervention (counseling)","level":2,"score":0.25519999861717224}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.16403","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.16403","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.16403","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.16403","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"score":0.4728984236717224,"id":"https://metadata.un.org/sdg/16","display_name":"Peace, Justice and strong institutions"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Despite":[0],"rapid":[1],"progress":[2],"in":[3,13,66],"video-capable":[4],"MLLMs,":[5],"we":[6,93,124],"find":[7],"that":[8],"their":[9],"apparent":[10],"audio":[11,33,81,104,135],"understanding":[12],"videos":[14],"is":[15],"often":[16],"vision-driven:":[17],"models":[18,43,47,68],"rely":[19],"on":[20,101,167],"visual":[21,83],"cues":[22],"to":[23],"infer":[24],"or":[25],"hallucinate":[26],"acoustic":[27],"information,":[28],"rather":[29],"than":[30],"verifying":[31,78],"the":[32,80,143,155],"stream.":[34],"This":[35],"issue":[36],"appears":[37],"across":[38,154],"both":[39],"state-of-the-art":[40],"open-source":[41],"omni":[42],"and":[44,53,82,116,170],"leading":[45],"closed-source":[46],"from":[48],"providers":[49],"such":[50],"as":[51,60],"Google":[52],"OpenAI.":[54],"We":[55],"characterize":[56],"this":[57,91],"failure":[58],"mode":[59],"an":[61,96],"audio-visual":[62,120,171],"Clever":[63],"Hans":[64],"effect,":[65],"which":[67,107,112,118],"appear":[69],"(falsely)":[70],"audio-grounded,":[71],"but":[72],"actually":[73],"exploit":[74],"visual-acoustic":[75],"correlations":[76],"without":[77],"whether":[79],"streams":[84],"are":[85],"truly":[86],"aligned.":[87],"To":[88],"systematically":[89],"study":[90,126],"behavior,":[92],"introduce":[94],"Thud,":[95],"intervention-driven":[97],"probing":[98],"framework":[99],"based":[100],"three":[102,156],"counterfactual":[103],"edits:":[105],"Shift,":[106],"tests":[108,113,119],"temporal":[109],"synchronization;":[110],"Mute,":[111],"sound":[114],"existence;":[115],"Swap,":[117],"consistency.":[121],"Beyond":[122],"diagnosis,":[123],"further":[125],"a":[127],"two-stage":[128],"alignment":[129],"recipe:":[130],"intervention-derived":[131],"preference":[132],"pairs":[133],"teach":[134],"verification,":[136],"while":[137,163],"event-level":[138],"general":[139,168],"video":[140,169],"preferences":[141],"regularize":[142],"model":[144],"against":[145],"over-specialization.":[146],"Our":[147],"best":[148],"10K-sample":[149],"recipe":[150],"improves":[151],"average":[152],"performance":[153,166],"intervention":[157],"dimensions":[158],"by":[159],"28":[160],"percentage":[161],"points,":[162],"slightly":[164],"improving":[165],"QA":[172],"benchmarks.":[173]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-05-20T00:00:00"}
