{"id":"https://openalex.org/W4416251781","doi":"https://doi.org/10.1109/waspaa66052.2025.11230922","title":"Bridging Ears and Eyes: Analyzing Audio and Visual Large Language Models to Humans in Visible Sound Recognition and Reducing Their Sensory Gap via Cross-Modal Distillation","display_name":"Bridging Ears and Eyes: Analyzing Audio and Visual Large Language Models to Humans in Visible Sound Recognition and Reducing Their Sensory Gap via Cross-Modal Distillation","publication_year":2025,"publication_date":"2025-10-12","ids":{"openalex":"https://openalex.org/W4416251781","doi":"https://doi.org/10.1109/waspaa66052.2025.11230922"},"language":null,"primary_location":{"id":"doi:10.1109/waspaa66052.2025.11230922","is_oa":false,"landing_page_url":"https://doi.org/10.1109/waspaa66052.2025.11230922","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5109460203","display_name":"Xilin Jiang","orcid":"https://orcid.org/0009-0000-9373-0851"},"institutions":[{"id":"https://openalex.org/I78577930","display_name":"Columbia University","ror":"https://ror.org/00hj8s172","country_code":"US","type":"education","lineage":["https://openalex.org/I78577930"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Xilin Jiang","raw_affiliation_strings":["Columbia University,NY,USA"],"affiliations":[{"raw_affiliation_string":"Columbia University,NY,USA","institution_ids":["https://openalex.org/I78577930"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101600013","display_name":"Junkai Wu","orcid":"https://orcid.org/0000-0003-1541-0267"},"institutions":[{"id":"https://openalex.org/I201448701","display_name":"University of Washington","ror":"https://ror.org/00cvxb145","country_code":"US","type":"education","lineage":["https://openalex.org/I201448701"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Junkai Wu","raw_affiliation_strings":["University of Washington,WA,USA"],"affiliations":[{"raw_affiliation_string":"University of Washington,WA,USA","institution_ids":["https://openalex.org/I201448701"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5052745933","display_name":"Vishal Choudhari","orcid":"https://orcid.org/0009-0000-5486-5913"},"institutions":[{"id":"https://openalex.org/I78577930","display_name":"Columbia University","ror":"https://ror.org/00hj8s172","country_code":"US","type":"education","lineage":["https://openalex.org/I78577930"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Vishal Choudhari","raw_affiliation_strings":["Columbia University,NY,USA"],"affiliations":[{"raw_affiliation_string":"Columbia University,NY,USA","institution_ids":["https://openalex.org/I78577930"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5033351155","display_name":"Nima Mesgarani","orcid":"https://orcid.org/0000-0002-2987-759X"},"institutions":[{"id":"https://openalex.org/I78577930","display_name":"Columbia University","ror":"https://ror.org/00hj8s172","country_code":"US","type":"education","lineage":["https://openalex.org/I78577930"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Nima Mesgarani","raw_affiliation_strings":["Columbia University,NY,USA"],"affiliations":[{"raw_affiliation_string":"Columbia University,NY,USA","institution_ids":["https://openalex.org/I78577930"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5109460203"],"corresponding_institution_ids":["https://openalex.org/I78577930"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.45536136,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"5"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.2540999948978424,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.2540999948978424,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.15649999678134918,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.09619999676942825,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/bridging","display_name":"Bridging (networking)","score":0.7893999814987183},{"id":"https://openalex.org/keywords/perception","display_name":"Perception","score":0.657800018787384},{"id":"https://openalex.org/keywords/sensory-system","display_name":"Sensory system","score":0.6050999760627747},{"id":"https://openalex.org/keywords/parallels","display_name":"Parallels","score":0.5572999715805054},{"id":"https://openalex.org/keywords/heuristic","display_name":"Heuristic","score":0.453000009059906},{"id":"https://openalex.org/keywords/perspective","display_name":"Perspective (graphical)","score":0.38659998774528503},{"id":"https://openalex.org/keywords/stimulus-modality","display_name":"Stimulus modality","score":0.38600000739097595},{"id":"https://openalex.org/keywords/modalities","display_name":"Modalities","score":0.3797999918460846},{"id":"https://openalex.org/keywords/distillation","display_name":"Distillation","score":0.3725999891757965}],"concepts":[{"id":"https://openalex.org/C174348530","wikidata":"https://www.wikidata.org/wiki/Q188635","display_name":"Bridging (networking)","level":2,"score":0.7893999814987183},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.657800018787384},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6244999766349792},{"id":"https://openalex.org/C94487597","wikidata":"https://www.wikidata.org/wiki/Q11101","display_name":"Sensory system","level":2,"score":0.6050999760627747},{"id":"https://openalex.org/C2775922551","wikidata":"https://www.wikidata.org/wiki/Q7135033","display_name":"Parallels","level":2,"score":0.5572999715805054},{"id":"https://openalex.org/C173801870","wikidata":"https://www.wikidata.org/wiki/Q201413","display_name":"Heuristic","level":2,"score":0.453000009059906},{"id":"https://openalex.org/C12713177","wikidata":"https://www.wikidata.org/wiki/Q1900281","display_name":"Perspective (graphical)","level":2,"score":0.38659998774528503},{"id":"https://openalex.org/C26486553","wikidata":"https://www.wikidata.org/wiki/Q371870","display_name":"Stimulus modality","level":3,"score":0.38600000739097595},{"id":"https://openalex.org/C2779903281","wikidata":"https://www.wikidata.org/wiki/Q6888026","display_name":"Modalities","level":2,"score":0.3797999918460846},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.3788999915122986},{"id":"https://openalex.org/C204030448","wikidata":"https://www.wikidata.org/wiki/Q101017","display_name":"Distillation","level":2,"score":0.3725999891757965},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3571000099182129},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.3569999933242798},{"id":"https://openalex.org/C3017588708","wikidata":"https://www.wikidata.org/wiki/Q758901","display_name":"Audio visual","level":2,"score":0.3499999940395355},{"id":"https://openalex.org/C2779918689","wikidata":"https://www.wikidata.org/wiki/Q3771842","display_name":"Stimulus (psychology)","level":2,"score":0.320499986410141},{"id":"https://openalex.org/C46312422","wikidata":"https://www.wikidata.org/wiki/Q11024","display_name":"Communication","level":1,"score":0.31839999556541443},{"id":"https://openalex.org/C203718221","wikidata":"https://www.wikidata.org/wiki/Q491713","display_name":"Sound (geography)","level":2,"score":0.31369999051094055},{"id":"https://openalex.org/C111370547","wikidata":"https://www.wikidata.org/wiki/Q7451120","display_name":"Sensory cue","level":2,"score":0.2865000069141388},{"id":"https://openalex.org/C2780226545","wikidata":"https://www.wikidata.org/wiki/Q6888030","display_name":"Modality (human\u2013computer interaction)","level":2,"score":0.2833000123500824},{"id":"https://openalex.org/C522192633","wikidata":"https://www.wikidata.org/wiki/Q34228","display_name":"Sign language","level":2,"score":0.2806999981403351},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.28060001134872437},{"id":"https://openalex.org/C3019767756","wikidata":"https://www.wikidata.org/wiki/Q557399","display_name":"Sound perception","level":3,"score":0.27810001373291016},{"id":"https://openalex.org/C178253425","wikidata":"https://www.wikidata.org/wiki/Q162668","display_name":"Visual perception","level":3,"score":0.2761000096797943},{"id":"https://openalex.org/C18762648","wikidata":"https://www.wikidata.org/wiki/Q42213","display_name":"Work (physics)","level":2,"score":0.273499995470047},{"id":"https://openalex.org/C180747234","wikidata":"https://www.wikidata.org/wiki/Q23373","display_name":"Cognitive psychology","level":1,"score":0.2628999948501587},{"id":"https://openalex.org/C93424556","wikidata":"https://www.wikidata.org/wiki/Q1080996","display_name":"Sensory memory","level":4,"score":0.2542000114917755},{"id":"https://openalex.org/C2983448237","wikidata":"https://www.wikidata.org/wiki/Q1078276","display_name":"Language understanding","level":2,"score":0.2526000142097473}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/waspaa66052.2025.11230922","is_oa":false,"landing_page_url":"https://doi.org/10.1109/waspaa66052.2025.11230922","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":19,"referenced_works":["https://openalex.org/W2032635843","https://openalex.org/W2046394129","https://openalex.org/W2113625664","https://openalex.org/W2164364459","https://openalex.org/W2593116425","https://openalex.org/W3015371781","https://openalex.org/W3094550259","https://openalex.org/W3162322471","https://openalex.org/W3196974791","https://openalex.org/W3202670445","https://openalex.org/W4235635492","https://openalex.org/W4312923322","https://openalex.org/W4372266552","https://openalex.org/W4385822985","https://openalex.org/W4390874575","https://openalex.org/W4392172801","https://openalex.org/W4393147243","https://openalex.org/W4412889888","https://openalex.org/W4415800385"],"related_works":[],"abstract_inverted_index":{"Audio":[0],"large":[1],"language":[2],"models":[3],"(LLMs)":[4],"are":[5],"considered":[6],"experts":[7],"at":[8],"recognizing":[9,58],"sound":[10,59,120],"objects,":[11],"yet":[12],"their":[13,32],"performance":[14,75],"relative":[15],"to":[16,29,126,139,145,169],"LLMs":[17,159],"in":[18,57,104,119,134,149,158,173],"other":[19],"sensory":[20,84,156],"modalities,":[21],"such":[22],"as":[23,108,113,123],"visual":[24],"or":[25,35,68],"audio-visual":[26,48],"LLMs,":[27,49],"and":[28,47,53,79,89,111,141,164],"humans":[30,56],"using":[31],"ears,":[33],"eyes,":[34],"both":[36,135],"remains":[37],"unexplored.":[38],"To":[39,91],"investigate":[40],"this,":[41],"we":[42,95],"systematically":[43],"evaluate":[44],"audio,":[45],"visual,":[46],"specifically":[50],"Qwen2-Audio,":[51],"Qwen2-VL,":[52],"Qwen2.5-Omni,":[54],"against":[55],"objects":[60],"of":[61],"different":[62],"classes":[63,121],"from":[64,137,160],"audio-only,":[65],"silent":[66],"video,":[67],"sounded":[69],"video":[70],"inputs.":[71],"We":[72],"uncover":[73],"a":[74,97,130,161,166],"gap":[76,157],"between":[77,86],"Qwen2-Audio":[78,140],"Qwen2-VL":[80,138],"that":[81],"parallels":[82],"the":[83,109,114,127,155],"discrepancy":[85],"human":[87],"ears":[88],"eyes.":[90],"reduce":[92],"this":[93],"gap,":[94],"introduce":[96],"cross-modal":[98],"distillation":[99],"framework,":[100],"where":[101],"an":[102],"LLM":[103],"one":[105],"modality":[106],"serves":[107],"teacher":[110],"another":[112],"student,":[115],"with":[116],"knowledge":[117],"transfer":[118],"predicted":[122],"more":[124],"challenging":[125,150],"student":[128],"by":[129],"heuristic":[131],"model.":[132],"Distillation":[133],"directions,":[136],"vice":[142],"versa,":[143],"leads":[144],"notable":[146],"improvements,":[147],"particularly":[148],"classes.":[151],"This":[152],"work":[153],"highlights":[154],"human-aligned":[162],"perspective":[163],"proposes":[165],"principled":[167],"approach":[168],"enhancing":[170],"modality-specific":[171],"perception":[172],"multimodal":[174],"LLMs.":[175]},"counts_by_year":[],"updated_date":"2026-03-07T16:01:11.037858","created_date":"2025-11-14T00:00:00"}
