{"id":"https://openalex.org/W7138234034","doi":"https://doi.org/10.1609/aaai.v40i37.40431","title":"Causal Tracing of Object Representations in Large Vision Language Models: Mechanistic Interpretability and Hallucination Mitigation","display_name":"Causal Tracing of Object Representations in Large Vision Language Models: Mechanistic Interpretability and Hallucination Mitigation","publication_year":2026,"publication_date":"2026-03-14","ids":{"openalex":"https://openalex.org/W7138234034","doi":"https://doi.org/10.1609/aaai.v40i37.40431"},"language":null,"primary_location":{"id":"doi:10.1609/aaai.v40i37.40431","is_oa":true,"landing_page_url":"https://doi.org/10.1609/aaai.v40i37.40431","pdf_url":null,"source":{"id":"https://openalex.org/S4210191458","display_name":"Proceedings of the AAAI Conference on Artificial Intelligence","issn_l":"2159-5399","issn":["2159-5399","2374-3468"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/P4310320058","host_organization_name":"Association for the Advancement of Artificial Intelligence","host_organization_lineage":["https://openalex.org/P4310320058"],"host_organization_lineage_names":["Association for the Advancement of Artificial Intelligence"],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the AAAI Conference on Artificial Intelligence","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"diamond","oa_url":"https://doi.org/10.1609/aaai.v40i37.40431","any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5129745320","display_name":"Qiming Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Qiming Li","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129716463","display_name":"Zekai Ye","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zekai Ye","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129698670","display_name":"Xiaocheng Feng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xiaocheng Feng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5085410631","display_name":"Weihong Zhong","orcid":"https://orcid.org/0000-0002-5673-2222"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Weihong Zhong","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5111073904","display_name":"Weitao Ma","orcid":"https://orcid.org/0009-0007-8631-3858"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Weitao Ma","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5123019482","display_name":"Xiachong Feng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xiachong Feng","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5129745320"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.57196122,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"40","issue":"37","first_page":"31645","last_page":"31653"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.8248999714851379,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.8248999714851379,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.06629999727010727,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.03150000050663948,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/interpretability","display_name":"Interpretability","score":0.9276999831199646},{"id":"https://openalex.org/keywords/object","display_name":"Object (grammar)","score":0.6327999830245972},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.5533999800682068},{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.54339998960495},{"id":"https://openalex.org/keywords/tracing","display_name":"Tracing","score":0.5397999882698059},{"id":"https://openalex.org/keywords/causal-model","display_name":"Causal model","score":0.46219998598098755},{"id":"https://openalex.org/keywords/causal-chain","display_name":"Causal chain","score":0.45730000734329224},{"id":"https://openalex.org/keywords/visualization","display_name":"Visualization","score":0.43810001015663147},{"id":"https://openalex.org/keywords/perception","display_name":"Perception","score":0.42500001192092896}],"concepts":[{"id":"https://openalex.org/C2781067378","wikidata":"https://www.wikidata.org/wiki/Q17027399","display_name":"Interpretability","level":2,"score":0.9276999831199646},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7178000211715698},{"id":"https://openalex.org/C2781238097","wikidata":"https://www.wikidata.org/wiki/Q175026","display_name":"Object (grammar)","level":2,"score":0.6327999830245972},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6291999816894531},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.5533999800682068},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.54339998960495},{"id":"https://openalex.org/C138673069","wikidata":"https://www.wikidata.org/wiki/Q322229","display_name":"Tracing","level":2,"score":0.5397999882698059},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.4726000130176544},{"id":"https://openalex.org/C11671645","wikidata":"https://www.wikidata.org/wiki/Q5054567","display_name":"Causal model","level":2,"score":0.46219998598098755},{"id":"https://openalex.org/C79897977","wikidata":"https://www.wikidata.org/wiki/Q5054568","display_name":"Causal chain","level":2,"score":0.45730000734329224},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.43810001015663147},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.42500001192092896},{"id":"https://openalex.org/C178253425","wikidata":"https://www.wikidata.org/wiki/Q162668","display_name":"Visual perception","level":3,"score":0.3885999917984009},{"id":"https://openalex.org/C204323151","wikidata":"https://www.wikidata.org/wiki/Q905424","display_name":"Range (aeronautics)","level":2,"score":0.38690000772476196},{"id":"https://openalex.org/C188147891","wikidata":"https://www.wikidata.org/wiki/Q147638","display_name":"Cognitive science","level":1,"score":0.3813000023365021},{"id":"https://openalex.org/C158600405","wikidata":"https://www.wikidata.org/wiki/Q5054566","display_name":"Causal inference","level":2,"score":0.37770000100135803},{"id":"https://openalex.org/C2779231336","wikidata":"https://www.wikidata.org/wiki/Q7534724","display_name":"Sketch","level":2,"score":0.3610000014305115},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.35659998655319214},{"id":"https://openalex.org/C116834253","wikidata":"https://www.wikidata.org/wiki/Q2039217","display_name":"Identification (biology)","level":2,"score":0.32499998807907104},{"id":"https://openalex.org/C64876066","wikidata":"https://www.wikidata.org/wiki/Q5141226","display_name":"Cognitive neuroscience of visual object recognition","level":3,"score":0.3149999976158142},{"id":"https://openalex.org/C126042441","wikidata":"https://www.wikidata.org/wiki/Q1324888","display_name":"Frame (networking)","level":2,"score":0.30799999833106995},{"id":"https://openalex.org/C168167062","wikidata":"https://www.wikidata.org/wiki/Q1117970","display_name":"Component (thermodynamics)","level":2,"score":0.29490000009536743},{"id":"https://openalex.org/C205711294","wikidata":"https://www.wikidata.org/wiki/Q176953","display_name":"Rendering (computer graphics)","level":2,"score":0.28630000352859497},{"id":"https://openalex.org/C2777508537","wikidata":"https://www.wikidata.org/wiki/Q7936620","display_name":"Visual reasoning","level":2,"score":0.2775000035762787},{"id":"https://openalex.org/C2780226545","wikidata":"https://www.wikidata.org/wiki/Q6888030","display_name":"Modality (human\u2013computer interaction)","level":2,"score":0.26930001378059387},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.26269999146461487}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1609/aaai.v40i37.40431","is_oa":true,"landing_page_url":"https://doi.org/10.1609/aaai.v40i37.40431","pdf_url":null,"source":{"id":"https://openalex.org/S4210191458","display_name":"Proceedings of the AAAI Conference on Artificial Intelligence","issn_l":"2159-5399","issn":["2159-5399","2374-3468"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/P4310320058","host_organization_name":"Association for the Advancement of Artificial Intelligence","host_organization_lineage":["https://openalex.org/P4310320058"],"host_organization_lineage_names":["Association for the Advancement of Artificial Intelligence"],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the AAAI Conference on Artificial Intelligence","raw_type":"journal-article"}],"best_oa_location":{"id":"doi:10.1609/aaai.v40i37.40431","is_oa":true,"landing_page_url":"https://doi.org/10.1609/aaai.v40i37.40431","pdf_url":null,"source":{"id":"https://openalex.org/S4210191458","display_name":"Proceedings of the AAAI Conference on Artificial Intelligence","issn_l":"2159-5399","issn":["2159-5399","2374-3468"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/P4310320058","host_organization_name":"Association for the Advancement of Artificial Intelligence","host_organization_lineage":["https://openalex.org/P4310320058"],"host_organization_lineage_names":["Association for the Advancement of Artificial Intelligence"],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the AAAI Conference on Artificial Intelligence","raw_type":"journal-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Despite":[0],"the":[1,9,30,42,48,72,84,113,120,142],"remarkable":[2],"advancements":[3],"of":[4,33,44,50,87,119,146],"Large":[5],"Vision-Language":[6],"Models":[7],"(LVLMs),":[8],"mechanistic":[10],"interpretability":[11],"remains":[12],"underexplored.":[13],"Existing":[14],"analyses":[15],"are":[16],"insufficiently":[17],"comprehensive":[18],"and":[19,24,29,47,89,103,144,179,184,194,205],"lack":[20],"examination":[21],"covering":[22,83],"visual":[23,76,88,147,166],"textual":[25,90],"tokens,":[26,91],"model":[27,45,94],"components,":[28],"full":[31,85],"range":[32,86],"layers.":[34,109],"This":[35],"limitation":[36],"restricts":[37],"actionable":[38],"insights":[39],"to":[40,115],"improve":[41],"faithfulness":[43],"output":[46],"development":[49],"downstream":[51],"tasks,":[52],"such":[53],"as":[54],"hallucination":[55],"mitigation.":[56],"To":[57],"address":[58],"this":[59],"limitation,":[60],"we":[61,154],"introduce":[62],"Fine-grained":[63],"Cross-modal":[64],"Causal":[65],"Tracing":[66],"(FCCT)":[67],"framework,":[68],"which":[69],"systematically":[70],"quantifies":[71],"causal":[73],"effects":[74],"on":[75,151,173],"object":[77,148,167],"perception.":[78],"FCCT":[79],"conducts":[80],"fine-grained":[81],"analysis":[82,111],"three":[92],"core":[93],"components":[95,178],"including":[96],"multi-head":[97],"self-attention":[98],"(MHSA),":[99],"feed-forward":[100],"networks":[101],"(FFNs),":[102],"hidden":[104],"states,":[105],"across":[106,189],"all":[107],"decoder":[108],"Our":[110],"is":[112],"first":[114],"demonstrate":[116,196],"that":[117,164],"MHSAs":[118],"last":[121],"token":[122],"in":[123,130],"middle":[124],"layers":[125],"play":[126],"a":[127,137,160],"critical":[128],"role":[129],"aggregating":[131],"cross-modal":[132,174],"information,":[133],"while":[134,201],"FFNs":[135],"exhibit":[136],"three-stage":[138],"hierarchical":[139],"progression":[140],"for":[141],"storage":[143],"transfer":[145],"representations.":[149],"Building":[150],"these":[152],"insights,":[153],"propose":[155],"Intermediate":[156],"Representation":[157],"Injection":[158],"(IRI),":[159],"training-free":[161],"inference-time":[162],"technique":[163],"reinforces":[165],"information":[168],"flow":[169],"by":[170],"precisely":[171],"intervening":[172],"representations":[175],"at":[176],"specific":[177],"layers,":[180],"thereby":[181],"enhancing":[182],"perception":[183],"mitigating":[185],"hallucination.":[186],"Consistent":[187],"improvements":[188],"five":[190],"widely":[191],"used":[192],"benchmarks":[193],"LVLMs":[195],"IRI":[197],"achieves":[198],"state-of-the-art":[199],"performance,":[200],"preserving":[202],"inference":[203],"speed":[204],"other":[206],"foundational":[207],"performance.":[208]},"counts_by_year":[],"updated_date":"2026-03-18T06:31:55.123368","created_date":"2026-03-18T00:00:00"}
