{"id":"https://openalex.org/W7123633229","doi":"https://doi.org/10.48550/arxiv.2601.07359","title":"Seeing Right but Saying Wrong: Inter- and Intra-Layer Refinement in MLLMs without Training","display_name":"Seeing Right but Saying Wrong: Inter- and Intra-Layer Refinement in MLLMs without Training","publication_year":2026,"publication_date":"2026-01-12","ids":{"openalex":"https://openalex.org/W7123633229","doi":"https://doi.org/10.48550/arxiv.2601.07359"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2601.07359","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2601.07359","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2601.07359","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Song, Shezheng","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Song, Shezheng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5122991157","display_name":"Shasha Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Shasha","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5122980164","display_name":"Jie Yu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yu, Jie","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":3,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9635999798774719,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9635999798774719,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.004900000058114529,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.004800000227987766,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/focus","display_name":"Focus (optics)","score":0.6061999797821045},{"id":"https://openalex.org/keywords/variety","display_name":"Variety (cybernetics)","score":0.5688999891281128},{"id":"https://openalex.org/keywords/quality","display_name":"Quality (philosophy)","score":0.5277000069618225},{"id":"https://openalex.org/keywords/code","display_name":"Code (set theory)","score":0.4666000008583069},{"id":"https://openalex.org/keywords/training","display_name":"Training (meteorology)","score":0.4569000005722046},{"id":"https://openalex.org/keywords/decoding-methods","display_name":"Decoding methods","score":0.4187000095844269},{"id":"https://openalex.org/keywords/visual-reasoning","display_name":"Visual reasoning","score":0.3587999939918518}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.65420001745224},{"id":"https://openalex.org/C192209626","wikidata":"https://www.wikidata.org/wiki/Q190909","display_name":"Focus (optics)","level":2,"score":0.6061999797821045},{"id":"https://openalex.org/C136197465","wikidata":"https://www.wikidata.org/wiki/Q1729295","display_name":"Variety (cybernetics)","level":2,"score":0.5688999891281128},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.5277000069618225},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.4666000008583069},{"id":"https://openalex.org/C2777211547","wikidata":"https://www.wikidata.org/wiki/Q17141490","display_name":"Training (meteorology)","level":2,"score":0.4569000005722046},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4410000145435333},{"id":"https://openalex.org/C57273362","wikidata":"https://www.wikidata.org/wiki/Q576722","display_name":"Decoding methods","level":2,"score":0.4187000095844269},{"id":"https://openalex.org/C180747234","wikidata":"https://www.wikidata.org/wiki/Q23373","display_name":"Cognitive psychology","level":1,"score":0.3797999918460846},{"id":"https://openalex.org/C2777508537","wikidata":"https://www.wikidata.org/wiki/Q7936620","display_name":"Visual reasoning","level":2,"score":0.3587999939918518},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.351500004529953},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.3424000144004822},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.3325999975204468},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.30649998784065247},{"id":"https://openalex.org/C50335755","wikidata":"https://www.wikidata.org/wiki/Q483247","display_name":"Phenomenon","level":2,"score":0.2980000078678131},{"id":"https://openalex.org/C2780791683","wikidata":"https://www.wikidata.org/wiki/Q846785","display_name":"Action (physics)","level":2,"score":0.2851000130176544},{"id":"https://openalex.org/C2776502983","wikidata":"https://www.wikidata.org/wiki/Q690182","display_name":"Contrast (vision)","level":2,"score":0.2782999873161316},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.2775000035762787},{"id":"https://openalex.org/C2780910867","wikidata":"https://www.wikidata.org/wiki/Q1952416","display_name":"Multimodality","level":2,"score":0.26919999718666077}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2601.07359","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2601.07359","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2601.07359","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2601.07359","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","score":0.4745038151741028,"display_name":"Quality Education"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Multimodal":[0],"Large":[1],"Language":[2],"Models":[3],"(MLLMs)":[4],"have":[5],"demonstrated":[6],"strong":[7],"capabilities":[8],"across":[9,159],"a":[10,21,48,61,80],"variety":[11],"of":[12,96],"vision-language":[13],"tasks.":[14],"However,":[15],"their":[16],"internal":[17],"reasoning":[18],"often":[19],"exhibits":[20],"critical":[22],"inconsistency:":[23],"although":[24],"deeper":[25],"layers":[26,120],"may":[27],"attend":[28],"to":[29],"the":[30,52,87,108,111,123,153],"correct":[31,112],"visual":[32,88],"regions,":[33,141],"final":[34],"predictions":[35],"are":[36],"frequently":[37],"misled":[38],"by":[39,115],"noisy":[40],"attention":[41,125,135,144],"from":[42],"earlier":[43],"layers.":[44],"This":[45],"results":[46],"in":[47,110],"disconnect":[49],"between":[50,119],"what":[51,57],"model":[53,157],"internally":[54],"understands":[55],"and":[56,155,174],"it":[58,67,71],"ultimately":[59],"expresses,":[60],"phenomenon":[62],"we":[63,77],"describe":[64],"as":[65],"seeing":[66],"right":[68],"but":[69],"saying":[70],"wrong.":[72],"To":[73],"address":[74],"this":[75],"issue,":[76],"propose":[78],"DualPD,":[79],"dual-perspective":[81],"decoding":[82],"refinement":[83],"strategy":[84],"that":[85,121,137,164],"enhances":[86],"understanding":[89],"without":[90,169],"any":[91],"additional":[92],"training.":[93],"DualPD":[94,165],"consists":[95],"two":[97],"components.":[98],"(1)":[99],"The":[100,128,176],"layer-wise":[101],"attention-guided":[102],"contrastive":[103],"logits":[104,118],"module":[105,132],"captures":[106],"how":[107],"belief":[109],"answer":[113],"evolves":[114],"comparing":[116],"output":[117],"exhibit":[122],"largest":[124],"shift.":[126],"(2)":[127],"head-wise":[129],"information":[130],"filtering":[131],"suppresses":[133],"low-contribution":[134],"heads":[136],"focus":[138],"on":[139,151],"irrelevant":[140],"thereby":[142],"improving":[143],"quality":[145],"within":[146],"each":[147],"layer.":[148],"Experiments":[149],"conducted":[150],"both":[152],"LLaVA":[154],"Qwen-VL":[156],"families":[158],"multiple":[160],"multimodal":[161],"benchmarks":[162],"demonstrate":[163],"consistently":[166],"improves":[167],"accuracy":[168],"training,":[170],"confirming":[171],"its":[172],"effectiveness":[173],"generalizability.":[175],"code":[177],"will":[178],"be":[179],"released":[180],"upon":[181],"publication.":[182]},"counts_by_year":[],"updated_date":"2026-03-25T23:56:10.502304","created_date":"2026-01-14T00:00:00"}
