{"id":"https://openalex.org/W7162118248","doi":"https://doi.org/10.48550/arxiv.2605.22072","title":"Faithful-MR1: Faithful Multimodal Reasoning via Anchoring and Reinforcing Visual Attention","display_name":"Faithful-MR1: Faithful Multimodal Reasoning via Anchoring and Reinforcing Visual Attention","publication_year":2026,"publication_date":"2026-05-21","ids":{"openalex":"https://openalex.org/W7162118248","doi":"https://doi.org/10.48550/arxiv.2605.22072"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.22072","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.22072","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.22072","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5052887619","display_name":"Changyuan Tian","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tian, Changyuan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136785963","display_name":"Zhicong Lu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lu, Zhicong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136739034","display_name":"Huaxing Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Huaxing","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136752570","display_name":"Xiang Wang","orcid":"https://orcid.org/0009-0008-4746-7384"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Xiang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136788033","display_name":"Shuai Li","orcid":"https://orcid.org/0009-0001-1784-1661"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Shuai","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136781708","display_name":"Chen, Yu, 1979-","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Yu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136792948","display_name":"Wenqian Lv","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lv, Wenqian","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136789262","display_name":"Zichuan Lin","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lin, Zichuan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136811125","display_name":"Juncheng Diao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Diao, Juncheng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5136763638","display_name":"Deheng Ye","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ye, Deheng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":10,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9362000226974487,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9362000226974487,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.006200000178068876,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.006099999882280827,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/perception","display_name":"Perception","score":0.6283000111579895},{"id":"https://openalex.org/keywords/counterfactual-thinking","display_name":"Counterfactual thinking","score":0.609000027179718},{"id":"https://openalex.org/keywords/anchoring","display_name":"Anchoring","score":0.5669000148773193},{"id":"https://openalex.org/keywords/visual-attention","display_name":"Visual attention","score":0.5217000246047974},{"id":"https://openalex.org/keywords/visual-perception","display_name":"Visual perception","score":0.4189000129699707},{"id":"https://openalex.org/keywords/closed-captioning","display_name":"Closed captioning","score":0.3698999881744385},{"id":"https://openalex.org/keywords/verifiable-secret-sharing","display_name":"Verifiable secret sharing","score":0.3571999967098236},{"id":"https://openalex.org/keywords/inattentional-blindness","display_name":"Inattentional blindness","score":0.34869998693466187}],"concepts":[{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.6283000111579895},{"id":"https://openalex.org/C108650721","wikidata":"https://www.wikidata.org/wiki/Q1783253","display_name":"Counterfactual thinking","level":2,"score":0.609000027179718},{"id":"https://openalex.org/C180747234","wikidata":"https://www.wikidata.org/wiki/Q23373","display_name":"Cognitive psychology","level":1,"score":0.5913000106811523},{"id":"https://openalex.org/C18483071","wikidata":"https://www.wikidata.org/wiki/Q168432","display_name":"Anchoring","level":2,"score":0.5669000148773193},{"id":"https://openalex.org/C2986089797","wikidata":"https://www.wikidata.org/wiki/Q6501338","display_name":"Visual attention","level":3,"score":0.5217000246047974},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.49630001187324524},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.4426000118255615},{"id":"https://openalex.org/C178253425","wikidata":"https://www.wikidata.org/wiki/Q162668","display_name":"Visual perception","level":3,"score":0.4189000129699707},{"id":"https://openalex.org/C188147891","wikidata":"https://www.wikidata.org/wiki/Q147638","display_name":"Cognitive science","level":1,"score":0.41510000824928284},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.39239999651908875},{"id":"https://openalex.org/C157657479","wikidata":"https://www.wikidata.org/wiki/Q2367247","display_name":"Closed captioning","level":3,"score":0.3698999881744385},{"id":"https://openalex.org/C85847156","wikidata":"https://www.wikidata.org/wiki/Q59015987","display_name":"Verifiable secret sharing","level":3,"score":0.3571999967098236},{"id":"https://openalex.org/C93173128","wikidata":"https://www.wikidata.org/wiki/Q287827","display_name":"Inattentional blindness","level":3,"score":0.34869998693466187},{"id":"https://openalex.org/C56461940","wikidata":"https://www.wikidata.org/wiki/Q970687","display_name":"Eye tracking","level":2,"score":0.3375000059604645},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.29809999465942383},{"id":"https://openalex.org/C2777508537","wikidata":"https://www.wikidata.org/wiki/Q7936620","display_name":"Visual reasoning","level":2,"score":0.2964000105857849},{"id":"https://openalex.org/C2780154230","wikidata":"https://www.wikidata.org/wiki/Q513420","display_name":"Undo","level":2,"score":0.28679999709129333},{"id":"https://openalex.org/C2777379011","wikidata":"https://www.wikidata.org/wiki/Q938545","display_name":"Implicit learning","level":3,"score":0.28040000796318054},{"id":"https://openalex.org/C97541855","wikidata":"https://www.wikidata.org/wiki/Q830687","display_name":"Reinforcement learning","level":2,"score":0.2784999907016754},{"id":"https://openalex.org/C137878579","wikidata":"https://www.wikidata.org/wiki/Q9636076","display_name":"Joint attention","level":3,"score":0.2718999981880188},{"id":"https://openalex.org/C2779321571","wikidata":"https://www.wikidata.org/wiki/Q7936605","display_name":"Visual learning","level":2,"score":0.2614000141620636},{"id":"https://openalex.org/C158495155","wikidata":"https://www.wikidata.org/wiki/Q2369151","display_name":"Visual search","level":2,"score":0.2597000002861023},{"id":"https://openalex.org/C111370547","wikidata":"https://www.wikidata.org/wiki/Q7451120","display_name":"Sensory cue","level":2,"score":0.25929999351501465}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.22072","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.22072","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.22072","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.22072","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Reinforcement":[0],"learning":[1],"with":[2],"verifiable":[3],"rewards":[4],"(RLVR)":[5],"has":[6],"emerged":[7],"as":[8],"a":[9,35,101,129],"promising":[10],"paradigm":[11],"for":[12],"advancing":[13],"complex":[14],"reasoning":[15,171],"in":[16],"large":[17,27],"language":[18,28],"models,":[19],"and":[20,44,74,106,177],"recent":[21,169],"work":[22],"extends":[23],"RLVR":[24],"to":[25,53,110],"multimodal":[26,57,116,170],"models":[29],"(MLLMs).":[30],"This":[31],"transfer,":[32],"however,":[33],"surfaces":[34],"faithfulness":[36],"challenge:":[37],"faithful":[38,45,75,115,146],"perception":[39,61,122],"of":[40,47,114],"task-relevant":[41],"visual":[42,108,157],"evidence":[43,49,87],"use":[46,76,147],"that":[48,104,155,166],"during":[50,92],"reasoning,":[51],"leading":[52],"unsatisfactory":[54],"gains":[55],"on":[56,65,71,173],"benchmarks.":[58],"Specifically,":[59],"existing":[60],"supervision":[62],"often":[63],"operates":[64],"textual":[66,140],"descriptions":[67],"rather":[68,137],"than":[69,138],"natively":[70],"image":[72,135,150],"regions,":[73],"is":[77,88],"largely":[78],"overlooked,":[79],"exposing":[80],"the":[81],"perception-reasoning":[82],"disconnect":[83],"where":[84,159],"correctly":[85],"perceived":[86],"dropped":[89],"or":[90],"contradicted":[91],"reasoning.":[93,117],"To":[94],"close":[95],"these":[96],"gaps,":[97],"we":[98],"propose":[99],"Faithful-MR1,":[100],"training":[102,184],"framework":[103],"anchors":[105],"reinforces":[107],"attention":[109,132,158],"address":[111],"both":[112,174],"halves":[113],"The":[118,142],"Anchoring":[119],"stage":[120,144],"turns":[121],"into":[123],"an":[124],"explicit":[125],"pre-reasoning":[126],"subtask,":[127],"supervising":[128],"dedicated":[130],"token's":[131],"directly":[133],"against":[134],"regions":[136],"through":[139,148],"descriptions.":[141],"Reinforcing":[143],"exposes":[145],"counterfactual":[149],"intervention,":[151],"rewarding":[152],"answer-correct":[153],"trajectories":[154],"concentrate":[156],"vision":[160],"causally":[161],"matters.":[162],"Extensive":[163],"experiments":[164],"demonstrate":[165],"Faithful-MR1":[167],"outperforms":[168],"baselines":[172],"Qwen2.5-VL-Instruct":[175],"3B":[176],"7B":[178],"backbones":[179],"while":[180],"using":[181],"substantially":[182],"less":[183],"data.":[185]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-05-23T00:00:00"}
