{"id":"https://openalex.org/W4417286694","doi":"https://doi.org/10.48550/arxiv.2511.17490","title":"Video-R4: Reinforcing Text-Rich Video Reasoning with Visual Rumination","display_name":"Video-R4: Reinforcing Text-Rich Video Reasoning with Visual Rumination","publication_year":2025,"publication_date":"2025-11-21","ids":{"openalex":"https://openalex.org/W4417286694","doi":"https://doi.org/10.48550/arxiv.2511.17490"},"language":null,"primary_location":{"id":"pmh:oai:arXiv.org:2511.17490","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2511.17490","pdf_url":"https://arxiv.org/pdf/2511.17490","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2511.17490","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5101981298","display_name":"Yubao Tang","orcid":"https://orcid.org/0009-0003-8010-3404"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Tang, Yolo Y.","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5067600053","display_name":"Daiki Shimada","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shimada, Daiki","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5035263361","display_name":"Hua Hang","orcid":"https://orcid.org/0000-0003-1654-0696"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hua, Hang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5091518548","display_name":"Chao Huang","orcid":"https://orcid.org/0009-0003-3740-4500"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Huang, Chao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5107854589","display_name":"Jing Bi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bi, Jing","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5052325109","display_name":"Rog\u00e9rio Feris","orcid":"https://orcid.org/0000-0001-6399-0679"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Feris, Rogerio","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5064805926","display_name":"Chenliang Xu","orcid":"https://orcid.org/0000-0002-2183-822X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xu, Chenliang","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5101981298"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.949400007724762,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.949400007724762,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.00800000037997961,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12026","display_name":"Explainable Artificial Intelligence (XAI)","score":0.007000000216066837,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/rumination","display_name":"Rumination","score":0.7347000241279602},{"id":"https://openalex.org/keywords/construct","display_name":"Construct (python library)","score":0.4643000066280365},{"id":"https://openalex.org/keywords/eye-tracking","display_name":"Eye tracking","score":0.4578999876976013},{"id":"https://openalex.org/keywords/perception","display_name":"Perception","score":0.4357999861240387},{"id":"https://openalex.org/keywords/zoom","display_name":"Zoom","score":0.3982999920845032},{"id":"https://openalex.org/keywords/visual-perception","display_name":"Visual perception","score":0.396699994802475},{"id":"https://openalex.org/keywords/visualization","display_name":"Visualization","score":0.3741999864578247},{"id":"https://openalex.org/keywords/reading","display_name":"Reading (process)","score":0.3416000008583069}],"concepts":[{"id":"https://openalex.org/C2776060345","wikidata":"https://www.wikidata.org/wiki/Q1411912","display_name":"Rumination","level":3,"score":0.7347000241279602},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6155999898910522},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.531000018119812},{"id":"https://openalex.org/C180747234","wikidata":"https://www.wikidata.org/wiki/Q23373","display_name":"Cognitive psychology","level":1,"score":0.5033000111579895},{"id":"https://openalex.org/C2780801425","wikidata":"https://www.wikidata.org/wiki/Q5164392","display_name":"Construct (python library)","level":2,"score":0.4643000066280365},{"id":"https://openalex.org/C56461940","wikidata":"https://www.wikidata.org/wiki/Q970687","display_name":"Eye tracking","level":2,"score":0.4578999876976013},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.4357999861240387},{"id":"https://openalex.org/C124913957","wikidata":"https://www.wikidata.org/wiki/Q1232548","display_name":"Zoom","level":3,"score":0.3982999920845032},{"id":"https://openalex.org/C178253425","wikidata":"https://www.wikidata.org/wiki/Q162668","display_name":"Visual perception","level":3,"score":0.396699994802475},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.37940001487731934},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.3741999864578247},{"id":"https://openalex.org/C554936623","wikidata":"https://www.wikidata.org/wiki/Q199657","display_name":"Reading (process)","level":2,"score":0.3416000008583069},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.31450000405311584},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.3131999969482422},{"id":"https://openalex.org/C2909432541","wikidata":"https://www.wikidata.org/wiki/Q842175","display_name":"Ruminating","level":4,"score":0.30809998512268066},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.3077999949455261},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3043999969959259},{"id":"https://openalex.org/C2779913896","wikidata":"https://www.wikidata.org/wiki/Q7063001","display_name":"Notice","level":2,"score":0.2906999886035919},{"id":"https://openalex.org/C115086926","wikidata":"https://www.wikidata.org/wiki/Q17004651","display_name":"Causal reasoning","level":3,"score":0.2784000039100647},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.27379998564720154},{"id":"https://openalex.org/C160145156","wikidata":"https://www.wikidata.org/wiki/Q778586","display_name":"Executable","level":2,"score":0.2732999920845032},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.2678000032901764},{"id":"https://openalex.org/C2779321571","wikidata":"https://www.wikidata.org/wiki/Q7936605","display_name":"Visual learning","level":2,"score":0.2660999894142151},{"id":"https://openalex.org/C2777413886","wikidata":"https://www.wikidata.org/wiki/Q3276013","display_name":"Fluency","level":2,"score":0.26600000262260437},{"id":"https://openalex.org/C2779304628","wikidata":"https://www.wikidata.org/wiki/Q3503480","display_name":"Face (sociological concept)","level":2,"score":0.26080000400543213},{"id":"https://openalex.org/C97541855","wikidata":"https://www.wikidata.org/wiki/Q830687","display_name":"Reinforcement learning","level":2,"score":0.25279998779296875}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2511.17490","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2511.17490","pdf_url":"https://arxiv.org/pdf/2511.17490","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"doi:10.48550/arxiv.2511.17490","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2511.17490","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2511.17490","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2511.17490","pdf_url":"https://arxiv.org/pdf/2511.17490","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Understanding":[0],"text-rich":[1],"videos":[2],"requires":[3],"reading":[4],"small,":[5],"transient":[6],"textual":[7],"cues":[8],"that":[9,58,101,139],"often":[10],"demand":[11],"repeated":[12],"inspection.":[13],"Yet":[14],"most":[15],"video":[16,55,136],"QA":[17],"models":[18],"rely":[19],"on":[20,31,123],"single-pass":[21],"perception":[22],"over":[23],"fixed":[24],"frames,":[25,64],"leading":[26],"to":[27,107,128],"hallucinations":[28],"and":[29,40,72,89,110,116,125,134],"failures":[30],"fine-grained":[32],"evidence.":[33],"Inspired":[34],"by":[35],"how":[36],"humans":[37],"pause,":[38],"zoom,":[39],"re-read":[41],"critical":[42],"regions,":[43,68],"we":[44],"introduce":[45],"Video-R4":[46],"(Reinforcing":[47],"Text-Rich":[48],"Video":[49],"Reasoning":[50],"with":[51,81],"Visual":[52],"Rumination),":[53],"a":[54,96,104],"reasoning":[56,75],"LMM":[57,106],"performs":[59],"visual":[60,112],"rumination:":[61],"iteratively":[62],"selecting":[63],"zooming":[65],"into":[66],"informative":[67],"re-encoding":[69],"retrieved":[70],"pixels,":[71],"updating":[73],"its":[74],"state.":[76],"We":[77,94],"construct":[78],"two":[79],"datasets":[80],"executable":[82],"rumination":[83,98,141],"trajectories:":[84],"Video-R4-CoT-17k":[85],"for":[86,91,146],"supervised":[87],"practice":[88],"Video-R4-RL-30k":[90],"reinforcement":[92],"learning.":[93],"propose":[95],"multi-stage":[97],"learning":[99],"framework":[100],"progressively":[102],"finetunes":[103],"7B":[105],"learn":[108],"atomic":[109],"mixing":[111],"operations":[113],"via":[114],"SFT":[115],"GRPO-based":[117],"RL.":[118],"Video-R4-7B":[119],"achieves":[120],"state-of-the-art":[121],"results":[122],"M4-ViteVQA":[124],"further":[126],"generalizes":[127],"multi-page":[129],"document":[130],"QA,":[131,133,137],"slides":[132],"generic":[135],"demonstrating":[138],"iterative":[140],"is":[142],"an":[143],"effective":[144],"paradigm":[145],"pixel-grounded":[147],"multimodal":[148],"reasoning.":[149],"Project":[150],"Page:":[151],"https://yunlong10.github.io/Video-R4/":[152]},"counts_by_year":[],"updated_date":"2026-03-07T16:01:11.037858","created_date":"2025-11-25T00:00:00"}
