{"id":"https://openalex.org/W7130693862","doi":"https://doi.org/10.48550/arxiv.2602.17555","title":"GraphThinker: Reinforcing Temporally Grounded Video Reasoning with Event Graph Thinking","display_name":"GraphThinker: Reinforcing Temporally Grounded Video Reasoning with Event Graph Thinking","publication_year":2026,"publication_date":"2026-02-19","ids":{"openalex":"https://openalex.org/W7130693862","doi":"https://doi.org/10.48550/arxiv.2602.17555"},"language":null,"primary_location":{"id":"pmh:doi:10.48550/arxiv.2602.17555","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":null,"any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5021380222","display_name":"Zixu Cheng","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Cheng, Zixu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126508589","display_name":"Da Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Da","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126505860","display_name":"Jian Hu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hu, Jian","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Zang, Yuhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zang, Yuhang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126484437","display_name":"Wei Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Ziquan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126484506","display_name":"Shaogang Gong","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gong, Shaogang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5126700793","display_name":"Wei Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Wei","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5021380222"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9239000082015991,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9239000082015991,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11273","display_name":"Advanced Graph Neural Networks","score":0.020400000736117363,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":0.010599999688565731,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/event","display_name":"Event (particle physics)","score":0.6626999974250793},{"id":"https://openalex.org/keywords/causal-reasoning","display_name":"Causal reasoning","score":0.5734999775886536},{"id":"https://openalex.org/keywords/construct","display_name":"Construct (python library)","score":0.550599992275238},{"id":"https://openalex.org/keywords/object","display_name":"Object (grammar)","score":0.4713999927043915},{"id":"https://openalex.org/keywords/causal-model","display_name":"Causal model","score":0.4253000020980835},{"id":"https://openalex.org/keywords/graph","display_name":"Graph","score":0.42170000076293945},{"id":"https://openalex.org/keywords/video-tracking","display_name":"Video tracking","score":0.4011000096797943},{"id":"https://openalex.org/keywords/commonsense-reasoning","display_name":"Commonsense reasoning","score":0.35179999470710754},{"id":"https://openalex.org/keywords/visual-reasoning","display_name":"Visual reasoning","score":0.3375000059604645}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7763000130653381},{"id":"https://openalex.org/C2779662365","wikidata":"https://www.wikidata.org/wiki/Q5416694","display_name":"Event (particle physics)","level":2,"score":0.6626999974250793},{"id":"https://openalex.org/C115086926","wikidata":"https://www.wikidata.org/wiki/Q17004651","display_name":"Causal reasoning","level":3,"score":0.5734999775886536},{"id":"https://openalex.org/C2780801425","wikidata":"https://www.wikidata.org/wiki/Q5164392","display_name":"Construct (python library)","level":2,"score":0.550599992275238},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.544700026512146},{"id":"https://openalex.org/C2781238097","wikidata":"https://www.wikidata.org/wiki/Q175026","display_name":"Object (grammar)","level":2,"score":0.4713999927043915},{"id":"https://openalex.org/C11671645","wikidata":"https://www.wikidata.org/wiki/Q5054567","display_name":"Causal model","level":2,"score":0.4253000020980835},{"id":"https://openalex.org/C132525143","wikidata":"https://www.wikidata.org/wiki/Q141488","display_name":"Graph","level":2,"score":0.42170000076293945},{"id":"https://openalex.org/C202474056","wikidata":"https://www.wikidata.org/wiki/Q1931635","display_name":"Video tracking","level":3,"score":0.4011000096797943},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3621000051498413},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.3555000126361847},{"id":"https://openalex.org/C193221554","wikidata":"https://www.wikidata.org/wiki/Q5153664","display_name":"Commonsense reasoning","level":2,"score":0.35179999470710754},{"id":"https://openalex.org/C2777508537","wikidata":"https://www.wikidata.org/wiki/Q7936620","display_name":"Visual reasoning","level":2,"score":0.3375000059604645},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.3352000117301941},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.32749998569488525},{"id":"https://openalex.org/C163504300","wikidata":"https://www.wikidata.org/wiki/Q2364925","display_name":"Causal structure","level":2,"score":0.32409998774528503},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.3165999948978424},{"id":"https://openalex.org/C2988167200","wikidata":"https://www.wikidata.org/wiki/Q16885149","display_name":"Online video","level":2,"score":0.2987000048160553},{"id":"https://openalex.org/C2986089797","wikidata":"https://www.wikidata.org/wiki/Q6501338","display_name":"Visual attention","level":3,"score":0.2957000136375427},{"id":"https://openalex.org/C2780760462","wikidata":"https://www.wikidata.org/wiki/Q7927952","display_name":"Video modeling","level":4,"score":0.29019999504089355},{"id":"https://openalex.org/C146849305","wikidata":"https://www.wikidata.org/wiki/Q370766","display_name":"Ground truth","level":2,"score":0.2892000079154968},{"id":"https://openalex.org/C2777810175","wikidata":"https://www.wikidata.org/wiki/Q5416730","display_name":"Event structure","level":2,"score":0.27309998869895935},{"id":"https://openalex.org/C169900460","wikidata":"https://www.wikidata.org/wiki/Q2200417","display_name":"Cognition","level":2,"score":0.27000001072883606},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.26829999685287476},{"id":"https://openalex.org/C27511587","wikidata":"https://www.wikidata.org/wiki/Q2178623","display_name":"Spatial relation","level":2,"score":0.26100000739097595},{"id":"https://openalex.org/C2982912361","wikidata":"https://www.wikidata.org/wiki/Q1851867","display_name":"Mental model","level":2,"score":0.2563999891281128},{"id":"https://openalex.org/C105842133","wikidata":"https://www.wikidata.org/wiki/Q1899679","display_name":"Visual communication","level":2,"score":0.2558000087738037},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.2551000118255615},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.2506999969482422}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:doi:10.48550/arxiv.2602.17555","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},{"id":"doi:10.48550/arxiv.2602.17555","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2602.17555","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:doi:10.48550/arxiv.2602.17555","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Video":[0,131],"reasoning":[1,69,120,146],"requires":[2],"a":[3,100,106,111,143,157,197,206],"fine-grained":[4],"understanding":[5],"of":[6,37,47,110],"the":[7,45,91,151,167,177,192,218],"temporal":[8,29,202],"dependencies":[9],"and":[10,15,44,81,113,139,205],"event-level":[11],"relations":[12],"between":[13],"objects":[14],"events":[16],"in":[17,31,70,186,200,209,212],"videos.":[18],"Current":[19],"Multimodal":[20],"Large":[21],"Language":[22],"Models":[23,54],"(MLLMs)":[24],"are":[25,78],"prone":[26],"to":[27,83,89,117,127,169,172,217],"severe":[28],"hallucinations":[30,39],"video":[32,112,145],"reasoning.":[33,93],"An":[34],"underlying":[35],"cause":[36],"these":[38,75],"is":[40],"weak":[41,152],"visual-temporal":[42],"grounding":[43,116,153],"lack":[46],"explicit":[48,85],"structure":[49],"for":[50,188],"modelling":[51],"event":[52,108],"relations.":[53],"often":[55],"rely":[56],"on":[57],"auxiliary":[58],"text,":[59],"such":[60],"as":[61],"dense":[62],"captions,":[63],"rather":[64],"than":[65],"explicitly":[66],"anchoring":[67],"their":[68],"actual":[71],"visual":[72,115,159,174],"evidence.":[73],"However,":[74],"textual":[76],"representations":[77],"inherently":[79],"unstructured":[80],"fail":[82],"provide":[84],"causal":[86],"constraints":[87],"needed":[88],"guide":[90],"model's":[92],"In":[94],"this":[95],"work,":[96],"we":[97,123,149],"propose":[98],"GraphThinker,":[99],"reinforcement":[101,163],"finetuning":[102,164],"method":[103],"that":[104,135,165],"constructs":[105],"structured":[107,144],"representation":[109],"enforces":[114],"jointly":[118],"reduce":[119],"hallucinations.":[121],"Specifically,":[122],"employ":[124],"an":[125,129,182],"MLLM":[126],"construct":[128],"Event-based":[130],"Scene":[132],"Graph":[133],"(EVSG)":[134],"captures":[136],"both":[137],"intra-":[138],"inter-event":[140],"relations,":[141],"guiding":[142],"process.":[147],"Moreover,":[148],"address":[150],"issue":[154],"by":[155],"introducing":[156],"novel":[158],"attention":[160],"reward":[161],"during":[162],"encourages":[166],"model":[168],"actively":[170],"attend":[171],"reliable":[173],"cues.":[175],"On":[176,191],"RexTime":[178],"dataset,":[179,194],"GraphThinker":[180,195],"achieves":[181,196],"over":[183],"4%":[184],"improvement":[185,199],"IoU=0.3":[187],"moment":[189],"localisation.":[190],"VidHalluc":[193],"9.8%":[198],"reducing":[201,213],"sequence":[203],"hallucination":[204],"7.6%":[207],"gain":[208],"Binary":[210],"QA":[211],"action":[214],"hallucination,":[215],"compared":[216],"state-of-the-art":[219],"methods.":[220]},"counts_by_year":[],"updated_date":"2026-05-15T06:05:50.897203","created_date":"2026-02-21T00:00:00"}
