{"id":"https://openalex.org/W7156720581","doi":"https://doi.org/10.48550/arxiv.2604.23198","title":"StoryTR: Narrative-Centric Video Temporal Retrieval with Theory of Mind Reasoning","display_name":"StoryTR: Narrative-Centric Video Temporal Retrieval with Theory of Mind Reasoning","publication_year":2026,"publication_date":"2026-04-25","ids":{"openalex":"https://openalex.org/W7156720581","doi":"https://doi.org/10.48550/arxiv.2604.23198"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.23198","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.23198","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.23198","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5134761910","display_name":"Xuanyue Zhong","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhong, Xuanyue","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134770641","display_name":"Yuqiang Xie","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xie, Yuqiang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5016119118","display_name":"Guanqun Bi","orcid":"https://orcid.org/0000-0001-8829-9489"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bi, Guanqun","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134817768","display_name":"Jiangping Yang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Jiangping","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5134762056","display_name":"Guibin Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Guibin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":5,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9728999733924866,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9728999733924866,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":0.008100000210106373,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11439","display_name":"Video Analysis and Summarization","score":0.0032999999821186066,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/narrative","display_name":"Narrative","score":0.7876999974250793},{"id":"https://openalex.org/keywords/meaning","display_name":"Meaning (existential)","score":0.6317999958992004},{"id":"https://openalex.org/keywords/semantics","display_name":"Semantics (computer science)","score":0.5888000130653381},{"id":"https://openalex.org/keywords/perception","display_name":"Perception","score":0.5205000042915344},{"id":"https://openalex.org/keywords/moment","display_name":"Moment (physics)","score":0.48559999465942383},{"id":"https://openalex.org/keywords/causality","display_name":"Causality (physics)","score":0.47850000858306885},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.4390999972820282}],"concepts":[{"id":"https://openalex.org/C199033989","wikidata":"https://www.wikidata.org/wiki/Q1318295","display_name":"Narrative","level":2,"score":0.7876999974250793},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6947000026702881},{"id":"https://openalex.org/C2780876879","wikidata":"https://www.wikidata.org/wiki/Q3054749","display_name":"Meaning (existential)","level":2,"score":0.6317999958992004},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.5888000130653381},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5766000151634216},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.5205000042915344},{"id":"https://openalex.org/C179254644","wikidata":"https://www.wikidata.org/wiki/Q13222844","display_name":"Moment (physics)","level":2,"score":0.48559999465942383},{"id":"https://openalex.org/C64357122","wikidata":"https://www.wikidata.org/wiki/Q1149766","display_name":"Causality (physics)","level":2,"score":0.47850000858306885},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.45179998874664307},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.4390999972820282},{"id":"https://openalex.org/C188147891","wikidata":"https://www.wikidata.org/wiki/Q147638","display_name":"Cognitive science","level":1,"score":0.40560001134872437},{"id":"https://openalex.org/C2780861071","wikidata":"https://www.wikidata.org/wiki/Q1062934","display_name":"Character (mathematics)","level":2,"score":0.3756999969482422},{"id":"https://openalex.org/C2776639384","wikidata":"https://www.wikidata.org/wiki/Q840396","display_name":"Ideal (ethics)","level":2,"score":0.3709000051021576},{"id":"https://openalex.org/C169900460","wikidata":"https://www.wikidata.org/wiki/Q2200417","display_name":"Cognition","level":2,"score":0.350600004196167},{"id":"https://openalex.org/C96199812","wikidata":"https://www.wikidata.org/wiki/Q2145290","display_name":"Mental representation","level":3,"score":0.34709998965263367},{"id":"https://openalex.org/C2779560602","wikidata":"https://www.wikidata.org/wiki/Q639219","display_name":"Theory of mind","level":3,"score":0.3434000015258789},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.33959999680519104},{"id":"https://openalex.org/C180747234","wikidata":"https://www.wikidata.org/wiki/Q23373","display_name":"Cognitive psychology","level":1,"score":0.3260999917984009},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.31349998712539673},{"id":"https://openalex.org/C100609095","wikidata":"https://www.wikidata.org/wiki/Q1335050","display_name":"Embodied cognition","level":2,"score":0.3003000020980835},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.27880001068115234}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.23198","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.23198","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.23198","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.23198","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Current":[0],"video":[1,58],"moment":[2,59],"retrieval":[3,60],"excels":[4],"at":[5],"action-centric":[6],"tasks":[7],"but":[8,19],"struggles":[9],"with":[10,94,141],"narrative":[11,48,69,148],"content.":[12],"Models":[13],"can":[14],"see":[15],"\\textit{what":[16],"is":[17,109,112],"happening}":[18],"fail":[20],"to":[21,41,114],"reason":[22],"\\textit{why":[23],"it":[24],"matters}.":[25],"This":[26],"semantic":[27],"gap":[28],"stems":[29],"from":[30,50,68],"the":[31,38,56,102,154,157],"lack":[32],"of":[33,35,156],"\\textbf{Theory":[34],"Mind":[36],"(ToM)}:":[37],"cognitive":[39],"ability":[40],"infer":[42],"implicit":[43],"intentions,":[44],"mental":[45],"states,":[46],"and":[47],"causality":[49],"surface-level":[51],"observations.":[52],"We":[53],"introduce":[54],"\\textbf{StoryTR},":[55],"first":[57],"benchmark":[61],"requiring":[62],"ToM":[63,111,144],"reasoning,":[64,149],"comprising":[65],"8.1k":[66],"samples":[67],"short-form":[70],"videos":[71,74],"(shorts/reels).":[72],"These":[73],"present":[75],"an":[76,133],"ideal":[77],"testbed.":[78],"Their":[79],"high":[80],"information":[81],"density":[82],"encodes":[83],"meaning":[84],"through":[85],"subtle":[86],"multimodal":[87,106],"cues.":[88],"For":[89],"instance,":[90],"a":[91,95,117],"glance":[92,103],"paired":[93],"sigh":[96],"carries":[97],"entirely":[98],"different":[99],"semantics":[100],"than":[101,190],"alone.":[104],"Yet":[105],"perception":[107],"alone":[108],"insufficient;":[110],"required":[113],"decode":[115],"that":[116,137,184],"character":[118],"``smiling''":[119],"may":[120],"actually":[121],"be":[122],"``concealing":[123],"hostility.''":[124],"To":[125],"teach":[126],"models":[127],"this":[128],"reasoning":[129,158,186],"capability,":[130],"we":[131],"propose":[132],"\\textbf{Agentic":[134],"Data":[135],"Pipeline}":[136],"generates":[138],"training":[139],"data":[140],"explicit":[142],"three-tier":[143],"chains":[145],"(intent":[146],"decoding,":[147],"boundary":[150],"localization).":[151],"Experiments":[152],"reveal":[153],"severity":[155],"gap:":[159],"Gemini-3.0-Pro":[160],"achieves":[161],"only":[162],"0.53":[163],"Avg":[164],"IoU":[165,180],"on":[166,174],"StoryTR.":[167],"However,":[168],"our":[169],"7B":[170],"\\textbf{Shorts-Moment}":[171],"model,":[172],"trained":[173],"ToM-guided":[175],"data,":[176],"improves":[177],"+15.1\\%":[178],"relative":[179],"over":[181],"baselines,":[182],"demonstrating":[183],"\\textit{narrative":[185],"capability":[187],"matters":[188],"more":[189],"parameter":[191],"scale}.":[192]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-04-29T00:00:00"}
