{"id":"https://openalex.org/W7161704135","doi":"https://doi.org/10.48550/arxiv.2605.17065","title":"PyraVid: Hierarchical Multimodal Memory for Long-Horizon Video Reasoning","display_name":"PyraVid: Hierarchical Multimodal Memory for Long-Horizon Video Reasoning","publication_year":2026,"publication_date":"2026-05-16","ids":{"openalex":"https://openalex.org/W7161704135","doi":"https://doi.org/10.48550/arxiv.2605.17065"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.17065","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.17065","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.17065","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5132755543","display_name":"Sikuan Yan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yan, Sikuan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136482718","display_name":"Sicheng Dong","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Dong, Sicheng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136474416","display_name":"Haotong Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Haotong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5054893267","display_name":"Ercong Nie","orcid":"https://orcid.org/0000-0003-1453-4460"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Nie, Ercong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136471422","display_name":"Yilun Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Yilun","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136470283","display_name":"Jinhe Bi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bi, Jinhe","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136497653","display_name":"Yingjie Xu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xu, Yingjie","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5040544054","display_name":"Susanna Schwarzmann","orcid":"https://orcid.org/0000-0002-3705-7559"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Schwarzmann, Susanna","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5089631314","display_name":"Riccardo Trivisonno","orcid":"https://orcid.org/0000-0003-4190-5781"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Trivisonno, Riccardo","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136475833","display_name":"Volker Tresp","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tresp, Volker","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5136462264","display_name":"Yunpu Ma","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ma, Yunpu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":11,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9286999702453613,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9286999702453613,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":0.016100000590085983,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12026","display_name":"Explainable Artificial Intelligence (XAI)","score":0.00800000037997961,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/episodic-memory","display_name":"Episodic memory","score":0.554099977016449},{"id":"https://openalex.org/keywords/component","display_name":"Component (thermodynamics)","score":0.5091000199317932},{"id":"https://openalex.org/keywords/cognition","display_name":"Cognition","score":0.47589999437332153},{"id":"https://openalex.org/keywords/event","display_name":"Event (particle physics)","score":0.4336000084877014},{"id":"https://openalex.org/keywords/semantic-memory","display_name":"Semantic memory","score":0.42410001158714294},{"id":"https://openalex.org/keywords/encoding","display_name":"Encoding (memory)","score":0.42260000109672546},{"id":"https://openalex.org/keywords/cognitive-architecture","display_name":"Cognitive architecture","score":0.4050000011920929},{"id":"https://openalex.org/keywords/similarity","display_name":"Similarity (geometry)","score":0.35920000076293945},{"id":"https://openalex.org/keywords/multimodal-interaction","display_name":"Multimodal interaction","score":0.350600004196167}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7538999915122986},{"id":"https://openalex.org/C88576662","wikidata":"https://www.wikidata.org/wiki/Q18646","display_name":"Episodic memory","level":3,"score":0.554099977016449},{"id":"https://openalex.org/C168167062","wikidata":"https://www.wikidata.org/wiki/Q1117970","display_name":"Component (thermodynamics)","level":2,"score":0.5091000199317932},{"id":"https://openalex.org/C169900460","wikidata":"https://www.wikidata.org/wiki/Q2200417","display_name":"Cognition","level":2,"score":0.47589999437332153},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4542999863624573},{"id":"https://openalex.org/C2779662365","wikidata":"https://www.wikidata.org/wiki/Q5416694","display_name":"Event (particle physics)","level":2,"score":0.4336000084877014},{"id":"https://openalex.org/C197914299","wikidata":"https://www.wikidata.org/wiki/Q18650","display_name":"Semantic memory","level":3,"score":0.42410001158714294},{"id":"https://openalex.org/C125411270","wikidata":"https://www.wikidata.org/wiki/Q18653","display_name":"Encoding (memory)","level":2,"score":0.42260000109672546},{"id":"https://openalex.org/C20854674","wikidata":"https://www.wikidata.org/wiki/Q4386060","display_name":"Cognitive architecture","level":3,"score":0.4050000011920929},{"id":"https://openalex.org/C103278499","wikidata":"https://www.wikidata.org/wiki/Q254465","display_name":"Similarity (geometry)","level":3,"score":0.35920000076293945},{"id":"https://openalex.org/C135641252","wikidata":"https://www.wikidata.org/wiki/Q738567","display_name":"Multimodal interaction","level":2,"score":0.350600004196167},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3481000065803528},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.34209999442100525},{"id":"https://openalex.org/C2985957978","wikidata":"https://www.wikidata.org/wiki/Q492","display_name":"Human memory","level":3,"score":0.33070001006126404},{"id":"https://openalex.org/C144986985","wikidata":"https://www.wikidata.org/wiki/Q871236","display_name":"Hierarchical database model","level":2,"score":0.3244999945163727},{"id":"https://openalex.org/C82687282","wikidata":"https://www.wikidata.org/wiki/Q66221","display_name":"Auxiliary memory","level":2,"score":0.3050000071525574},{"id":"https://openalex.org/C30390489","wikidata":"https://www.wikidata.org/wiki/Q4680748","display_name":"Adaptive memory","level":3,"score":0.304500013589859},{"id":"https://openalex.org/C180747234","wikidata":"https://www.wikidata.org/wiki/Q23373","display_name":"Cognitive psychology","level":1,"score":0.30379998683929443},{"id":"https://openalex.org/C188147891","wikidata":"https://www.wikidata.org/wiki/Q147638","display_name":"Cognitive science","level":1,"score":0.3009999990463257},{"id":"https://openalex.org/C89600930","wikidata":"https://www.wikidata.org/wiki/Q1423946","display_name":"Segmentation","level":2,"score":0.2799000144004822},{"id":"https://openalex.org/C142575187","wikidata":"https://www.wikidata.org/wiki/Q3358290","display_name":"Pyramid (geometry)","level":2,"score":0.2791999876499176},{"id":"https://openalex.org/C112049663","wikidata":"https://www.wikidata.org/wiki/Q18608","display_name":"Explicit memory","level":4,"score":0.2784999907016754},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.2754000127315521},{"id":"https://openalex.org/C2779086471","wikidata":"https://www.wikidata.org/wiki/Q2051704","display_name":"False memory","level":3,"score":0.2732999920845032},{"id":"https://openalex.org/C115086926","wikidata":"https://www.wikidata.org/wiki/Q17004651","display_name":"Causal reasoning","level":3,"score":0.26969999074935913},{"id":"https://openalex.org/C21963081","wikidata":"https://www.wikidata.org/wiki/Q11337567","display_name":"Working memory","level":3,"score":0.2590000033378601},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.2547000050544739},{"id":"https://openalex.org/C12186640","wikidata":"https://www.wikidata.org/wiki/Q6815743","display_name":"Memory model","level":3,"score":0.2524000108242035}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.17065","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.17065","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.17065","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.17065","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Memory":[0],"has":[1,23],"become":[2],"an":[3],"increasingly":[4],"important":[5],"component":[6],"of":[7,107,143],"agentic":[8],"systems,":[9],"as":[10],"these":[11],"systems":[12],"are":[13],"expected":[14],"to":[15],"reason":[16],"over":[17],"long-term":[18],"experience.":[19],"However,":[20],"prior":[21],"work":[22],"largely":[24],"focused":[25],"on":[26,122],"unimodal":[27,43],"memory,":[28],"leaving":[29],"multimodal":[30,45,68,145],"memory":[31,46,69,90,100,146],"relatively":[32],"underexplored":[33],"despite":[34],"its":[35],"central":[36],"role":[37],"in":[38],"real-world":[39],"applications.":[40],"Compared":[41],"with":[42,102,110],"settings,":[44],"introduces":[47],"additional":[48],"challenges,":[49],"including":[50],"heterogeneous":[51],"input":[52],"integration,":[53],"person-centric":[54],"information":[55],"alignment,":[56],"and":[57,92,137],"evidence":[58,94],"aggregation":[59],"across":[60,133],"different":[61],"granularities.":[62],"We":[63],"present":[64],"PyraVid,":[65],"a":[66,84],"hierarchical":[67,144],"framework":[70],"inspired":[71],"by":[72],"Event":[73],"Segmentation":[74],"Theory":[75],"from":[76],"cognitive":[77],"science.":[78],"PyraVid":[79,129],"organizes":[80],"long":[81],"videos":[82],"into":[83],"coarse-to-fine":[85],"pyramid":[86],"structure,":[87],"enabling":[88],"structured":[89],"access":[91],"effective":[93],"aggregation.":[95],"It":[96],"further":[97],"supports":[98],"structure-guided":[99],"expansion":[101],"pruning,":[103],"allowing":[104],"the":[105,141],"retrieval":[106],"related":[108],"events":[109],"strong":[111],"causal":[112],"connectivity":[113],"but":[114],"low":[115],"semantic":[116],"similarity":[117],"while":[118],"reducing":[119],"noise.":[120],"Experiments":[121],"multiple":[123],"long-video":[124],"understanding":[125],"benchmarks":[126],"show":[127],"that":[128],"consistently":[130],"improves":[131],"performance":[132],"datasets,":[134],"model":[135],"scales,":[136],"question":[138],"types,":[139],"highlighting":[140],"effectiveness":[142],"for":[147],"long-horizon":[148],"reasoning.":[149]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-05-20T00:00:00"}
