{"id":"https://openalex.org/W7161738154","doi":"https://doi.org/10.48550/arxiv.2605.16481","title":"Visual Agentic Memory: Enabling Online Long Video Understanding via Online Indexing, Hierarchical Memory, and Agentic Retrieval","display_name":"Visual Agentic Memory: Enabling Online Long Video Understanding via Online Indexing, Hierarchical Memory, and Agentic Retrieval","publication_year":2026,"publication_date":"2026-05-15","ids":{"openalex":"https://openalex.org/W7161738154","doi":"https://doi.org/10.48550/arxiv.2605.16481"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.16481","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.16481","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.16481","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5013745132","display_name":"A Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Aiden Yiliu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5064892111","display_name":"Nels Numan","orcid":"https://orcid.org/0000-0003-2931-7653"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Numan, Nels","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5071447827","display_name":"Anthony Steed","orcid":"https://orcid.org/0000-0001-9034-3020"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Steed, Anthony","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":3,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.7682999968528748,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.7682999968528748,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11605","display_name":"Visual Attention and Saliency Detection","score":0.020099999383091927,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.01600000075995922,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/context","display_name":"Context (archaeology)","score":0.6220999956130981},{"id":"https://openalex.org/keywords/search-engine-indexing","display_name":"Search engine indexing","score":0.6033999919891357},{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.4408999979496002},{"id":"https://openalex.org/keywords/encoding","display_name":"Encoding (memory)","score":0.43299999833106995},{"id":"https://openalex.org/keywords/code","display_name":"Code (set theory)","score":0.42410001158714294},{"id":"https://openalex.org/keywords/state","display_name":"State (computer science)","score":0.374099999666214},{"id":"https://openalex.org/keywords/visualization","display_name":"Visualization","score":0.35339999198913574},{"id":"https://openalex.org/keywords/video-retrieval","display_name":"Video retrieval","score":0.3452000021934509}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7318000197410583},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.6220999956130981},{"id":"https://openalex.org/C75165309","wikidata":"https://www.wikidata.org/wiki/Q2258979","display_name":"Search engine indexing","level":2,"score":0.6033999919891357},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.4408999979496002},{"id":"https://openalex.org/C125411270","wikidata":"https://www.wikidata.org/wiki/Q18653","display_name":"Encoding (memory)","level":2,"score":0.43299999833106995},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.42410001158714294},{"id":"https://openalex.org/C180747234","wikidata":"https://www.wikidata.org/wiki/Q23373","display_name":"Cognitive psychology","level":1,"score":0.4034999907016754},{"id":"https://openalex.org/C49774154","wikidata":"https://www.wikidata.org/wiki/Q131765","display_name":"Multimedia","level":1,"score":0.3939000070095062},{"id":"https://openalex.org/C48103436","wikidata":"https://www.wikidata.org/wiki/Q599031","display_name":"State (computer science)","level":2,"score":0.374099999666214},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.36070001125335693},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.35339999198913574},{"id":"https://openalex.org/C2983174267","wikidata":"https://www.wikidata.org/wiki/Q3775098","display_name":"Video retrieval","level":2,"score":0.3452000021934509},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.34380000829696655},{"id":"https://openalex.org/C89611455","wikidata":"https://www.wikidata.org/wiki/Q6804646","display_name":"Mechanism (biology)","level":2,"score":0.32670000195503235},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.30640000104904175},{"id":"https://openalex.org/C58632812","wikidata":"https://www.wikidata.org/wiki/Q7936625","display_name":"Visual short-term memory","level":4,"score":0.30480000376701355},{"id":"https://openalex.org/C88576662","wikidata":"https://www.wikidata.org/wiki/Q18646","display_name":"Episodic memory","level":3,"score":0.3034000098705292},{"id":"https://openalex.org/C24590219","wikidata":"https://www.wikidata.org/wiki/Q18601","display_name":"Long-term memory","level":3,"score":0.29159998893737793},{"id":"https://openalex.org/C2985957978","wikidata":"https://www.wikidata.org/wiki/Q492","display_name":"Human memory","level":3,"score":0.2892000079154968},{"id":"https://openalex.org/C64754055","wikidata":"https://www.wikidata.org/wiki/Q7574053","display_name":"Spatial contextual awareness","level":2,"score":0.2890999913215637},{"id":"https://openalex.org/C136764020","wikidata":"https://www.wikidata.org/wiki/Q466","display_name":"World Wide Web","level":1,"score":0.2761000096797943},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.2745000123977661},{"id":"https://openalex.org/C2988167200","wikidata":"https://www.wikidata.org/wiki/Q16885149","display_name":"Online video","level":2,"score":0.2718000113964081},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.263700008392334},{"id":"https://openalex.org/C78548338","wikidata":"https://www.wikidata.org/wiki/Q2493","display_name":"Data compression","level":2,"score":0.2612999975681305},{"id":"https://openalex.org/C105842133","wikidata":"https://www.wikidata.org/wiki/Q1899679","display_name":"Visual communication","level":2,"score":0.26010000705718994},{"id":"https://openalex.org/C178278151","wikidata":"https://www.wikidata.org/wiki/Q7936607","display_name":"Visual memory","level":3,"score":0.25999999046325684},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.25949999690055847},{"id":"https://openalex.org/C188147891","wikidata":"https://www.wikidata.org/wiki/Q147638","display_name":"Cognitive science","level":1,"score":0.25440001487731934},{"id":"https://openalex.org/C178253425","wikidata":"https://www.wikidata.org/wiki/Q162668","display_name":"Visual perception","level":3,"score":0.2526000142097473}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.16481","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.16481","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.16481","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.16481","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Long":[0],"video":[1,144],"understanding":[2,145],"requires":[3],"more":[4],"than":[5,36],"large":[6],"context":[7,74],"windows.":[8],"It":[9],"also":[10],"needs":[11],"a":[12,47,68,88],"memory":[13,150],"mechanism":[14],"that":[15,71,142],"decides":[16],"what":[17],"visual":[18,149],"evidence":[19,57,66,85],"to":[20,134],"retain,":[21],"keeps":[22],"it":[23],"searchable":[24],"over":[25,105,126],"long":[26],"horizons,":[27],"and":[28,82,155],"grounds":[29],"later":[30],"reasoning":[31],"in":[32,67],"recoverable":[33],"observations":[34],"rather":[35],"compressed":[37],"latent":[38],"state":[39],"alone.":[40],"We":[41],"propose":[42],"Visual":[43],"Agentic":[44,78],"Memory":[45,63],"(VAM),":[46],"training-free":[48],"framework":[49],"with":[50,75,136],"three":[51],"components.":[52],"Online":[53],"Indexing":[54],"supports":[55],"selective":[56],"retention":[58],"under":[59],"streaming":[60],"constraints.":[61],"Hierarchical":[62],"organises":[64],"retained":[65],"Parallel":[69],"Representation":[70],"aligns":[72],"temporal":[73],"spatial":[76],"observations.":[77],"Retrieval":[79],"searches,":[80],"inspects,":[81],"verifies":[83],"candidate":[84],"before":[86],"producing":[87],"grounded":[89],"answer.":[90],"On":[91,117],"OVO-Bench,":[92],"VAM":[93,129],"achieves":[94],"the":[95,109,118],"highest":[96],"RT+BT":[97],"average":[98],"(68.41)":[99],"across":[100],"all":[101],"reported":[102],"baselines,":[103],"improving":[104],"end-to-end":[106],"use":[107],"of":[108,121],"same":[110],"underlying":[111],"MLLM":[112],"(Gemini":[113],"3":[114],"Flash,":[115],"67.46).":[116],"month-scale":[119],"split":[120],"MM-Lifelong":[122],"train@month":[123],"(105.6":[124],"hours":[125],"51":[127],"days),":[128],"reaches":[130],"17.11%,":[131],"second":[132],"only":[133],"ReMA":[135],"GPT-5":[137],"(17.62%).":[138],"These":[139],"results":[140],"suggest":[141],"long-horizon":[143],"benefits":[146],"from":[147],"treating":[148],"as":[151],"an":[152],"explicit,":[153],"inspectable,":[154],"queryable":[156],"substrate.":[157],"Code":[158],"is":[159],"available":[160],"at":[161],"https://github.com/yiliu-li/Visual-Agentic-Memory.":[162]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-05-20T00:00:00"}
