{"id":"https://openalex.org/W7133740112","doi":"https://doi.org/10.48550/arxiv.2603.03596","title":"MEM: Multi-Scale Embodied Memory for Vision Language Action Models","display_name":"MEM: Multi-Scale Embodied Memory for Vision Language Action Models","publication_year":2026,"publication_date":"2026-03-04","ids":{"openalex":"https://openalex.org/W7133740112","doi":"https://doi.org/10.48550/arxiv.2603.03596"},"language":null,"primary_location":{"id":"pmh:doi:10.48550/arxiv.2603.03596","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":null,"any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5128206076","display_name":"Marcel Torne","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Torne, Marcel","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128147851","display_name":"Karl Pertsch","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Pertsch, Karl","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5064208112","display_name":"Homer Walke","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Walke, Homer","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5040726532","display_name":"Kyle Vedder","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Vedder, Kyle","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128154724","display_name":"Suraj Nair","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Nair, Suraj","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5018507768","display_name":"Brian Ichter","orcid":"https://orcid.org/0000-0002-6955-6432"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ichter, Brian","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5020154600","display_name":"Allen Z. Ren","orcid":"https://orcid.org/0000-0001-5306-2844"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ren, Allen Z.","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5115602056","display_name":"Haohuan Wang","orcid":"https://orcid.org/0000-0002-4842-5388"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Haohuan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103244882","display_name":"Jiaming Tang","orcid":"https://orcid.org/0009-0004-4186-6561"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tang, Jiaming","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5063799945","display_name":"Kyle Stachowicz","orcid":"https://orcid.org/0000-0002-9880-7261"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Stachowicz, Kyle","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5119785622","display_name":"Karan Dhabalia","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Dhabalia, Karan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128187855","display_name":"Michael Equi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Equi, Michael","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5127794092","display_name":"Quan Vuong","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Vuong, Quan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5017985443","display_name":"Jost Tobias Springenberg","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Springenberg, Jost Tobias","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128151577","display_name":"Sergey Levine","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Levine, Sergey","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128206656","display_name":"Chelsea Finn","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Finn, Chelsea","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5080239685","display_name":"Danny Driess","orcid":"https://orcid.org/0000-0002-8258-1659"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Driess, Danny","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":17,"corresponding_author_ids":["https://openalex.org/A5128206076"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.7807999849319458,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.7807999849319458,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10462","display_name":"Reinforcement Learning in Robotics","score":0.0689999982714653,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10653","display_name":"Robot Manipulation and Learning","score":0.03620000183582306,"subfield":{"id":"https://openalex.org/subfields/2207","display_name":"Control and Systems Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/robot","display_name":"Robot","score":0.5814999938011169},{"id":"https://openalex.org/keywords/object","display_name":"Object (grammar)","score":0.49779999256134033},{"id":"https://openalex.org/keywords/embodied-cognition","display_name":"Embodied cognition","score":0.4893999993801117},{"id":"https://openalex.org/keywords/semantic-memory","display_name":"Semantic memory","score":0.4648999869823456},{"id":"https://openalex.org/keywords/action","display_name":"Action (physics)","score":0.4255000054836273},{"id":"https://openalex.org/keywords/modality","display_name":"Modality (human\u2013computer interaction)","score":0.424699991941452},{"id":"https://openalex.org/keywords/visual-short-term-memory","display_name":"Visual short-term memory","score":0.35670000314712524},{"id":"https://openalex.org/keywords/memory-map","display_name":"Memory map","score":0.3393000066280365},{"id":"https://openalex.org/keywords/episodic-memory","display_name":"Episodic memory","score":0.33640000224113464}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6953999996185303},{"id":"https://openalex.org/C90509273","wikidata":"https://www.wikidata.org/wiki/Q11012","display_name":"Robot","level":2,"score":0.5814999938011169},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5169000029563904},{"id":"https://openalex.org/C2781238097","wikidata":"https://www.wikidata.org/wiki/Q175026","display_name":"Object (grammar)","level":2,"score":0.49779999256134033},{"id":"https://openalex.org/C100609095","wikidata":"https://www.wikidata.org/wiki/Q1335050","display_name":"Embodied cognition","level":2,"score":0.4893999993801117},{"id":"https://openalex.org/C197914299","wikidata":"https://www.wikidata.org/wiki/Q18650","display_name":"Semantic memory","level":3,"score":0.4648999869823456},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.4456999897956848},{"id":"https://openalex.org/C2780791683","wikidata":"https://www.wikidata.org/wiki/Q846785","display_name":"Action (physics)","level":2,"score":0.4255000054836273},{"id":"https://openalex.org/C2780226545","wikidata":"https://www.wikidata.org/wiki/Q6888030","display_name":"Modality (human\u2013computer interaction)","level":2,"score":0.424699991941452},{"id":"https://openalex.org/C188147891","wikidata":"https://www.wikidata.org/wiki/Q147638","display_name":"Cognitive science","level":1,"score":0.3783999979496002},{"id":"https://openalex.org/C58632812","wikidata":"https://www.wikidata.org/wiki/Q7936625","display_name":"Visual short-term memory","level":4,"score":0.35670000314712524},{"id":"https://openalex.org/C74426580","wikidata":"https://www.wikidata.org/wiki/Q719484","display_name":"Memory map","level":3,"score":0.3393000066280365},{"id":"https://openalex.org/C88576662","wikidata":"https://www.wikidata.org/wiki/Q18646","display_name":"Episodic memory","level":3,"score":0.33640000224113464},{"id":"https://openalex.org/C176649486","wikidata":"https://www.wikidata.org/wiki/Q2308807","display_name":"Memory management","level":3,"score":0.33059999346733093},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.3262999951839447},{"id":"https://openalex.org/C150415221","wikidata":"https://www.wikidata.org/wiki/Q40687","display_name":"Robotic arm","level":2,"score":0.32339999079704285},{"id":"https://openalex.org/C34413123","wikidata":"https://www.wikidata.org/wiki/Q170978","display_name":"Robotics","level":3,"score":0.31459999084472656},{"id":"https://openalex.org/C123657996","wikidata":"https://www.wikidata.org/wiki/Q12271","display_name":"Architecture","level":2,"score":0.3111000061035156},{"id":"https://openalex.org/C2778112365","wikidata":"https://www.wikidata.org/wiki/Q3511065","display_name":"Sequence (biology)","level":2,"score":0.3061999976634979},{"id":"https://openalex.org/C192327766","wikidata":"https://www.wikidata.org/wiki/Q1038799","display_name":"Cognitive robotics","level":3,"score":0.3043999969959259},{"id":"https://openalex.org/C2775924081","wikidata":"https://www.wikidata.org/wiki/Q55608371","display_name":"Control (management)","level":2,"score":0.29760000109672546},{"id":"https://openalex.org/C2779903281","wikidata":"https://www.wikidata.org/wiki/Q6888026","display_name":"Modalities","level":2,"score":0.29510000348091125},{"id":"https://openalex.org/C82687282","wikidata":"https://www.wikidata.org/wiki/Q66221","display_name":"Auxiliary memory","level":2,"score":0.28450000286102295},{"id":"https://openalex.org/C178278151","wikidata":"https://www.wikidata.org/wiki/Q7936607","display_name":"Visual memory","level":3,"score":0.27889999747276306},{"id":"https://openalex.org/C197792726","wikidata":"https://www.wikidata.org/wiki/Q191062","display_name":"Mnemonic","level":2,"score":0.2653999924659729},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.26260000467300415},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.2513999938964844}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:doi:10.48550/arxiv.2603.03596","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},{"id":"doi:10.48550/arxiv.2603.03596","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.03596","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:doi:10.48550/arxiv.2603.03596","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Conventionally,":[0],"memory":[1,25,37,60,95,123,170],"in":[2,18,124],"end-to-end":[3],"robotic":[4,99],"learning":[5],"involves":[6],"inputting":[7],"a":[8,44,70,134,158,162],"sequence":[9],"of":[10,33,52,110],"past":[11,28],"observations":[12],"into":[13],"the":[14,23,53,73],"learned":[15],"policy.":[16],"However,":[17],"complex":[19],"multi-stage":[20],"real-world":[21],"tasks,":[22],"robot's":[24],"must":[26],"represent":[27],"events":[29,64],"at":[30],"multiple":[31,103],"levels":[32,109],"granularity:":[34],"from":[35],"long-term":[36],"that":[38,61,92,149,169],"captures":[39,62],"abstracted":[40],"semantic":[41],"concepts":[42],"(e.g.,":[43,69],"robot":[45,71,125,144],"cooking":[46],"dinner":[47],"should":[48,101],"remember":[49],"which":[50],"stages":[51],"recipe":[54],"are":[55],"already":[56],"done)":[57],"to":[58,77,105,146,152,174],"short-term":[59],"recent":[63],"and":[65],"compensates":[66],"for":[67,97,120],"occlusions":[68],"remembering":[72],"object":[74],"it":[75],"wants":[76],"pick":[78],"up":[79,151,157],"once":[80],"its":[81],"arm":[82],"occludes":[83],"it).":[84],"In":[85],"this":[86],"work,":[87],"our":[88],"main":[89],"insight":[90],"is":[91],"an":[93,118],"effective":[94],"architecture":[96],"long-horizon":[98,122,139],"control":[100],"combine":[102],"modalities":[104],"capture":[106],"these":[107],"different":[108],"abstraction.":[111],"We":[112],"introduce":[113],"Multi-Scale":[114],"Embodied":[115],"Memory":[116],"(MEM),":[117],"approach":[119],"mixed-modal":[121],"policies.":[126],"MEM":[127,172],"combines":[128],"video-based":[129],"short-horizon":[130],"memory,":[131],"compressed":[132],"via":[133],"video":[135],"encoder,":[136],"with":[137],"text-based":[138],"memory.":[140],"Together,":[141],"they":[142],"enable":[143],"policies":[145,173],"perform":[147],"tasks":[148],"span":[150],"fifteen":[153],"minutes,":[154],"like":[155],"cleaning":[156],"kitchen,":[159],"or":[160],"preparing":[161],"grilled":[163],"cheese":[164],"sandwich.":[165],"Additionally,":[166],"we":[167],"find":[168],"enables":[171],"intelligently":[175],"adapt":[176],"manipulation":[177],"strategies":[178],"in-context.":[179]},"counts_by_year":[],"updated_date":"2026-05-04T08:30:34.212998","created_date":"2026-03-06T00:00:00"}
