{"id":"https://openalex.org/W7140194307","doi":"https://doi.org/10.48550/arxiv.2603.22078","title":"Do World Action Models Generalize Better than VLAs? A Robustness Study","display_name":"Do World Action Models Generalize Better than VLAs? A Robustness Study","publication_year":2026,"publication_date":"2026-03-23","ids":{"openalex":"https://openalex.org/W7140194307","doi":"https://doi.org/10.48550/arxiv.2603.22078"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.22078","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.22078","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.22078","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Zhang, Zhanguang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Zhanguang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Li, Zhiyuan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Zhiyuan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Rahmati, Behnam","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Rahmati, Behnam","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Yang, Rui Heng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Rui Heng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Ma, Yintao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ma, Yintao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Rasouli, Amir","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Rasouli, Amir","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Pakdamansavoji, Sajjad","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Pakdamansavoji, Sajjad","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Wu, Yangzheng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wu, Yangzheng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Zhang, Lingfeng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Lingfeng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Cao, Tongtong","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cao, Tongtong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Wen, Feng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wen, Feng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Wang, Xinyu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Xinyu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Quan, Xingyue","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Quan, Xingyue","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":null,"display_name":"Zhang, Yingxue","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Yingxue","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":14,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.8432000279426575,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.8432000279426575,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10462","display_name":"Reinforcement Learning in Robotics","score":0.04529999941587448,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10653","display_name":"Robot Manipulation and Learning","score":0.033799998462200165,"subfield":{"id":"https://openalex.org/subfields/2207","display_name":"Control and Systems Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/robustness","display_name":"Robustness (evolution)","score":0.7587000131607056},{"id":"https://openalex.org/keywords/robot","display_name":"Robot","score":0.46160000562667847},{"id":"https://openalex.org/keywords/action","display_name":"Action (physics)","score":0.42239999771118164},{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.3926999866962433},{"id":"https://openalex.org/keywords/generalization","display_name":"Generalization","score":0.3806999921798706},{"id":"https://openalex.org/keywords/prior-probability","display_name":"Prior probability","score":0.3714999854564667},{"id":"https://openalex.org/keywords/reinforcement-learning","display_name":"Reinforcement learning","score":0.3278999924659729}],"concepts":[{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.7587000131607056},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6937999725341797},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6740000247955322},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.6333000063896179},{"id":"https://openalex.org/C90509273","wikidata":"https://www.wikidata.org/wiki/Q11012","display_name":"Robot","level":2,"score":0.46160000562667847},{"id":"https://openalex.org/C2780791683","wikidata":"https://www.wikidata.org/wiki/Q846785","display_name":"Action (physics)","level":2,"score":0.42239999771118164},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.3926999866962433},{"id":"https://openalex.org/C177148314","wikidata":"https://www.wikidata.org/wiki/Q170084","display_name":"Generalization","level":2,"score":0.3806999921798706},{"id":"https://openalex.org/C177769412","wikidata":"https://www.wikidata.org/wiki/Q278090","display_name":"Prior probability","level":3,"score":0.3714999854564667},{"id":"https://openalex.org/C97541855","wikidata":"https://www.wikidata.org/wiki/Q830687","display_name":"Reinforcement learning","level":2,"score":0.3278999924659729},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.32670000195503235},{"id":"https://openalex.org/C34413123","wikidata":"https://www.wikidata.org/wiki/Q170978","display_name":"Robotics","level":3,"score":0.32269999384880066},{"id":"https://openalex.org/C114073186","wikidata":"https://www.wikidata.org/wiki/Q2631895","display_name":"Automated planning and scheduling","level":2,"score":0.3098999857902527},{"id":"https://openalex.org/C136197465","wikidata":"https://www.wikidata.org/wiki/Q1729295","display_name":"Variety (cybernetics)","level":2,"score":0.29919999837875366},{"id":"https://openalex.org/C49937458","wikidata":"https://www.wikidata.org/wiki/Q2599292","display_name":"Probabilistic logic","level":2,"score":0.272599995136261},{"id":"https://openalex.org/C95713431","wikidata":"https://www.wikidata.org/wiki/Q631425","display_name":"Vulnerability (computing)","level":2,"score":0.2700999975204468},{"id":"https://openalex.org/C2987834672","wikidata":"https://www.wikidata.org/wiki/Q4677630","display_name":"Action recognition","level":3,"score":0.26910001039505005},{"id":"https://openalex.org/C17098449","wikidata":"https://www.wikidata.org/wiki/Q176814","display_name":"Partially observable Markov decision process","level":4,"score":0.2630999982357025}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.22078","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.22078","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.22078","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.22078","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Robot":[0],"action":[1,41,44,98],"planning":[2],"in":[3,28],"the":[4,15,19,62,180,252],"real":[5],"world":[6,82,97,104],"is":[7],"challenging":[8],"as":[9,87,96,218],"it":[10,25],"requires":[11],"not":[12],"only":[13],"understanding":[14],"current":[16],"state":[17],"of":[18,53,64,112,166,254],"environment":[20],"but":[21],"also":[22],"predicting":[23],"how":[24,255],"will":[26],"evolve":[27],"response":[29],"to":[30,71,76,90,95,115,152],"actions.":[31,130],"Vision-language-action":[32],"(VLA),":[33],"which":[34],"repurpose":[35],"large-scale":[36],"vision-language":[37],"models":[38,83,99,105],"for":[39],"robot":[40,129],"generation":[42],"using":[43],"experts,":[45],"have":[46,84],"achieved":[47],"notable":[48],"success":[49,204],"across":[50],"a":[51,163],"variety":[52],"robotic":[54,234],"tasks.":[55],"Nevertheless,":[56],"their":[57,65,122,136,177],"performance":[58,178],"remains":[59],"constrained":[60],"by":[61],"scope":[63],"training":[66,231],"data,":[67],"exhibiting":[68],"limited":[69],"generalization":[70],"unseen":[72],"scenarios":[73],"and":[74,171,182,189,209,236],"vulnerability":[75],"diverse":[77,233],"contextual":[78],"perturbations.":[79,191],"More":[80],"recently,":[81],"been":[85,133],"revisited":[86],"an":[88],"alternative":[89],"VLAs.":[91,157],"These":[92],"models,":[93],"referred":[94],"(WAMs),":[100],"are":[101,107,258],"built":[102],"upon":[103],"that":[106,135,195,242],"trained":[108],"on":[109,179,206,213,224],"large":[110],"corpora":[111],"video":[113,148,256],"data":[114],"predict":[116],"future":[117],"states.":[118],"With":[119],"minor":[120],"adaptations,":[121],"latent":[123],"representation":[124],"can":[125,220],"be":[126],"decoded":[127],"into":[128],"It":[131],"has":[132],"suggested":[134],"explicit":[137],"dynamic":[138,246],"prediction":[139],"capacity,":[140],"combined":[141],"with":[142,200,232],"spatiotemporal":[143],"priors":[144,257],"acquired":[145],"from":[146],"web-scale":[147],"pretraining,":[149],"enables":[150],"WAMs":[151,196],"generalize":[153],"more":[154],"effectively":[155],"than":[156],"In":[158],"this":[159],"paper,":[160],"we":[161],"conduct":[162],"comparative":[164],"study":[165],"prominent":[167],"state-of-the-art":[168],"VLA":[169],"policies":[170],"recently":[172],"released":[173],"WAMs.":[174],"We":[175],"evaluate":[176],"LIBERO-Plus":[181],"RoboTwin":[183,207],"2.0-Plus":[184,208],"benchmarks":[185],"under":[186],"various":[187],"visual":[188],"language":[190],"Our":[192],"results":[193],"show":[194],"achieve":[197,221],"strong":[198],"robustness,":[199,250],"LingBot-VA":[201],"reaching":[202],"74.2%":[203],"rate":[205],"Cosmos-Policy":[210],"achieving":[211],"82.2%":[212],"LIBERO-Plus.":[214],"While":[215],"VLAs":[216],"such":[217],"$\u03c0_{0.5}$":[219],"comparable":[222],"robustness":[223],"certain":[225],"tasks,":[226],"they":[227],"typically":[228],"require":[229],"extensive":[230],"datasets":[235],"varied":[237],"learning":[238,247],"objectives.":[239],"Hybrid":[240],"approaches":[241],"partially":[243],"incorporate":[244],"video-based":[245],"exhibit":[248],"intermediate":[249],"highlighting":[251],"importance":[253],"integrated.":[259]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-03-25T00:00:00"}
