{"id":"https://openalex.org/W7162767791","doi":"https://doi.org/10.48550/arxiv.2605.29577","title":"Mitigating State Aliasing in Vision-Language-Action Models via Inverse Dynamics Learning","display_name":"Mitigating State Aliasing in Vision-Language-Action Models via Inverse Dynamics Learning","publication_year":2026,"publication_date":"2026-05-28","ids":{"openalex":"https://openalex.org/W7162767791","doi":"https://doi.org/10.48550/arxiv.2605.29577"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.29577","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.29577","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.29577","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5137372445","display_name":"Kyujin Lee","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lee, Kyujin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5091592225","display_name":"Injae Kim","orcid":"https://orcid.org/0000-0002-7483-8469"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kim, Injae","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137394675","display_name":"Jihwan Park","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Park, Jihwan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137399270","display_name":"Yejun Ju","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ju, Yejun","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5104160500","display_name":"Minseok Joo","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Joo, Minseok","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5137394968","display_name":"Hyunwoo J. Kim","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kim, Hyunwoo J.","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":6,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.8736000061035156,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.8736000061035156,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10653","display_name":"Robot Manipulation and Learning","score":0.05050000175833702,"subfield":{"id":"https://openalex.org/subfields/2207","display_name":"Control and Systems Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10709","display_name":"Social Robot Interaction and HRI","score":0.013000000268220901,"subfield":{"id":"https://openalex.org/subfields/3207","display_name":"Social Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/aliasing","display_name":"Aliasing","score":0.7810999751091003},{"id":"https://openalex.org/keywords/encoder","display_name":"Encoder","score":0.6184999942779541},{"id":"https://openalex.org/keywords/generalization","display_name":"Generalization","score":0.5827000141143799},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.5735999941825867},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.5102999806404114},{"id":"https://openalex.org/keywords/action","display_name":"Action (physics)","score":0.5084999799728394},{"id":"https://openalex.org/keywords/robot","display_name":"Robot","score":0.4916999936103821},{"id":"https://openalex.org/keywords/monocular","display_name":"Monocular","score":0.4438999891281128}],"concepts":[{"id":"https://openalex.org/C4069607","wikidata":"https://www.wikidata.org/wiki/Q868732","display_name":"Aliasing","level":3,"score":0.7810999751091003},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6886000037193298},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6765000224113464},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.6184999942779541},{"id":"https://openalex.org/C177148314","wikidata":"https://www.wikidata.org/wiki/Q170084","display_name":"Generalization","level":2,"score":0.5827000141143799},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.5735999941825867},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.5102999806404114},{"id":"https://openalex.org/C2780791683","wikidata":"https://www.wikidata.org/wiki/Q846785","display_name":"Action (physics)","level":2,"score":0.5084999799728394},{"id":"https://openalex.org/C90509273","wikidata":"https://www.wikidata.org/wiki/Q11012","display_name":"Robot","level":2,"score":0.4916999936103821},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.4571000039577484},{"id":"https://openalex.org/C65909025","wikidata":"https://www.wikidata.org/wiki/Q1945033","display_name":"Monocular","level":2,"score":0.4438999891281128},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.4377000033855438},{"id":"https://openalex.org/C48103436","wikidata":"https://www.wikidata.org/wiki/Q599031","display_name":"State (computer science)","level":2,"score":0.4083999991416931},{"id":"https://openalex.org/C12713177","wikidata":"https://www.wikidata.org/wiki/Q1900281","display_name":"Perspective (graphical)","level":2,"score":0.3815000057220459},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.37540000677108765},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.34040001034736633},{"id":"https://openalex.org/C204323151","wikidata":"https://www.wikidata.org/wiki/Q905424","display_name":"Range (aeronautics)","level":2,"score":0.33709999918937683},{"id":"https://openalex.org/C2777508537","wikidata":"https://www.wikidata.org/wiki/Q7936620","display_name":"Visual reasoning","level":2,"score":0.31450000405311584},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.29179999232292175},{"id":"https://openalex.org/C34413123","wikidata":"https://www.wikidata.org/wiki/Q170978","display_name":"Robotics","level":3,"score":0.2745000123977661},{"id":"https://openalex.org/C2775924081","wikidata":"https://www.wikidata.org/wiki/Q55608371","display_name":"Control (management)","level":2,"score":0.27300000190734863},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.26899999380111694},{"id":"https://openalex.org/C178253425","wikidata":"https://www.wikidata.org/wiki/Q162668","display_name":"Visual perception","level":3,"score":0.2508000135421753},{"id":"https://openalex.org/C126780896","wikidata":"https://www.wikidata.org/wiki/Q899871","display_name":"Distortion (music)","level":4,"score":0.2502000033855438}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.29577","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.29577","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.29577","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.29577","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"display_name":"Reduced inequalities","score":0.6942394375801086,"id":"https://metadata.un.org/sdg/10"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Vision-Language-Action":[0],"(VLA)":[1],"models":[2,22],"have":[3],"emerged":[4],"as":[5,66,114],"a":[6,158],"promising":[7],"framework":[8],"that":[9,48,118,144,216,223],"unifies":[10],"perception,":[11],"reasoning,":[12],"and":[13,92,131,164,186,199,210,227],"control":[14],"for":[15,38],"robot":[16,169,231],"manipulation":[17],"by":[18,59],"adapting":[19],"pretrained":[20],"vision-language":[21],"(VLMs)":[23],"to":[24,33,139,153,157,174],"action":[25,128,162],"prediction.":[26],"However,":[27],"VLM-derived":[28],"representations":[29,222],"are":[30],"often":[31],"insensitive":[32],"subtle":[34],"visual":[35,57,61,102,142,221],"distinctions":[36,143],"required":[37],"low-level":[39,146],"control,":[40],"causing":[41],"state":[42,97,107,225,232],"aliasing":[43,98,226],"between":[44,129],"visually":[45],"similar":[46],"states":[47],"require":[49],"substantially":[50],"different":[51],"actions.":[52,147],"Prior":[53],"VLA":[54,122,176,206],"studies":[55],"improve":[56,165],"understanding":[58],"generating":[60],"or":[62,72,74],"reasoning":[63,77],"outputs,":[64],"such":[65],"future":[67,132],"frames,":[68],"2D":[69],"grounding":[70],"points":[71],"traces,":[73],"intermediate":[75],"spatial":[76],"steps,":[78],"but":[79],"these":[80],"objectives":[81],"typically":[82],"shape":[83],"the":[84,100,121,127,137,155,188],"vision":[85,123],"encoder":[86,138,156],"only":[87,179],"indirectly":[88],"through":[89],"end-to-end":[90],"prediction":[91],"do":[93],"not":[94],"explicitly":[95],"analyze":[96],"in":[99],"learned":[101],"feature":[103],"space.":[104],"To":[105],"mitigate":[106],"aliasing,":[108],"we":[109],"introduce":[110],"inverse":[111],"dynamics":[112],"learning":[113],"an":[115],"auxiliary":[116],"objective":[117,135],"directly":[119],"supervises":[120],"encoder.":[124],"By":[125],"predicting":[126],"current":[130],"observations,":[133],"our":[134,217],"encourages":[136],"capture":[140],"fine-grained":[141],"determine":[145],"We":[148],"further":[149,214],"use":[150],"pseudo-reversed":[151],"supervision":[152],"expose":[154],"broader":[159],"range":[160],"of":[161],"directions":[163],"generalization":[166],"under":[167],"limited":[168],"demonstrations.":[170],"Our":[171],"method":[172,218],"applies":[173],"diverse":[175,205],"baselines,":[177],"uses":[178],"standard":[180],"observation-action":[181],"pairs":[182],"without":[183],"additional":[184],"annotations,":[185],"preserves":[187],"original":[189],"inference":[190],"pipeline":[191],"at":[192],"test":[193],"time.":[194],"Experiments":[195],"on":[196],"CALVIN":[197],"ABC-D":[198],"SimplerEnv":[200],"show":[201,215],"consistent":[202],"gains":[203],"across":[204],"baselines.":[207],"Frozen-encoder":[208],"probing":[209],"state-feature":[211],"alignment":[212],"analyses":[213],"learns":[219],"state-discriminative":[220],"reduce":[224],"better":[228],"align":[229],"with":[230],"changes.":[233]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-05-30T00:00:00"}
