{"id":"https://openalex.org/W7155391937","doi":"https://doi.org/10.48550/arxiv.2604.20012","title":"EmbodiedMidtrain: Bridging the Gap between Vision-Language Models and Vision-Language-Action Models via Mid-training","display_name":"EmbodiedMidtrain: Bridging the Gap between Vision-Language Models and Vision-Language-Action Models via Mid-training","publication_year":2026,"publication_date":"2026-04-21","ids":{"openalex":"https://openalex.org/W7155391937","doi":"https://doi.org/10.48550/arxiv.2604.20012"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.20012","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.20012","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.20012","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5134399404","display_name":"Yiyang Du","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Du, Yiyang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103029425","display_name":"Zhaohui Guo","orcid":"https://orcid.org/0000-0003-1193-5816"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Guo, Zhanqiu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134412261","display_name":"Xin Ye","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ye, Xin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134462263","display_name":"Liu Ren","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ren, Liu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5134376831","display_name":"Chenyan Xiong","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xiong, Chenyan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5134399404"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9837999939918518,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9837999939918518,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.0020000000949949026,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12090","display_name":"Language and cultural evolution","score":0.0010999999940395355,"subfield":{"id":"https://openalex.org/subfields/3316","display_name":"Cultural Studies"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/bridging","display_name":"Bridging (networking)","score":0.7598999738693237},{"id":"https://openalex.org/keywords/initialization","display_name":"Initialization","score":0.5310999751091003},{"id":"https://openalex.org/keywords/limiting","display_name":"Limiting","score":0.5054000020027161},{"id":"https://openalex.org/keywords/robot","display_name":"Robot","score":0.4449999928474426},{"id":"https://openalex.org/keywords/scale","display_name":"Scale (ratio)","score":0.43140000104904175},{"id":"https://openalex.org/keywords/data-driven","display_name":"Data-driven","score":0.42890000343322754},{"id":"https://openalex.org/keywords/bridge","display_name":"Bridge (graph theory)","score":0.42820000648498535},{"id":"https://openalex.org/keywords/downstream","display_name":"Downstream (manufacturing)","score":0.40779998898506165},{"id":"https://openalex.org/keywords/training-set","display_name":"Training set","score":0.353300005197525}],"concepts":[{"id":"https://openalex.org/C174348530","wikidata":"https://www.wikidata.org/wiki/Q188635","display_name":"Bridging (networking)","level":2,"score":0.7598999738693237},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5605999827384949},{"id":"https://openalex.org/C114466953","wikidata":"https://www.wikidata.org/wiki/Q6034165","display_name":"Initialization","level":2,"score":0.5310999751091003},{"id":"https://openalex.org/C188198153","wikidata":"https://www.wikidata.org/wiki/Q1613840","display_name":"Limiting","level":2,"score":0.5054000020027161},{"id":"https://openalex.org/C90509273","wikidata":"https://www.wikidata.org/wiki/Q11012","display_name":"Robot","level":2,"score":0.4449999928474426},{"id":"https://openalex.org/C2778755073","wikidata":"https://www.wikidata.org/wiki/Q10858537","display_name":"Scale (ratio)","level":2,"score":0.43140000104904175},{"id":"https://openalex.org/C2780440489","wikidata":"https://www.wikidata.org/wiki/Q5227278","display_name":"Data-driven","level":2,"score":0.42890000343322754},{"id":"https://openalex.org/C100776233","wikidata":"https://www.wikidata.org/wiki/Q2532492","display_name":"Bridge (graph theory)","level":2,"score":0.42820000648498535},{"id":"https://openalex.org/C2776207758","wikidata":"https://www.wikidata.org/wiki/Q5303302","display_name":"Downstream (manufacturing)","level":2,"score":0.40779998898506165},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.353300005197525},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.34540000557899475},{"id":"https://openalex.org/C67186912","wikidata":"https://www.wikidata.org/wiki/Q367664","display_name":"Data modeling","level":2,"score":0.3424000144004822},{"id":"https://openalex.org/C2908647359","wikidata":"https://www.wikidata.org/wiki/Q2625603","display_name":"Population","level":2,"score":0.3294000029563904},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.3149999976158142},{"id":"https://openalex.org/C207390915","wikidata":"https://www.wikidata.org/wiki/Q1230525","display_name":"Divergence (linguistics)","level":2,"score":0.313400000333786},{"id":"https://openalex.org/C114289077","wikidata":"https://www.wikidata.org/wiki/Q3284399","display_name":"Statistical model","level":2,"score":0.3021000027656555},{"id":"https://openalex.org/C112972136","wikidata":"https://www.wikidata.org/wiki/Q7595718","display_name":"Stability (learning theory)","level":2,"score":0.2955999970436096},{"id":"https://openalex.org/C89600930","wikidata":"https://www.wikidata.org/wiki/Q1423946","display_name":"Segmentation","level":2,"score":0.2892000079154968},{"id":"https://openalex.org/C43214815","wikidata":"https://www.wikidata.org/wiki/Q7310987","display_name":"Reliability (semiconductor)","level":3,"score":0.28700000047683716},{"id":"https://openalex.org/C160920958","wikidata":"https://www.wikidata.org/wiki/Q7662746","display_name":"Synthetic data","level":2,"score":0.2858999967575073},{"id":"https://openalex.org/C153083717","wikidata":"https://www.wikidata.org/wiki/Q6535263","display_name":"Leverage (statistics)","level":2,"score":0.28279998898506165},{"id":"https://openalex.org/C185429906","wikidata":"https://www.wikidata.org/wiki/Q1130160","display_name":"Estimator","level":2,"score":0.27799999713897705},{"id":"https://openalex.org/C32230216","wikidata":"https://www.wikidata.org/wiki/Q7882499","display_name":"Uncertainty quantification","level":2,"score":0.27140000462532043},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.26969999074935913},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.26969999074935913},{"id":"https://openalex.org/C191172861","wikidata":"https://www.wikidata.org/wiki/Q7899321","display_name":"Upstream (networking)","level":2,"score":0.26840001344680786},{"id":"https://openalex.org/C2777211547","wikidata":"https://www.wikidata.org/wiki/Q17141490","display_name":"Training (meteorology)","level":2,"score":0.26759999990463257},{"id":"https://openalex.org/C136643341","wikidata":"https://www.wikidata.org/wiki/Q1361526","display_name":"Reachability","level":2,"score":0.25699999928474426}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.20012","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.20012","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.20012","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.20012","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"score":0.5773970484733582,"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Vision-Language-Action":[0],"Models":[1,11],"(VLAs)":[2],"inherit":[3],"their":[4,30],"visual":[5],"and":[6,45,81,111,145,153,175,186,210],"linguistic":[7],"capabilities":[8],"from":[9,18,67,106,171],"Vision-Language":[10],"(VLMs),":[12],"yet":[13],"most":[14,103],"VLAs":[15,144],"are":[16,22,64],"built":[17],"off-the-shelf":[19,146],"VLMs":[20,44,147],"that":[21,57,63,93,130,159],"not":[23],"adapted":[24],"to":[25,39,100],"the":[26,41,50,68,73,102,113,172,180,198,201],"embodied":[27],"domain,":[28],"limiting":[29],"downstream":[31,120],"performance.":[32],"In":[33],"this":[34,116],"work,":[35],"we":[36,87],"propose":[37],"EmbodiedMidtrain":[38],"bridge":[40],"gap":[42,53],"between":[43,54],"VLAs.":[46],"We":[47,204],"first":[48],"characterize":[49],"data":[51,59,84,91,181,209],"distribution":[52],"them,":[55],"showing":[56],"VLA":[58,121,166],"occupy":[60],"compact":[61],"regions":[62],"largely":[65],"separated":[66],"broader":[69],"VLM":[70,83,109,114,137,202],"distribution,":[71],"while":[72,196],"degree":[74],"of":[75,200],"alignment":[76,188],"varies":[77],"substantially":[78],"both":[79,184],"across":[80,135],"within":[82],"sources.":[85],"Then,":[86],"build":[88],"a":[89,95,107,162],"mid-training":[90,131,160],"engine":[92,182],"leverages":[94],"lightweight":[96],"learnable":[97],"proximity":[98],"estimator":[99],"select":[101],"VLA-aligned":[104],"candidates":[105],"large":[108],"pool,":[110],"mid-trains":[112],"on":[115,124],"curated":[117],"mixture":[118],"before":[119],"fine-tuning.":[122],"Experiments":[123],"three":[125],"robot":[126],"manipulation":[127],"benchmarks":[128],"show":[129],"consistently":[132],"improves":[133],"performance":[134],"different":[136],"backbones,":[138],"achieving":[139],"results":[140],"competitive":[141],"with":[142,149,168],"expert":[143],"trained":[148],"larger":[150],"model":[151],"scale":[152],"training":[154],"budgets.":[155],"Further":[156],"analysis":[157],"reveals":[158],"provides":[161],"stronger":[163],"initialization":[164],"for":[165,212],"fine-tuning,":[167],"gains":[169],"emerging":[170],"earliest":[173],"steps":[174],"widening":[176],"throughout":[177],"training.":[178],"Moreover,":[179],"captures":[183],"dataset-level":[185],"sample-level":[187],"signals,":[189],"favoring":[190],"spatial":[191],"reasoning":[192],"over":[193],"text-centric":[194],"tasks":[195],"preserving":[197],"diversity":[199],"data.":[203],"will":[205],"release":[206],"all":[207],"code,":[208],"models":[211],"future":[213],"research.":[214]},"counts_by_year":[],"updated_date":"2026-04-24T06:07:52.864757","created_date":"2026-04-24T00:00:00"}
