{"id":"https://openalex.org/W7140208460","doi":"https://doi.org/10.48550/arxiv.2603.22280","title":"DualCoT-VLA: Visual-Linguistic Chain of Thought via Parallel Reasoning for Vision-Language-Action Models","display_name":"DualCoT-VLA: Visual-Linguistic Chain of Thought via Parallel Reasoning for Vision-Language-Action Models","publication_year":2026,"publication_date":"2026-03-23","ids":{"openalex":"https://openalex.org/W7140208460","doi":"https://doi.org/10.48550/arxiv.2603.22280"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.22280","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.22280","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.22280","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Zhong, Zhide","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Zhong, Zhide","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Li, Junfeng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Junfeng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"He, Junjie","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"He, Junjie","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Yan, Haodong","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yan, Haodong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Gong, Xin","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gong, Xin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Zhao, Guanyi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhao, Guanyi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Cai, Yingjie","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cai, Yingjie","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Gao, Jiantao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gao, Jiantao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Yan, Xu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yan, Xu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Liu, Bingbing","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Bingbing","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Chen, Yingcong","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Yingcong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Yang, Liuqing","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Liuqing","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":null,"display_name":"Li, Haoang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Haoang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":13,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9945999979972839,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9945999979972839,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10709","display_name":"Social Robot Interaction and HRI","score":0.0005000000237487257,"subfield":{"id":"https://openalex.org/subfields/3207","display_name":"Social Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.00039999998989515007,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.5688999891281128},{"id":"https://openalex.org/keywords/visual-reasoning","display_name":"Visual reasoning","score":0.477400004863739},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.460999995470047},{"id":"https://openalex.org/keywords/latency","display_name":"Latency (audio)","score":0.43459999561309814},{"id":"https://openalex.org/keywords/autoregressive-model","display_name":"Autoregressive model","score":0.4323999881744385},{"id":"https://openalex.org/keywords/reasoning-system","display_name":"Reasoning system","score":0.4251999855041504},{"id":"https://openalex.org/keywords/opportunistic-reasoning","display_name":"Opportunistic reasoning","score":0.39430001378059387},{"id":"https://openalex.org/keywords/automated-reasoning","display_name":"Automated reasoning","score":0.38960000872612},{"id":"https://openalex.org/keywords/logical-conjunction","display_name":"Logical conjunction","score":0.3538999855518341}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7875000238418579},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5719000101089478},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.5688999891281128},{"id":"https://openalex.org/C2777508537","wikidata":"https://www.wikidata.org/wiki/Q7936620","display_name":"Visual reasoning","level":2,"score":0.477400004863739},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.460999995470047},{"id":"https://openalex.org/C82876162","wikidata":"https://www.wikidata.org/wiki/Q17096504","display_name":"Latency (audio)","level":2,"score":0.43459999561309814},{"id":"https://openalex.org/C159877910","wikidata":"https://www.wikidata.org/wiki/Q2202883","display_name":"Autoregressive model","level":2,"score":0.4323999881744385},{"id":"https://openalex.org/C89288958","wikidata":"https://www.wikidata.org/wiki/Q7301504","display_name":"Reasoning system","level":2,"score":0.4251999855041504},{"id":"https://openalex.org/C86827895","wikidata":"https://www.wikidata.org/wiki/Q7098582","display_name":"Opportunistic reasoning","level":4,"score":0.39430001378059387},{"id":"https://openalex.org/C195344581","wikidata":"https://www.wikidata.org/wiki/Q2555318","display_name":"Automated reasoning","level":2,"score":0.38960000872612},{"id":"https://openalex.org/C21847791","wikidata":"https://www.wikidata.org/wiki/Q191081","display_name":"Logical conjunction","level":2,"score":0.3538999855518341},{"id":"https://openalex.org/C37335422","wikidata":"https://www.wikidata.org/wiki/Q6888134","display_name":"Model-based reasoning","level":3,"score":0.34139999747276306},{"id":"https://openalex.org/C155911833","wikidata":"https://www.wikidata.org/wiki/Q3817354","display_name":"Spatial intelligence","level":2,"score":0.33090001344680786},{"id":"https://openalex.org/C2779304628","wikidata":"https://www.wikidata.org/wiki/Q3503480","display_name":"Face (sociological concept)","level":2,"score":0.32510000467300415},{"id":"https://openalex.org/C203702819","wikidata":"https://www.wikidata.org/wiki/Q17146953","display_name":"Logical data model","level":3,"score":0.310699999332428},{"id":"https://openalex.org/C67186912","wikidata":"https://www.wikidata.org/wiki/Q367664","display_name":"Data modeling","level":2,"score":0.30329999327659607},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.30239999294281006},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.2865000069141388},{"id":"https://openalex.org/C126042441","wikidata":"https://www.wikidata.org/wiki/Q1324888","display_name":"Frame (networking)","level":2,"score":0.28040000796318054},{"id":"https://openalex.org/C43971567","wikidata":"https://www.wikidata.org/wiki/Q3142865","display_name":"Logical reasoning","level":2,"score":0.2745000123977661},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.2653999924659729},{"id":"https://openalex.org/C134752490","wikidata":"https://www.wikidata.org/wiki/Q374182","display_name":"Logical consequence","level":2,"score":0.262800008058548},{"id":"https://openalex.org/C166088908","wikidata":"https://www.wikidata.org/wiki/Q308495","display_name":"Abductive reasoning","level":2,"score":0.2615000009536743},{"id":"https://openalex.org/C2780586882","wikidata":"https://www.wikidata.org/wiki/Q7520643","display_name":"Simple (philosophy)","level":2,"score":0.26019999384880066},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.2531999945640564},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.25279998779296875},{"id":"https://openalex.org/C89611455","wikidata":"https://www.wikidata.org/wiki/Q6804646","display_name":"Mechanism (biology)","level":2,"score":0.2515999972820282},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.25060001015663147}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.22280","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.22280","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.22280","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.22280","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Vision-Language-Action":[0],"(VLA)":[1],"models":[2,20,49,60,111],"map":[3],"visual":[4,72,126],"observations":[5],"and":[6,74,132,179],"language":[7],"instructions":[8],"directly":[9],"to":[10,46,68,79,141,163],"robotic":[11],"actions.":[12],"While":[13],"effective":[14],"for":[15,109,128,136],"simple":[16],"tasks,":[17],"standard":[18],"VLA":[19,48,59,110],"often":[21],"struggle":[22],"with":[23,50,90,112],"complex,":[24],"multi-step":[25],"tasks":[26],"requiring":[27],"logical":[28,76],"planning,":[29],"as":[30,32,183,185],"well":[31,184],"precise":[33],"manipulations":[34],"demanding":[35],"fine-grained":[36],"spatial":[37,130],"perception.":[38],"Recent":[39],"efforts":[40],"have":[41],"incorporated":[42],"Chain-of-Thought":[43],"(CoT)":[44],"reasoning":[45,115,162],"endow":[47],"a":[51,105,113,125,133,148],"``thinking":[52],"before":[53],"acting''":[54],"capability.":[55],"However,":[56],"current":[57],"CoT-based":[58],"face":[61],"two":[62,154],"critical":[63],"limitations:":[64],"1)":[65],"an":[66],"inability":[67],"simultaneously":[69],"capture":[70],"low-level":[71,129],"details":[73],"high-level":[75,137],"planning":[77],"due":[78],"their":[80],"reliance":[81],"on":[82,176],"isolated,":[83],"single-modal":[84],"CoT;":[85],"2)":[86],"high":[87],"inference":[88],"latency":[89,144],"compounding":[91],"errors":[92],"caused":[93],"by":[94],"step-by-step":[95],"autoregressive":[96,161],"decoding.":[97],"To":[98,117],"address":[99],"these":[100],"limitations,":[101],"we":[102,146],"propose":[103],"DualCoT-VLA,":[104],"visual-linguistic":[106],"CoT":[107,127,135,150],"method":[108,123],"parallel":[114,149],"mechanism.":[116],"achieve":[118],"comprehensive":[119],"multi-modal":[120],"reasoning,":[121],"our":[122,171],"integrates":[124],"understanding":[131],"linguistic":[134],"task":[138],"planning.":[139],"Furthermore,":[140],"overcome":[142],"the":[143,177],"bottleneck,":[145],"introduce":[147],"mechanism":[151],"that":[152,170],"incorporates":[153],"sets":[155],"of":[156],"learnable":[157],"query":[158],"tokens,":[159],"shifting":[160],"single-step":[164],"forward":[165],"reasoning.":[166],"Extensive":[167],"experiments":[168],"demonstrate":[169],"DualCoT-VLA":[172],"achieves":[173],"state-of-the-art":[174],"performance":[175],"LIBERO":[178],"RoboCasa":[180],"GR1":[181],"benchmarks,":[182],"in":[186],"real-world":[187],"platforms.":[188]},"counts_by_year":[],"updated_date":"2026-04-25T08:17:42.794288","created_date":"2026-03-25T00:00:00"}
