{"id":"https://openalex.org/W7128781553","doi":"https://doi.org/10.48550/arxiv.2602.12279","title":"UniT: Unified Multimodal Chain-of-Thought Test-time Scaling","display_name":"UniT: Unified Multimodal Chain-of-Thought Test-time Scaling","publication_year":2026,"publication_date":"2026-02-12","ids":{"openalex":"https://openalex.org/W7128781553","doi":"https://doi.org/10.48550/arxiv.2602.12279"},"language":null,"primary_location":{"id":"pmh:doi:10.48550/arxiv.2602.12279","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":null,"any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5123479695","display_name":"Leon Liangyu Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Chen, Leon Liangyu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5125948576","display_name":"Haoyu Ma","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ma, Haoyu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5124387732","display_name":"Zhipeng Fan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Fan, Zhipeng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101830887","display_name":"Ziqi Huang","orcid":"https://orcid.org/0000-0001-8008-5873"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Huang, Ziqi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5007075153","display_name":"Animesh A. Sinha","orcid":"https://orcid.org/0000-0003-3213-7813"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sinha, Animesh","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5125933605","display_name":"Xiaoliang Dai","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Dai, Xiaoliang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5125990237","display_name":"Jialiang Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Jialiang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5125897701","display_name":"Zecheng He","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"He, Zecheng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5125949344","display_name":"Jianwei Yang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Jianwei","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5125978543","display_name":"Chunyuan Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Chunyuan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5107906906","display_name":"J. Sun","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sun, Junzhe","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5125918373","display_name":"Chu Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Chu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5125925179","display_name":"Serena Yeung-Levy","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yeung-Levy, Serena","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5125895207","display_name":"Felix Juefei-Xu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Juefei-Xu, Felix","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":14,"corresponding_author_ids":["https://openalex.org/A5123479695"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.890999972820282,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.890999972820282,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.041999999433755875,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.008700000122189522,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.7281000018119812},{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.5839999914169312},{"id":"https://openalex.org/keywords/unified-model","display_name":"Unified Model","score":0.5223000049591064},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.4878999888896942},{"id":"https://openalex.org/keywords/scaling","display_name":"Scaling","score":0.42750000953674316}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7383999824523926},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.7281000018119812},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.5839999914169312},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5241000056266785},{"id":"https://openalex.org/C45493050","wikidata":"https://www.wikidata.org/wiki/Q7884934","display_name":"Unified Model","level":2,"score":0.5223000049591064},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.4878999888896942},{"id":"https://openalex.org/C99844830","wikidata":"https://www.wikidata.org/wiki/Q102441924","display_name":"Scaling","level":2,"score":0.42750000953674316},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.38260000944137573},{"id":"https://openalex.org/C2777508537","wikidata":"https://www.wikidata.org/wiki/Q7936620","display_name":"Visual reasoning","level":2,"score":0.36309999227523804},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.35690000653266907},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.27709999680519104}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:doi:10.48550/arxiv.2602.12279","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},{"id":"doi:10.48550/arxiv.2602.12279","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2602.12279","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:doi:10.48550/arxiv.2602.12279","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Unified":[0],"models":[1,76,134],"can":[2],"handle":[3],"both":[4,186],"multimodal":[5,27,75,87,177],"understanding":[6,189],"and":[7,47,100,113,125,156,167,188],"generation":[8,166,187],"within":[9],"a":[10,18,84,93,153],"single":[11,19,94],"architecture,":[12],"yet":[13],"they":[14],"typically":[15],"operate":[16],"in":[17,190],"pass":[20],"without":[21],"iteratively":[22],"refining":[23],"their":[24],"outputs.":[25],"Many":[26],"tasks,":[28],"especially":[29],"those":[30],"involving":[31],"complex":[32],"spatial":[33],"compositions,":[34],"multiple":[35,103],"interacting":[36],"objects,":[37],"or":[38],"evolving":[39],"instructions,":[40,43],"require":[41],"decomposing":[42],"verifying":[44],"intermediate":[45],"results,":[46],"making":[48],"iterative":[49,63],"corrections.":[50],"While":[51],"test-time":[52,89,115,178],"scaling":[53,90,179],"(TTS)":[54],"has":[55],"demonstrated":[56],"that":[57,91],"allocating":[58],"additional":[59],"inference":[60,116,143],"compute":[61],"for":[62,86,184],"reasoning":[64,138,151],"substantially":[65],"improves":[66,170],"language":[67],"model":[68,96,111],"performance,":[69],"extending":[70],"this":[71],"paradigm":[72,183],"to":[73,97,117,141],"unified":[74,95,110,133,191],"remains":[77],"an":[78,181],"open":[79],"challenge.":[80],"We":[81],"introduce":[82],"UniT,":[83],"framework":[85],"chain-of-thought":[88,150],"enables":[92],"reason,":[98],"verify,":[99],"refine":[101],"across":[102],"rounds.":[104],"UniT":[105],"combines":[106],"agentic":[107],"data":[108],"synthesis,":[109],"training,":[112],"flexible":[114],"elicit":[118],"cognitive":[119],"behaviors":[120],"including":[121],"verification,":[122],"subgoal":[123],"decomposition,":[124],"content":[126],"memory.":[127],"Our":[128],"key":[129],"findings":[130],"are:":[131],"(1)":[132],"trained":[135],"on":[136,165],"short":[137],"trajectories":[139,169],"generalize":[140],"longer":[142],"chains":[144],"at":[145],"test":[146],"time;":[147],"(2)":[148],"sequential":[149],"provides":[152],"more":[154],"scalable":[155],"compute-efficient":[157],"TTS":[158],"strategy":[159],"than":[160],"parallel":[161],"sampling;":[162],"(3)":[163],"training":[164],"editing":[168],"out-of-distribution":[171],"visual":[172],"reasoning.":[173],"These":[174],"results":[175],"establish":[176],"as":[180],"effective":[182],"advancing":[185],"models.":[192]},"counts_by_year":[],"updated_date":"2026-04-04T16:13:02.066488","created_date":"2026-02-14T00:00:00"}
