{"id":"https://openalex.org/W7135231613","doi":"https://doi.org/10.48550/arxiv.2603.12252","title":"EndoCoT: Scaling Endogenous Chain-of-Thought Reasoning in Diffusion Models","display_name":"EndoCoT: Scaling Endogenous Chain-of-Thought Reasoning in Diffusion Models","publication_year":2026,"publication_date":"2026-03-12","ids":{"openalex":"https://openalex.org/W7135231613","doi":"https://doi.org/10.48550/arxiv.2603.12252"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.12252","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.12252","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.12252","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5129013739","display_name":"Xuanlang Dai","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Dai, Xuanlang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129057482","display_name":"Yujie Zhou","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhou, Yujie","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128924862","display_name":"Long Xing","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xing, Long","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128923678","display_name":"Jiazi Bu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bu, Jiazi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5113244350","display_name":"Xilin Wei","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wei, Xilin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129054905","display_name":"Yuhong Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Yuhong","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129083497","display_name":"Beichen Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Beichen","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129067547","display_name":"Kai Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Kai","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5129014568","display_name":"Yuhang Zang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zang, Yuhang","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":9,"corresponding_author_ids":["https://openalex.org/A5129013739"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.429500013589859,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.429500013589859,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.23180000483989716,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.07289999723434448,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/encoder","display_name":"Encoder","score":0.7085999846458435},{"id":"https://openalex.org/keywords/decoding-methods","display_name":"Decoding methods","score":0.6309000253677368},{"id":"https://openalex.org/keywords/encoding","display_name":"Encoding (memory)","score":0.585099995136261},{"id":"https://openalex.org/keywords/invariant","display_name":"Invariant (physics)","score":0.527899980545044},{"id":"https://openalex.org/keywords/scaling","display_name":"Scaling","score":0.4178999960422516},{"id":"https://openalex.org/keywords/noise-reduction","display_name":"Noise reduction","score":0.4009999930858612},{"id":"https://openalex.org/keywords/macro","display_name":"Macro","score":0.3612000048160553},{"id":"https://openalex.org/keywords/trajectory","display_name":"Trajectory","score":0.3346000015735626}],"concepts":[{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.7085999846458435},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6912999749183655},{"id":"https://openalex.org/C57273362","wikidata":"https://www.wikidata.org/wiki/Q576722","display_name":"Decoding methods","level":2,"score":0.6309000253677368},{"id":"https://openalex.org/C125411270","wikidata":"https://www.wikidata.org/wiki/Q18653","display_name":"Encoding (memory)","level":2,"score":0.585099995136261},{"id":"https://openalex.org/C190470478","wikidata":"https://www.wikidata.org/wiki/Q2370229","display_name":"Invariant (physics)","level":2,"score":0.527899980545044},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5110999941825867},{"id":"https://openalex.org/C99844830","wikidata":"https://www.wikidata.org/wiki/Q102441924","display_name":"Scaling","level":2,"score":0.4178999960422516},{"id":"https://openalex.org/C163294075","wikidata":"https://www.wikidata.org/wiki/Q581861","display_name":"Noise reduction","level":2,"score":0.4009999930858612},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.3785000145435333},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.3643999993801117},{"id":"https://openalex.org/C166955791","wikidata":"https://www.wikidata.org/wiki/Q629579","display_name":"Macro","level":2,"score":0.3612000048160553},{"id":"https://openalex.org/C13662910","wikidata":"https://www.wikidata.org/wiki/Q193139","display_name":"Trajectory","level":2,"score":0.3346000015735626},{"id":"https://openalex.org/C83725634","wikidata":"https://www.wikidata.org/wiki/Q7268699","display_name":"Qualitative reasoning","level":2,"score":0.3249000012874603},{"id":"https://openalex.org/C2781170535","wikidata":"https://www.wikidata.org/wiki/Q30587856","display_name":"Noisy data","level":2,"score":0.29580000042915344},{"id":"https://openalex.org/C2780069185","wikidata":"https://www.wikidata.org/wiki/Q7977945","display_name":"Equivalence (formal languages)","level":2,"score":0.2937000095844269},{"id":"https://openalex.org/C48103436","wikidata":"https://www.wikidata.org/wiki/Q599031","display_name":"State (computer science)","level":2,"score":0.2743000090122223},{"id":"https://openalex.org/C88626702","wikidata":"https://www.wikidata.org/wiki/Q1128903","display_name":"Continuation","level":2,"score":0.2702000141143799},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.26649999618530273},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.26350000500679016},{"id":"https://openalex.org/C2781085045","wikidata":"https://www.wikidata.org/wiki/Q7318308","display_name":"Reversing","level":2,"score":0.2630999982357025},{"id":"https://openalex.org/C111335779","wikidata":"https://www.wikidata.org/wiki/Q3454686","display_name":"Reduction (mathematics)","level":2,"score":0.25519999861717224},{"id":"https://openalex.org/C47822265","wikidata":"https://www.wikidata.org/wiki/Q854457","display_name":"Complex system","level":2,"score":0.25360000133514404}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.12252","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.12252","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.12252","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.12252","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Recently,":[0],"Multimodal":[1],"Large":[2],"Language":[3],"Models":[4],"(MLLMs)":[5],"have":[6],"been":[7],"widely":[8],"integrated":[9],"into":[10,81],"diffusion":[11],"frameworks":[12],"primarily":[13],"as":[14,22],"text":[15,35,161],"encoders":[16],"to":[17,44,54,124,137,170],"tackle":[18],"complex":[19,59,79,177],"tasks":[20,178],"such":[21],"spatial":[23],"reasoning.":[24],"However,":[25],"this":[26,91],"paradigm":[27],"suffers":[28],"from":[29,76],"two":[30,157],"critical":[31],"limitations:":[32],"(i)":[33],"MLLMs":[34,53],"encoder":[36,162],"exhibits":[37],"insufficient":[38],"reasoning":[39,105,140],"depth.":[40],"Single-step":[41],"encoding":[42],"fails":[43],"activate":[45],"the":[46,67,125,139,149,159,168,201],"Chain-of-Thought":[47,96],"process,":[48],"which":[49],"is":[50,135],"essential":[51],"for":[52,58],"provide":[55],"accurate":[56],"guidance":[57,63,71,117],"tasks.":[60],"(ii)":[61],"The":[62,208],"remains":[64,142],"invariant":[65],"during":[66,72],"decoding":[68,73],"process.":[69,128],"Invariant":[70],"prevents":[74],"DiT":[75,169],"progressively":[77,173],"decomposing":[78],"instructions":[80],"actionable":[82],"denoising":[83,127],"steps,":[84],"even":[85],"with":[86,152],"correct":[87],"MLLM":[88,160],"encodings.":[89],"To":[90],"end,":[92],"we":[93],"propose":[94],"Endogenous":[95],"(EndoCoT),":[97],"a":[98,130,180],"novel":[99],"framework":[100],"that":[101],"first":[102],"activates":[103],"MLLMs'":[104],"potential":[106],"by":[107,147,204],"iteratively":[108],"refining":[109],"latent":[110],"thought":[111,116,132],"states":[112,123],"through":[113],"an":[114,195],"iterative":[115],"module,":[118],"and":[119,174,192,210],"then":[120],"bridges":[121],"these":[122,156],"DiT's":[126],"Second,":[129],"terminal":[131],"grounding":[133],"module":[134],"applied":[136],"ensure":[138],"trajectory":[141],"grounded":[143],"in":[144,179],"textual":[145],"supervision":[146],"aligning":[148],"final":[150],"state":[151],"ground-truth":[153],"answers.":[154],"With":[155],"components,":[158],"delivers":[163],"meticulously":[164],"reasoned":[165],"guidance,":[166],"enabling":[167],"execute":[171],"it":[172],"ultimately":[175],"solve":[176],"step-by-step":[181],"manner.":[182],"Extensive":[183],"evaluations":[184],"across":[185],"diverse":[186],"benchmarks":[187],"(e.g.,":[188],"Maze,":[189],"TSP,":[190],"VSP,":[191],"Sudoku)":[193],"achieve":[194],"average":[196],"accuracy":[197],"of":[198],"92.1%,":[199],"outperforming":[200],"strongest":[202],"baseline":[203],"8.3":[205],"percentage":[206],"points.":[207],"code":[209],"dataset":[211],"are":[212],"publicly":[213],"available":[214],"at":[215],"https://lennoxdai.github.io/EndoCoT-Webpage/.":[216]},"counts_by_year":[],"updated_date":"2026-03-18T06:27:02.140700","created_date":"2026-03-14T00:00:00"}
