{"id":"https://openalex.org/W7140294828","doi":"https://doi.org/10.48550/arxiv.2603.22458","title":"MinerU-Diffusion: Rethinking Document OCR as Inverse Rendering via Diffusion Decoding","display_name":"MinerU-Diffusion: Rethinking Document OCR as Inverse Rendering via Diffusion Decoding","publication_year":2026,"publication_date":"2026-03-23","ids":{"openalex":"https://openalex.org/W7140294828","doi":"https://doi.org/10.48550/arxiv.2603.22458"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.22458","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.22458","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.22458","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5130581926","display_name":"Hejun Dong","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Dong, Hejun","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5085823891","display_name":"Junbo Niu","orcid":"https://orcid.org/0000-0002-2135-6853"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Niu, Junbo","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130598930","display_name":"Bin Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Bin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130559520","display_name":"Weijun Zeng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zeng, Weijun","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130545085","display_name":"Wentao Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Wentao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5130591892","display_name":"Conghui He","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"He, Conghui","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5130581926"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10601","display_name":"Handwritten Text Recognition Techniques","score":0.982699990272522,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10601","display_name":"Handwritten Text Recognition Techniques","score":0.982699990272522,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.0019000000320374966,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.00139999995008111,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/decoding-methods","display_name":"Decoding methods","score":0.6510000228881836},{"id":"https://openalex.org/keywords/rendering","display_name":"Rendering (computer graphics)","score":0.6365000009536743},{"id":"https://openalex.org/keywords/prior-probability","display_name":"Prior probability","score":0.608299970626831},{"id":"https://openalex.org/keywords/robustness","display_name":"Robustness (evolution)","score":0.597599983215332},{"id":"https://openalex.org/keywords/autoregressive-model","display_name":"Autoregressive model","score":0.5565999746322632},{"id":"https://openalex.org/keywords/serialization","display_name":"Serialization","score":0.5242000222206116},{"id":"https://openalex.org/keywords/optical-character-recognition","display_name":"Optical character recognition","score":0.37400001287460327}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7639999985694885},{"id":"https://openalex.org/C57273362","wikidata":"https://www.wikidata.org/wiki/Q576722","display_name":"Decoding methods","level":2,"score":0.6510000228881836},{"id":"https://openalex.org/C205711294","wikidata":"https://www.wikidata.org/wiki/Q176953","display_name":"Rendering (computer graphics)","level":2,"score":0.6365000009536743},{"id":"https://openalex.org/C177769412","wikidata":"https://www.wikidata.org/wiki/Q278090","display_name":"Prior probability","level":3,"score":0.608299970626831},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.597599983215332},{"id":"https://openalex.org/C159877910","wikidata":"https://www.wikidata.org/wiki/Q2202883","display_name":"Autoregressive model","level":2,"score":0.5565999746322632},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5415999889373779},{"id":"https://openalex.org/C52723943","wikidata":"https://www.wikidata.org/wiki/Q1127410","display_name":"Serialization","level":2,"score":0.5242000222206116},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.3953000009059906},{"id":"https://openalex.org/C546480517","wikidata":"https://www.wikidata.org/wiki/Q167555","display_name":"Optical character recognition","level":3,"score":0.37400001287460327},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.36579999327659607},{"id":"https://openalex.org/C70437156","wikidata":"https://www.wikidata.org/wiki/Q7228652","display_name":"Pooling","level":2,"score":0.36480000615119934},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.32710000872612},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.32670000195503235},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.3041999936103821},{"id":"https://openalex.org/C89600930","wikidata":"https://www.wikidata.org/wiki/Q1423946","display_name":"Segmentation","level":2,"score":0.29159998893737793},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.2775999903678894},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.2565999925136566},{"id":"https://openalex.org/C157657479","wikidata":"https://www.wikidata.org/wiki/Q2367247","display_name":"Closed captioning","level":3,"score":0.2549000084400177},{"id":"https://openalex.org/C73301696","wikidata":"https://www.wikidata.org/wiki/Q5469984","display_name":"Formalism (music)","level":3,"score":0.2506999969482422}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.22458","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.22458","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.22458","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.22458","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"display_name":"Quality Education","score":0.6839817762374878,"id":"https://metadata.un.org/sdg/4"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Optical":[0],"character":[1],"recognition":[2],"(OCR)":[3],"has":[4],"evolved":[5],"from":[6,55],"line-level":[7],"transcription":[8],"to":[9,15,113,132,137],"structured":[10],"document":[11,53],"parsing,":[12],"requiring":[13],"models":[14],"recover":[16],"long-form":[17],"sequences":[18],"containing":[19],"layout,":[20],"tables,":[21],"and":[22,41,107,117,155],"formulas.":[23],"Despite":[24],"recent":[25],"advances":[26],"in":[27,45],"vision-language":[28],"models,":[29],"most":[30],"existing":[31],"systems":[32],"rely":[33],"on":[34,141,152],"autoregressive":[35,91,138],"decoding,":[36],"which":[37],"introduces":[38],"sequential":[39,92],"latency":[40],"amplifies":[42],"error":[43],"propagation":[44],"long":[46],"documents.":[47],"In":[48],"this":[49,80],"work,":[50],"we":[51,82],"revisit":[52],"OCR":[54,158],"an":[56,66,72,108],"inverse":[57],"rendering":[58],"perspective,":[59],"arguing":[60],"that":[61,89,124],"left-to-right":[62],"causal":[63],"generation":[64],"is":[65],"artifact":[67],"of":[68,75],"serialization":[69],"rather":[70],"than":[71],"intrinsic":[73],"property":[74],"the":[76,142],"task.":[77],"Motivated":[78],"by":[79],"insight,":[81],"propose":[83],"MinerU-Diffusion,":[84],"a":[85,103],"unified":[86],"diffusion-based":[87],"framework":[88],"replaces":[90],"decoding":[93,135],"with":[94],"parallel":[95],"diffusion":[96,105],"denoising":[97],"under":[98],"visual":[99,157],"conditioning.":[100],"MinerU-Diffusion":[101,125],"employs":[102],"block-wise":[104],"decoder":[106],"uncertainty-driven":[109],"curriculum":[110],"learning":[111],"strategy":[112],"enable":[114],"stable":[115],"training":[116],"efficient":[118],"long-sequence":[119],"inference.":[120],"Extensive":[121],"experiments":[122],"demonstrate":[123],"consistently":[126],"improves":[127],"robustness":[128],"while":[129],"achieving":[130],"up":[131],"3.2x":[133],"faster":[134],"compared":[136],"baselines.":[139],"Evaluations":[140],"proposed":[143],"Semantic":[144],"Shuffle":[145],"benchmark":[146],"further":[147],"confirm":[148],"its":[149],"reduced":[150],"dependence":[151],"linguistic":[153],"priors":[154],"stronger":[156],"capability.":[159]},"counts_by_year":[],"updated_date":"2026-03-26T06:10:45.909354","created_date":"2026-03-26T00:00:00"}
