{"id":"https://openalex.org/W7147713404","doi":"https://doi.org/10.48550/arxiv.2603.29634","title":"MacTok: Robust Continuous Tokenization for Image Generation","display_name":"MacTok: Robust Continuous Tokenization for Image Generation","publication_year":2026,"publication_date":"2026-03-31","ids":{"openalex":"https://openalex.org/W7147713404","doi":"https://doi.org/10.48550/arxiv.2603.29634"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.29634","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.29634","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.29634","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5130758226","display_name":"Hengyu Zeng","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Zeng, Hengyu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132553818","display_name":"Xin Gao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gao, Xin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132656149","display_name":"Guanghao Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Guanghao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132596951","display_name":"Yuxiang Yan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yan, Yuxiang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132563348","display_name":"Jiaoyang Ruan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ruan, Jiaoyang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132686437","display_name":"Junpeng Ma","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ma, Junpeng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132597889","display_name":"Haoyu Albert Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Haoyu Albert","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5132695140","display_name":"Jian Pu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Pu, Jian","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":8,"corresponding_author_ids":["https://openalex.org/A5130758226"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.9355000257492065,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.9355000257492065,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12357","display_name":"Digital Media Forensic Detection","score":0.01600000075995922,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11019","display_name":"Image Enhancement Techniques","score":0.006599999964237213,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/discriminative-model","display_name":"Discriminative model","score":0.6273000240325928},{"id":"https://openalex.org/keywords/semantics","display_name":"Semantics (computer science)","score":0.5942999720573425},{"id":"https://openalex.org/keywords/encode","display_name":"ENCODE","score":0.5677000284194946},{"id":"https://openalex.org/keywords/masking","display_name":"Masking (illustration)","score":0.5641999840736389},{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.46230000257492065},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.44179999828338623},{"id":"https://openalex.org/keywords/encoder","display_name":"Encoder","score":0.4171000123023987},{"id":"https://openalex.org/keywords/image","display_name":"Image (mathematics)","score":0.38749998807907104}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.718999981880188},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.65420001745224},{"id":"https://openalex.org/C97931131","wikidata":"https://www.wikidata.org/wiki/Q5282087","display_name":"Discriminative model","level":2,"score":0.6273000240325928},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.5942999720573425},{"id":"https://openalex.org/C66746571","wikidata":"https://www.wikidata.org/wiki/Q1134833","display_name":"ENCODE","level":3,"score":0.5677000284194946},{"id":"https://openalex.org/C2777402240","wikidata":"https://www.wikidata.org/wiki/Q6783436","display_name":"Masking (illustration)","level":2,"score":0.5641999840736389},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.46230000257492065},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.44179999828338623},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.4171000123023987},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.38749998807907104},{"id":"https://openalex.org/C59404180","wikidata":"https://www.wikidata.org/wiki/Q17013334","display_name":"Feature learning","level":2,"score":0.3537999987602234},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.3472999930381775},{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.34139999747276306},{"id":"https://openalex.org/C125411270","wikidata":"https://www.wikidata.org/wiki/Q18653","display_name":"Encoding (memory)","level":2,"score":0.2797999978065491},{"id":"https://openalex.org/C197115733","wikidata":"https://www.wikidata.org/wiki/Q1003136","display_name":"Forcing (mathematics)","level":2,"score":0.27559998631477356},{"id":"https://openalex.org/C183322885","wikidata":"https://www.wikidata.org/wiki/Q17007702","display_name":"Context model","level":3,"score":0.27239999175071716},{"id":"https://openalex.org/C89600930","wikidata":"https://www.wikidata.org/wiki/Q1423946","display_name":"Segmentation","level":2,"score":0.27129998803138733},{"id":"https://openalex.org/C124504099","wikidata":"https://www.wikidata.org/wiki/Q56933","display_name":"Image segmentation","level":3,"score":0.2628999948501587},{"id":"https://openalex.org/C2779200073","wikidata":"https://www.wikidata.org/wiki/Q18395575","display_name":"Visual masking","level":4,"score":0.2581999897956848},{"id":"https://openalex.org/C124066611","wikidata":"https://www.wikidata.org/wiki/Q28684319","display_name":"Sparse approximation","level":2,"score":0.2563999891281128},{"id":"https://openalex.org/C126042441","wikidata":"https://www.wikidata.org/wiki/Q1324888","display_name":"Frame (networking)","level":2,"score":0.25519999861717224}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.29634","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.29634","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.29634","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.29634","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"score":0.7696993350982666,"display_name":"Reduced inequalities","id":"https://metadata.un.org/sdg/10"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Continuous":[0],"image":[1,60],"tokenizers":[2],"enable":[3],"efficient":[4],"visual":[5,102],"generation,":[6],"and":[7,62,71,83,107,140,161,168],"those":[8],"based":[9],"on":[10],"variational":[11],"frameworks":[12],"can":[13],"learn":[14],"smooth,":[15],"structured":[16],"latent":[17,44,81,121],"representations":[18],"through":[19],"KL":[20],"regularization.":[21],"Yet":[22],"this":[23],"often":[24],"leads":[25],"to":[26,37,65,79,87,96,154],"posterior":[27,166],"collapse":[28,67,167],"when":[29],"using":[30],"fewer":[31],"tokens,":[32],"where":[33],"the":[34,42,94],"encoder":[35],"fails":[36],"encode":[38,97],"informative":[39,89],"features":[40],"into":[41],"compressed":[43,119],"space.":[45],"To":[46],"address":[47],"this,":[48],"we":[49],"introduce":[50],"\\textbf{MacTok},":[51],"a":[52,117,133,141],"\\textbf{M}asked":[53],"\\textbf{A}ugmenting":[54],"1D":[55,120],"\\textbf{C}ontinuous":[56],"\\textbf{Tok}enizer":[57],"that":[58,159],"leverages":[59],"masking":[61,78,86,160],"representation":[63,109],"alignment":[64],"prevent":[66,165],"while":[68,148],"learning":[69,82],"compact":[70],"robust":[72,98],"representations.":[73],"MacTok":[74,111,131],"applies":[75],"both":[76],"random":[77],"regularize":[80],"DINO-guided":[84],"semantic":[85,162],"emphasize":[88],"regions":[90],"in":[91,116],"images,":[92],"forcing":[93],"model":[95],"semantics":[99],"from":[100],"incomplete":[101],"evidence.":[103],"Combined":[104],"with":[105,146],"global":[106],"local":[108],"alignment,":[110],"preserves":[112],"rich":[113],"discriminative":[114],"information":[115],"highly":[118],"space,":[122],"requiring":[123],"only":[124],"64":[125],"or":[126],"128":[127],"tokens.":[128],"On":[129],"ImageNet,":[130],"achieves":[132],"competitive":[134],"gFID":[135],"of":[136],"1.44":[137],"at":[138,144],"256$\\times$256":[139],"state-of-the-art":[142],"1.52":[143],"512$\\times$512":[145],"SiT-XL,":[147],"reducing":[149],"token":[150],"usage":[151],"by":[152],"up":[153],"64$\\times$.":[155],"These":[156],"results":[157],"confirm":[158],"guidance":[163],"together":[164],"achieve":[169],"efficient,":[170],"high-fidelity":[171],"tokenization.":[172]},"counts_by_year":[],"updated_date":"2026-04-02T13:53:19.096889","created_date":"2026-04-02T00:00:00"}
