{"id":"https://openalex.org/W4405355469","doi":"https://doi.org/10.1109/iccv51701.2025.01596","title":"Spectral Image Tokenizer","display_name":"Spectral Image Tokenizer","publication_year":2025,"publication_date":"2025-10-19","ids":{"openalex":"https://openalex.org/W4405355469","doi":"https://doi.org/10.1109/iccv51701.2025.01596"},"language":"en","primary_location":{"id":"doi:10.1109/iccv51701.2025.01596","is_oa":false,"landing_page_url":"https://doi.org/10.1109/iccv51701.2025.01596","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE/CVF International Conference on Computer Vision (ICCV)","raw_type":"proceedings-article"},"type":"preprint","indexed_in":["arxiv","crossref","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2412.09607","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5030999390","display_name":"Carlos Esteves","orcid":"https://orcid.org/0000-0001-9413-1201"},"institutions":[{"id":"https://openalex.org/I1291425158","display_name":"Google (United States)","ror":"https://ror.org/00njsd438","country_code":"US","type":"company","lineage":["https://openalex.org/I1291425158","https://openalex.org/I4210128969"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Carlos Esteves","raw_affiliation_strings":["Google Research"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Google Research","institution_ids":["https://openalex.org/I1291425158"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101542156","display_name":"Mohammed Suhail","orcid":"https://orcid.org/0000-0002-3130-8919"},"institutions":[{"id":"https://openalex.org/I1291425158","display_name":"Google (United States)","ror":"https://ror.org/00njsd438","country_code":"US","type":"company","lineage":["https://openalex.org/I1291425158","https://openalex.org/I4210128969"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Mohammed Suhail","raw_affiliation_strings":["Google Research"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Google Research","institution_ids":["https://openalex.org/I1291425158"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5087850633","display_name":"Ameesh Makadia","orcid":null},"institutions":[{"id":"https://openalex.org/I1291425158","display_name":"Google (United States)","ror":"https://ror.org/00njsd438","country_code":"US","type":"company","lineage":["https://openalex.org/I1291425158","https://openalex.org/I4210128969"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Ameesh Makadia","raw_affiliation_strings":["Google Research"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Google Research","institution_ids":["https://openalex.org/I1291425158"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5030999390"],"corresponding_institution_ids":["https://openalex.org/I1291425158"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.00123287,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"17181","last_page":"17190"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T14339","display_name":"Image Processing and 3D Reconstruction","score":0.9502000212669373,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T14339","display_name":"Image Processing and 3D Reconstruction","score":0.9502000212669373,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10824","display_name":"Image Retrieval and Classification Techniques","score":0.9139999747276306,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/image","display_name":"Image (mathematics)","score":0.5959266424179077},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.3670305013656616},{"id":"https://openalex.org/keywords/computer-vision","display_name":"Computer vision","score":0.34101778268814087}],"concepts":[{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.5959266424179077},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.3670305013656616},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.34101778268814087}],"mesh":[],"locations_count":3,"locations":[{"id":"doi:10.1109/iccv51701.2025.01596","is_oa":false,"landing_page_url":"https://doi.org/10.1109/iccv51701.2025.01596","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE/CVF International Conference on Computer Vision (ICCV)","raw_type":"proceedings-article"},{"id":"pmh:oai:arXiv.org:2412.09607","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2412.09607","pdf_url":"https://arxiv.org/pdf/2412.09607","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"doi:10.48550/arxiv.2412.09607","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2412.09607","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2412.09607","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2412.09607","pdf_url":"https://arxiv.org/pdf/2412.09607","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2899084033","https://openalex.org/W2748952813","https://openalex.org/W2390279801","https://openalex.org/W4391913857","https://openalex.org/W2358668433","https://openalex.org/W4396701345","https://openalex.org/W2376932109","https://openalex.org/W2001405890","https://openalex.org/W4396696052"],"abstract_inverted_index":{"Image":[0],"tokenizers":[1],"map":[2],"images":[3,84,97],"to":[4,48,156],"sequences":[5],"of":[6,14,65,98,113,120,128,148],"discrete":[7,57],"tokens,":[8],"and":[9,95,177],"are":[10,21,85],"a":[11,56,71,116,125,145],"crucial":[12],"component":[13],"autoregressive":[15,41,154],"transformer-based":[16],"image":[17,51,69,160,172,175],"generation.":[18],"The":[19],"tokens":[20,66,142],"typically":[22],"associated":[23],"with":[24],"spatial":[25],"locations":[26],"in":[27,32,70],"the":[28,50,63,68,106,121,129,138,149,164],"input":[29],"image,":[30,122,131,150],"arranged":[31],"raster":[33],"scan":[34],"order,":[35],"which":[36],"is":[37],"not":[38],"ideal":[39],"for":[40,108,159],"modeling.":[42],"In":[43],"this":[44],"paper,":[45],"we":[46],"propose":[47],"tokenize":[49],"spectrum":[52],"instead,":[53],"obtained":[54],"from":[55],"wavelet":[58],"transform":[59],"(DWT),":[60],"such":[61],"that":[62,82],"sequence":[64],"represents":[67],"coarse-to-fine":[72],"fashion.":[73],"Our":[74],"tokenizer":[75,165],"brings":[76],"several":[77],"advantages:":[78],"1)":[79],"it":[80,92,104,123,133,152],"leverages":[81],"natural":[83],"more":[86],"compressible":[87],"at":[88],"high":[89],"frequencies,":[90],"2)":[91],"can":[93,143],"take":[94],"reconstruct":[96,144],"different":[99],"resolutions":[100],"without":[101],"retraining,":[102],"3)":[103],"improves":[105],"conditioning":[107,114],"next-token":[109],"prediction":[110],"--":[111],"instead":[112],"on":[115],"partial":[117,135],"line-by-line":[118],"reconstruction":[119,127,166],"takes":[124],"coarse":[126,146],"full":[130],"4)":[132],"enables":[134,153],"decoding":[136],"where":[137],"first":[139],"few":[140],"generated":[141],"version":[147],"5)":[151],"models":[155],"be":[157],"used":[158],"upsampling.":[161],"We":[162],"evaluate":[163],"metrics":[167],"as":[168,170],"well":[169],"multiscale":[171],"generation,":[173],"text-guided":[174],"upsampling":[176],"editing.":[178]},"counts_by_year":[],"updated_date":"2026-05-06T06:03:25.996018","created_date":"2025-10-10T00:00:00"}
