{"id":"https://openalex.org/W4412376905","doi":"https://doi.org/10.1145/3726302.3730077","title":"Revolutionizing Text-to-Image Retrieval as Autoregressive Token-to-Voken Generation","display_name":"Revolutionizing Text-to-Image Retrieval as Autoregressive Token-to-Voken Generation","publication_year":2025,"publication_date":"2025-07-13","ids":{"openalex":"https://openalex.org/W4412376905","doi":"https://doi.org/10.1145/3726302.3730077"},"language":"en","primary_location":{"id":"doi:10.1145/3726302.3730077","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3726302.3730077","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3726302.3730077","source":null,"license":"cc-by-nc","license_id":"https://openalex.org/licenses/cc-by-nc","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 48th International ACM SIGIR Conference on Research and Development in Information Retrieval","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://dl.acm.org/doi/pdf/10.1145/3726302.3730077","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5101599249","display_name":"Yongqi Li","orcid":"https://orcid.org/0000-0002-6932-4228"},"institutions":[{"id":"https://openalex.org/I14243506","display_name":"Hong Kong Polytechnic University","ror":"https://ror.org/0030zas98","country_code":"HK","type":"education","lineage":["https://openalex.org/I14243506"]}],"countries":["HK"],"is_corresponding":true,"raw_author_name":"Yongqi Li","raw_affiliation_strings":["The Hong Kong Polytechnic University, Hong Kong SAR, China"],"raw_orcid":"https://orcid.org/0000-0002-6932-4228","affiliations":[{"raw_affiliation_string":"The Hong Kong Polytechnic University, Hong Kong SAR, China","institution_ids":["https://openalex.org/I14243506"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5043285342","display_name":"Hongru Cai","orcid":"https://orcid.org/0009-0007-9857-6639"},"institutions":[{"id":"https://openalex.org/I165932596","display_name":"National University of Singapore","ror":"https://ror.org/01tgyzw49","country_code":"SG","type":"education","lineage":["https://openalex.org/I165932596"]}],"countries":["SG"],"is_corresponding":false,"raw_author_name":"Hongru Cai","raw_affiliation_strings":["National University of Singapore, Singapore, Singapore"],"raw_orcid":"https://orcid.org/0009-0007-9857-6639","affiliations":[{"raw_affiliation_string":"National University of Singapore, Singapore, Singapore","institution_ids":["https://openalex.org/I165932596"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100368524","display_name":"Wenjie Wang","orcid":"https://orcid.org/0000-0002-5199-1428"},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Wenjie Wang","raw_affiliation_strings":["University of Science and Technology of China, Hefei, China"],"raw_orcid":"https://orcid.org/0000-0002-5199-1428","affiliations":[{"raw_affiliation_string":"University of Science and Technology of China, Hefei, China","institution_ids":["https://openalex.org/I126520041"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5010816782","display_name":"Leigang Qu","orcid":"https://orcid.org/0009-0004-6555-3834"},"institutions":[{"id":"https://openalex.org/I165932596","display_name":"National University of Singapore","ror":"https://ror.org/01tgyzw49","country_code":"SG","type":"education","lineage":["https://openalex.org/I165932596"]}],"countries":["SG"],"is_corresponding":false,"raw_author_name":"Leigang Qu","raw_affiliation_strings":["National University of Singapore, Singapore, Singapore"],"raw_orcid":"https://orcid.org/0009-0004-6555-3834","affiliations":[{"raw_affiliation_string":"National University of Singapore, Singapore, Singapore","institution_ids":["https://openalex.org/I165932596"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5039731055","display_name":"Yinwei Wei","orcid":"https://orcid.org/0000-0003-1791-3159"},"institutions":[{"id":"https://openalex.org/I154099455","display_name":"Shandong University","ror":"https://ror.org/0207yh398","country_code":"CN","type":"education","lineage":["https://openalex.org/I154099455"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yinwei Wei","raw_affiliation_strings":["Shandong University, Shandong, China"],"raw_orcid":"https://orcid.org/0000-0003-1791-3159","affiliations":[{"raw_affiliation_string":"Shandong University, Shandong, China","institution_ids":["https://openalex.org/I154099455"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100408983","display_name":"Wenjie Li","orcid":"https://orcid.org/0000-0002-7360-8864"},"institutions":[{"id":"https://openalex.org/I14243506","display_name":"Hong Kong Polytechnic University","ror":"https://ror.org/0030zas98","country_code":"HK","type":"education","lineage":["https://openalex.org/I14243506"]}],"countries":["HK"],"is_corresponding":false,"raw_author_name":"Wenjie Li","raw_affiliation_strings":["The Hong Kong Polytechnic University, Hong Kong SAR, China"],"raw_orcid":"https://orcid.org/0000-0002-7360-8864","affiliations":[{"raw_affiliation_string":"The Hong Kong Polytechnic University, Hong Kong SAR, China","institution_ids":["https://openalex.org/I14243506"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5038612499","display_name":"Liqiang Nie","orcid":"https://orcid.org/0000-0003-1476-0273"},"institutions":[{"id":"https://openalex.org/I204983213","display_name":"Harbin Institute of Technology","ror":"https://ror.org/01yqg2h08","country_code":"CN","type":"education","lineage":["https://openalex.org/I204983213"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Liqiang Nie","raw_affiliation_strings":["Harbin Institute of Technology, Shenzhen, China"],"raw_orcid":"https://orcid.org/0000-0003-1476-0273","affiliations":[{"raw_affiliation_string":"Harbin Institute of Technology, Shenzhen, China","institution_ids":["https://openalex.org/I204983213"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5089404640","display_name":"Tat\u2010Seng Chua","orcid":"https://orcid.org/0000-0001-6097-7807"},"institutions":[{"id":"https://openalex.org/I165932596","display_name":"National University of Singapore","ror":"https://ror.org/01tgyzw49","country_code":"SG","type":"education","lineage":["https://openalex.org/I165932596"]}],"countries":["SG"],"is_corresponding":false,"raw_author_name":"Tat-Seng Chua","raw_affiliation_strings":["National University of Singapore, Singapore, Singapore"],"raw_orcid":"https://orcid.org/0000-0001-6097-7807","affiliations":[{"raw_affiliation_string":"National University of Singapore, Singapore, Singapore","institution_ids":["https://openalex.org/I165932596"]}]}],"institutions":[],"countries_distinct_count":3,"institutions_distinct_count":8,"corresponding_author_ids":["https://openalex.org/A5101599249"],"corresponding_institution_ids":["https://openalex.org/I14243506"],"apc_list":null,"apc_paid":null,"fwci":4.533,"has_fulltext":true,"cited_by_count":4,"citation_normalized_percentile":{"value":0.94873325,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":97,"max":98},"biblio":{"volume":null,"issue":null,"first_page":"813","last_page":"822"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.9986000061035156,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9961000084877014,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/autoregressive-model","display_name":"Autoregressive model","score":0.7561260461807251},{"id":"https://openalex.org/keywords/security-token","display_name":"Security token","score":0.751245379447937},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7083723545074463},{"id":"https://openalex.org/keywords/image","display_name":"Image (mathematics)","score":0.5174594521522522},{"id":"https://openalex.org/keywords/image-retrieval","display_name":"Image retrieval","score":0.478263258934021},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.43220755457878113},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.34430134296417236},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.13429409265518188},{"id":"https://openalex.org/keywords/statistics","display_name":"Statistics","score":0.11387506127357483},{"id":"https://openalex.org/keywords/computer-network","display_name":"Computer network","score":0.09232094883918762}],"concepts":[{"id":"https://openalex.org/C159877910","wikidata":"https://www.wikidata.org/wiki/Q2202883","display_name":"Autoregressive model","level":2,"score":0.7561260461807251},{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.751245379447937},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7083723545074463},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.5174594521522522},{"id":"https://openalex.org/C1667742","wikidata":"https://www.wikidata.org/wiki/Q10927554","display_name":"Image retrieval","level":3,"score":0.478263258934021},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.43220755457878113},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.34430134296417236},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.13429409265518188},{"id":"https://openalex.org/C105795698","wikidata":"https://www.wikidata.org/wiki/Q12483","display_name":"Statistics","level":1,"score":0.11387506127357483},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.09232094883918762}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3726302.3730077","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3726302.3730077","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3726302.3730077","source":null,"license":"cc-by-nc","license_id":"https://openalex.org/licenses/cc-by-nc","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 48th International ACM SIGIR Conference on Research and Development in Information Retrieval","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/3726302.3730077","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3726302.3730077","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3726302.3730077","source":null,"license":"cc-by-nc","license_id":"https://openalex.org/licenses/cc-by-nc","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 48th International ACM SIGIR Conference on Research and Development in Information Retrieval","raw_type":"proceedings-article"},"sustainable_development_goals":[{"score":0.47999998927116394,"id":"https://metadata.un.org/sdg/10","display_name":"Reduced inequalities"}],"awards":[],"funders":[{"id":"https://openalex.org/F4320322598","display_name":"Hong Kong Polytechnic University","ror":"https://ror.org/0030zas98"}],"has_content":{"pdf":true,"grobid_xml":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4412376905.pdf","grobid_xml":"https://content.openalex.org/works/W4412376905.grobid-xml"},"referenced_works_count":29,"referenced_works":["https://openalex.org/W1530404542","https://openalex.org/W1861492603","https://openalex.org/W2185175083","https://openalex.org/W2610930722","https://openalex.org/W2765440071","https://openalex.org/W2883311563","https://openalex.org/W2962964995","https://openalex.org/W2967957126","https://openalex.org/W2981852735","https://openalex.org/W2988823324","https://openalex.org/W3010277541","https://openalex.org/W3035454331","https://openalex.org/W3092820619","https://openalex.org/W3118694826","https://openalex.org/W3155230099","https://openalex.org/W3175888430","https://openalex.org/W3180355996","https://openalex.org/W4236965008","https://openalex.org/W4288089799","https://openalex.org/W4312974539","https://openalex.org/W4367060669","https://openalex.org/W4385571319","https://openalex.org/W4385934203","https://openalex.org/W4387967913","https://openalex.org/W4387968117","https://openalex.org/W4387969378","https://openalex.org/W4388191759","https://openalex.org/W4402671616","https://openalex.org/W4404600716"],"related_works":["https://openalex.org/W4388335561","https://openalex.org/W2970530566","https://openalex.org/W4288261899","https://openalex.org/W1986902711","https://openalex.org/W2396760013","https://openalex.org/W2148433556","https://openalex.org/W2171776552","https://openalex.org/W98391849","https://openalex.org/W1600907701","https://openalex.org/W2726741344"],"abstract_inverted_index":{"Text-to-image":[0],"retrieval":[1,74,113,146],"is":[2,52],"a":[3,16,32,77],"fundamental":[4],"task":[5,14,75],"in":[6,31,41,46],"multimedia":[7],"retrieval.Traditional":[8],"studies":[9],"have":[10],"typically":[11],"approached":[12],"this":[13,60,96],"as":[15],"discriminative":[17,150],"problem,":[18,80],"matching":[19],"the":[20,25,49,72,112,116,130,138,145,154,162,171,178,191],"text":[21],"and":[22,68,107,133,144,183],"image":[23],"via":[24],"cross-attention":[26],"mechanism":[27],"(one-tower":[28],"framework)":[29],"or":[30],"common":[33],"embedding":[34],"space":[35],"(two-tower":[36],"framework).The":[37],"one-tower":[38],"framework":[39,51],"excels":[40],"effectiveness":[42,67,88,181],"but":[43,54],"falls":[44],"short":[45],"efficiency,":[47],"whereas":[48],"two-tower":[50,173],"efficient":[53],"struggles":[55],"to":[56,64,86,136,152],"maintain":[57],"competitive":[58],"effectiveness.In":[59],"study,":[61],"we":[62,118],"aim":[63],"enhance":[65],"both":[66,129],"efficiency":[69,186],"by":[70],"transforming":[71],"text-to-image":[73],"into":[76,124],"token-to-voken":[78,158],"generation":[79],"where":[81],"fine-grained":[82],"interactions":[83],"are":[84,167],"incorporated":[85],"improve":[87],"while":[89,126],"maintaining":[90],"high":[91],"efficiency.Despite":[92],"its":[93],"potential":[94],"advantages,":[95],"paradigm":[97,165],"shift":[98],"presents":[99],"significant":[100],"challenges:":[101],"1)":[102],"misalignment":[103],"with":[104,128,170],"high-level":[105,134],"semantics":[106],"2)":[108],"learning":[109,139,155],"gap":[110,140],"towards":[111],"target.To":[114],"address":[115],"challenges,":[117],"propose":[119],"AVG,":[120],"which":[121],"discretizes":[122],"images":[123],"vokens":[125],"aligning":[127],"visual":[131],"information":[132],"semantics.Additionally,":[135],"bridge":[137],"between":[141],"generative":[142],"training":[143,151],"target,":[147],"AVG":[148,176],"incorporates":[149],"modify":[153],"direction":[156],"during":[157],"training.Experiments":[159],"demonstrate":[160],"that":[161],"benefits":[163],"of":[164],"innovation":[166],"realized:":[168],"compared":[169],"classical":[172],"method,":[174],"CLIP,":[175],"achieves":[177],"7.53%":[179],"relative":[180],"improvement":[182],"also":[184],"4":[185],"improvement.We":[187],"release":[188],"code":[189],"at":[190],"GitHub":[192],"repository.":[193]},"counts_by_year":[{"year":2025,"cited_by_count":4}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
