{"id":"https://openalex.org/W4304084237","doi":"https://doi.org/10.1145/3503161.3548107","title":"Token Embeddings Alignment for Cross-Modal Retrieval","display_name":"Token Embeddings Alignment for Cross-Modal Retrieval","publication_year":2022,"publication_date":"2022-10-10","ids":{"openalex":"https://openalex.org/W4304084237","doi":"https://doi.org/10.1145/3503161.3548107"},"language":"en","primary_location":{"id":"doi:10.1145/3503161.3548107","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3503161.3548107","pdf_url":null,"source":{"id":"https://openalex.org/S4363608757","display_name":"Proceedings of the 30th ACM International Conference on Multimedia","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 30th ACM International Conference on Multimedia","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5040830437","display_name":"Chen-Wei Xie","orcid":null},"institutions":[{"id":"https://openalex.org/I45928872","display_name":"Alibaba Group (China)","ror":"https://ror.org/00k642b80","country_code":"CN","type":"company","lineage":["https://openalex.org/I45928872"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Chen-Wei Xie","raw_affiliation_strings":["Alibaba Group, Hangzhou, China"],"affiliations":[{"raw_affiliation_string":"Alibaba Group, Hangzhou, China","institution_ids":["https://openalex.org/I45928872"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5020280128","display_name":"Jianmin Wu","orcid":"https://orcid.org/0000-0002-0999-9194"},"institutions":[{"id":"https://openalex.org/I45928872","display_name":"Alibaba Group (China)","ror":"https://ror.org/00k642b80","country_code":"CN","type":"company","lineage":["https://openalex.org/I45928872"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jianmin Wu","raw_affiliation_strings":["Alibaba Group, Hangzhou, China"],"affiliations":[{"raw_affiliation_string":"Alibaba Group, Hangzhou, China","institution_ids":["https://openalex.org/I45928872"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5058744406","display_name":"Yun Zheng","orcid":"https://orcid.org/0000-0002-1612-7191"},"institutions":[{"id":"https://openalex.org/I45928872","display_name":"Alibaba Group (China)","ror":"https://ror.org/00k642b80","country_code":"CN","type":"company","lineage":["https://openalex.org/I45928872"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yun Zheng","raw_affiliation_strings":["Alibaba Group, Hangzhou, China"],"affiliations":[{"raw_affiliation_string":"Alibaba Group, Hangzhou, China","institution_ids":["https://openalex.org/I45928872"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100667153","display_name":"Pan Pan","orcid":"https://orcid.org/0000-0001-5828-0234"},"institutions":[{"id":"https://openalex.org/I45928872","display_name":"Alibaba Group (China)","ror":"https://ror.org/00k642b80","country_code":"CN","type":"company","lineage":["https://openalex.org/I45928872"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Pan Pan","raw_affiliation_strings":["Alibaba Group, Hangzhou, China"],"affiliations":[{"raw_affiliation_string":"Alibaba Group, Hangzhou, China","institution_ids":["https://openalex.org/I45928872"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5024965898","display_name":"Xian\u2010Sheng Hua","orcid":"https://orcid.org/0000-0002-8232-5049"},"institutions":[{"id":"https://openalex.org/I76130692","display_name":"Zhejiang University","ror":"https://ror.org/00a2xv884","country_code":"CN","type":"education","lineage":["https://openalex.org/I76130692"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xian-Sheng Hua","raw_affiliation_strings":["Zhejiang University, Hangzhou, China"],"affiliations":[{"raw_affiliation_string":"Zhejiang University, Hangzhou, China","institution_ids":["https://openalex.org/I76130692"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5040830437"],"corresponding_institution_ids":["https://openalex.org/I45928872"],"apc_list":null,"apc_paid":null,"fwci":0.8989,"has_fulltext":false,"cited_by_count":15,"citation_normalized_percentile":{"value":0.82582347,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":96,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"4555","last_page":"4563"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.9994000196456909,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.9983000159263611,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/security-token","display_name":"Security token","score":0.9063550233840942},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8296102285385132},{"id":"https://openalex.org/keywords/embedding","display_name":"Embedding","score":0.5998892188072205},{"id":"https://openalex.org/keywords/modal","display_name":"Modal","score":0.5960570573806763},{"id":"https://openalex.org/keywords/matching","display_name":"Matching (statistics)","score":0.5513012409210205},{"id":"https://openalex.org/keywords/visualization","display_name":"Visualization","score":0.5374069809913635},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.53532475233078},{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.5051105618476868},{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.48880669474601746},{"id":"https://openalex.org/keywords/construct","display_name":"Construct (python library)","score":0.4732193052768707},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.45510733127593994},{"id":"https://openalex.org/keywords/similarity","display_name":"Similarity (geometry)","score":0.4368613362312317},{"id":"https://openalex.org/keywords/block","display_name":"Block (permutation group theory)","score":0.4154309630393982},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.40162408351898193},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.331557959318161},{"id":"https://openalex.org/keywords/image","display_name":"Image (mathematics)","score":0.2567986845970154}],"concepts":[{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.9063550233840942},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8296102285385132},{"id":"https://openalex.org/C41608201","wikidata":"https://www.wikidata.org/wiki/Q980509","display_name":"Embedding","level":2,"score":0.5998892188072205},{"id":"https://openalex.org/C71139939","wikidata":"https://www.wikidata.org/wiki/Q910194","display_name":"Modal","level":2,"score":0.5960570573806763},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.5513012409210205},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.5374069809913635},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.53532475233078},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.5051105618476868},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.48880669474601746},{"id":"https://openalex.org/C2780801425","wikidata":"https://www.wikidata.org/wiki/Q5164392","display_name":"Construct (python library)","level":2,"score":0.4732193052768707},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.45510733127593994},{"id":"https://openalex.org/C103278499","wikidata":"https://www.wikidata.org/wiki/Q254465","display_name":"Similarity (geometry)","level":3,"score":0.4368613362312317},{"id":"https://openalex.org/C2777210771","wikidata":"https://www.wikidata.org/wiki/Q4927124","display_name":"Block (permutation group theory)","level":2,"score":0.4154309630393982},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.40162408351898193},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.331557959318161},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.2567986845970154},{"id":"https://openalex.org/C17744445","wikidata":"https://www.wikidata.org/wiki/Q36442","display_name":"Political science","level":0,"score":0.0},{"id":"https://openalex.org/C105795698","wikidata":"https://www.wikidata.org/wiki/Q12483","display_name":"Statistics","level":1,"score":0.0},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.0},{"id":"https://openalex.org/C165801399","wikidata":"https://www.wikidata.org/wiki/Q25428","display_name":"Voltage","level":2,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C2524010","wikidata":"https://www.wikidata.org/wiki/Q8087","display_name":"Geometry","level":1,"score":0.0},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.0},{"id":"https://openalex.org/C94625758","wikidata":"https://www.wikidata.org/wiki/Q7163","display_name":"Politics","level":2,"score":0.0},{"id":"https://openalex.org/C188027245","wikidata":"https://www.wikidata.org/wiki/Q750446","display_name":"Polymer chemistry","level":1,"score":0.0},{"id":"https://openalex.org/C199539241","wikidata":"https://www.wikidata.org/wiki/Q7748","display_name":"Law","level":1,"score":0.0},{"id":"https://openalex.org/C38652104","wikidata":"https://www.wikidata.org/wiki/Q3510521","display_name":"Computer security","level":1,"score":0.0},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.0},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3503161.3548107","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3503161.3548107","pdf_url":null,"source":{"id":"https://openalex.org/S4363608757","display_name":"Proceedings of the 30th ACM International Conference on Multimedia","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 30th ACM International Conference on Multimedia","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Quality Education","score":0.7300000190734863,"id":"https://metadata.un.org/sdg/4"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":26,"referenced_works":["https://openalex.org/W1861492603","https://openalex.org/W2108598243","https://openalex.org/W2250384498","https://openalex.org/W2277195237","https://openalex.org/W2606473278","https://openalex.org/W2886641317","https://openalex.org/W2892181857","https://openalex.org/W2962843773","https://openalex.org/W2963703197","https://openalex.org/W2991118492","https://openalex.org/W2997786945","https://openalex.org/W2998702515","https://openalex.org/W3090449556","https://openalex.org/W3091588028","https://openalex.org/W3094502228","https://openalex.org/W3097619042","https://openalex.org/W3135367836","https://openalex.org/W3156892778","https://openalex.org/W3166304536","https://openalex.org/W3170767867","https://openalex.org/W3174010726","https://openalex.org/W3176641147","https://openalex.org/W3177224328","https://openalex.org/W3177654849","https://openalex.org/W3184784418","https://openalex.org/W3202384916"],"related_works":["https://openalex.org/W4388335561","https://openalex.org/W2970530566","https://openalex.org/W4288261899","https://openalex.org/W4307309205","https://openalex.org/W2967478618","https://openalex.org/W4385009901","https://openalex.org/W4385572700","https://openalex.org/W2997152889","https://openalex.org/W2081900870","https://openalex.org/W4304700937"],"abstract_inverted_index":{"Cross-modal":[0],"retrieval":[1,117,169],"has":[2],"achieved":[3],"significant":[4],"progress":[5],"in":[6,143],"recent":[7],"years":[8],"with":[9],"the":[10,33,66,147],"help":[11],"of":[12,26],"token":[13,25],"embeddings":[14,35],"interaction":[15],"methods.":[16],"Most":[17],"existing":[18],"methods":[19,62],"first":[20,86],"extract":[21],"embedding":[22],"for":[23],"each":[24],"input":[27,56,104],"image":[28,57,105],"and":[29,58,70,91,106,123],"text,":[30],"then":[31,94],"feed":[32],"token-level":[34,96],"into":[36],"a":[37,42,78,137],"multi-modal":[38],"transformer":[39],"to":[40,51,99,128],"learn":[41],"joint":[43,46],"representation,":[44],"this":[45,74,158],"representation":[47],"can":[48],"be":[49],"used":[50,115],"predict":[52],"matching":[53,97],"score":[54],"between":[55,68,103],"text.":[59,107],"However,":[60],"these":[61],"don't":[63],"explicitly":[64,87],"supervise":[65],"alignment":[67],"visual":[69,89],"textual":[71,92],"tokens.":[72],"In":[73],"paper,":[75],"we":[76,124,135],"propose":[77],"novel":[79],"Token":[80],"Embeddings":[81],"AlignMent":[82],"(TEAM)":[83],"block,":[84],"it":[85,131],"aligns":[88],"tokens":[90],"tokens,":[93],"produces":[95],"scores":[98],"measure":[100],"fine-grained":[101],"similarity":[102],"TEAM":[108,120],"achieves":[109,163],"new":[110,138],"state-of-the-art":[111,164],"performance":[112,165],"on":[113,157,166],"commonly":[114],"cross-modal":[116,168],"benchmarks.":[118,170],"Moreover,":[119],"is":[121,146],"interpretable":[122],"provide":[125],"visualization":[126],"experiments":[127],"show":[129],"how":[130],"works.":[132],"At":[133],"last,":[134],"construct":[136],"billion-scale":[139],"vision-language":[140,150],"pre-training":[141,151,156],"dataset":[142,152],"Chinese,":[144],"which":[145],"largest":[148],"Chinese":[149,167],"so":[153],"far.":[154],"After":[155],"dataset,":[159],"our":[160],"framework":[161],"also":[162]},"counts_by_year":[{"year":2025,"cited_by_count":5},{"year":2024,"cited_by_count":7},{"year":2023,"cited_by_count":3}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
