{"id":"https://openalex.org/W4407900581","doi":"https://doi.org/10.1109/tmm.2025.3543066","title":"Multi-Modal Reference Learning for Fine-Grained Text-to-Image Retrieval","display_name":"Multi-Modal Reference Learning for Fine-Grained Text-to-Image Retrieval","publication_year":2025,"publication_date":"2025-01-01","ids":{"openalex":"https://openalex.org/W4407900581","doi":"https://doi.org/10.1109/tmm.2025.3543066"},"language":"en","primary_location":{"id":"doi:10.1109/tmm.2025.3543066","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tmm.2025.3543066","pdf_url":null,"source":{"id":"https://openalex.org/S137030581","display_name":"IEEE Transactions on Multimedia","issn_l":"1520-9210","issn":["1520-9210","1941-0077"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Multimedia","raw_type":"journal-article"},"type":"article","indexed_in":["arxiv","crossref"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2504.07718","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5023181336","display_name":"Zehong Ma","orcid":"https://orcid.org/0009-0005-1533-2651"},"institutions":[{"id":"https://openalex.org/I20231570","display_name":"Peking University","ror":"https://ror.org/02v51f717","country_code":"CN","type":"education","lineage":["https://openalex.org/I20231570"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Zehong Ma","raw_affiliation_strings":["State Key Laboratory of Multimedia Information Processing, School of Computer Science, Peking University, Beijing, China"],"affiliations":[{"raw_affiliation_string":"State Key Laboratory of Multimedia Information Processing, School of Computer Science, Peking University, Beijing, China","institution_ids":["https://openalex.org/I20231570"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100353581","display_name":"Hao Chen","orcid":"https://orcid.org/0000-0002-6853-3298"},"institutions":[{"id":"https://openalex.org/I20231570","display_name":"Peking University","ror":"https://ror.org/02v51f717","country_code":"CN","type":"education","lineage":["https://openalex.org/I20231570"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Hao Chen","raw_affiliation_strings":["State Key Laboratory of Multimedia Information Processing, School of Computer Science, Peking University, Beijing, China"],"affiliations":[{"raw_affiliation_string":"State Key Laboratory of Multimedia Information Processing, School of Computer Science, Peking University, Beijing, China","institution_ids":["https://openalex.org/I20231570"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5112122358","display_name":"Wei Zeng","orcid":"https://orcid.org/0009-0007-2337-3255"},"institutions":[{"id":"https://openalex.org/I20231570","display_name":"Peking University","ror":"https://ror.org/02v51f717","country_code":"CN","type":"education","lineage":["https://openalex.org/I20231570"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Wei Zeng","raw_affiliation_strings":["State Key Laboratory of Multimedia Information Processing, School of Computer Science, Peking University, Beijing, China"],"affiliations":[{"raw_affiliation_string":"State Key Laboratory of Multimedia Information Processing, School of Computer Science, Peking University, Beijing, China","institution_ids":["https://openalex.org/I20231570"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101642467","display_name":"Limin Su","orcid":"https://orcid.org/0000-0002-4693-5237"},"institutions":[{"id":"https://openalex.org/I114234892","display_name":"Beijing Union University","ror":"https://ror.org/01hg31662","country_code":"CN","type":"education","lineage":["https://openalex.org/I114234892"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Limin Su","raw_affiliation_strings":["Beijing Union University, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Beijing Union University, Beijing, China","institution_ids":["https://openalex.org/I114234892"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5055433405","display_name":"Shiliang Zhang","orcid":"https://orcid.org/0000-0001-9053-9314"},"institutions":[{"id":"https://openalex.org/I20231570","display_name":"Peking University","ror":"https://ror.org/02v51f717","country_code":"CN","type":"education","lineage":["https://openalex.org/I20231570"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Shiliang Zhang","raw_affiliation_strings":["State Key Laboratory of Multimedia Information Processing, School of Computer Science, Peking University, Beijing, China"],"affiliations":[{"raw_affiliation_string":"State Key Laboratory of Multimedia Information Processing, School of Computer Science, Peking University, Beijing, China","institution_ids":["https://openalex.org/I20231570"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5023181336"],"corresponding_institution_ids":["https://openalex.org/I20231570"],"apc_list":null,"apc_paid":null,"fwci":7.9799,"has_fulltext":true,"cited_by_count":6,"citation_normalized_percentile":{"value":0.97222194,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":97,"max":99},"biblio":{"volume":"27","issue":null,"first_page":"5009","last_page":"5022"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.9979000091552734,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.9979000091552734,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10824","display_name":"Image Retrieval and Classification Techniques","score":0.9970999956130981,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9909999966621399,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8898592591285706},{"id":"https://openalex.org/keywords/image-retrieval","display_name":"Image retrieval","score":0.6551980972290039},{"id":"https://openalex.org/keywords/modal","display_name":"Modal","score":0.5725822448730469},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.5690160393714905},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.5182880759239197},{"id":"https://openalex.org/keywords/image","display_name":"Image (mathematics)","score":0.44034868478775024},{"id":"https://openalex.org/keywords/computer-vision","display_name":"Computer vision","score":0.38090670108795166},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.32599541544914246}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8898592591285706},{"id":"https://openalex.org/C1667742","wikidata":"https://www.wikidata.org/wiki/Q10927554","display_name":"Image retrieval","level":3,"score":0.6551980972290039},{"id":"https://openalex.org/C71139939","wikidata":"https://www.wikidata.org/wiki/Q910194","display_name":"Modal","level":2,"score":0.5725822448730469},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.5690160393714905},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5182880759239197},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.44034868478775024},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.38090670108795166},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.32599541544914246},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.0},{"id":"https://openalex.org/C188027245","wikidata":"https://www.wikidata.org/wiki/Q750446","display_name":"Polymer chemistry","level":1,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1109/tmm.2025.3543066","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tmm.2025.3543066","pdf_url":null,"source":{"id":"https://openalex.org/S137030581","display_name":"IEEE Transactions on Multimedia","issn_l":"1520-9210","issn":["1520-9210","1941-0077"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Multimedia","raw_type":"journal-article"},{"id":"pmh:oai:arXiv.org:2504.07718","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2504.07718","pdf_url":"https://arxiv.org/pdf/2504.07718","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2504.07718","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2504.07718","pdf_url":"https://arxiv.org/pdf/2504.07718","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[{"display_name":"Quality Education","score":0.41999998688697815,"id":"https://metadata.un.org/sdg/4"}],"awards":[{"id":"https://openalex.org/G3637388763","display_name":null,"funder_award_id":"U20B2052","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G3854062921","display_name":null,"funder_award_id":"62402013","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G4828104523","display_name":null,"funder_award_id":"2023M730056","funder_id":"https://openalex.org/F4320321543","funder_display_name":"China Postdoctoral Science Foundation"},{"id":"https://openalex.org/G6658023004","display_name":null,"funder_award_id":"61936011","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G7148651236","display_name":null,"funder_award_id":"62236006","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"}],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"},{"id":"https://openalex.org/F4320321543","display_name":"China Postdoctoral Science Foundation","ror":"https://ror.org/0426zh255"}],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4407900581.pdf","grobid_xml":"https://content.openalex.org/works/W4407900581.grobid-xml"},"referenced_works_count":58,"referenced_works":["https://openalex.org/W1905882502","https://openalex.org/W1957706851","https://openalex.org/W2194775991","https://openalex.org/W2204750386","https://openalex.org/W2398118205","https://openalex.org/W2533598788","https://openalex.org/W2605102252","https://openalex.org/W2731757110","https://openalex.org/W2883311563","https://openalex.org/W2884366600","https://openalex.org/W2885709146","https://openalex.org/W2894786240","https://openalex.org/W2896457183","https://openalex.org/W2947319827","https://openalex.org/W2962964995","https://openalex.org/W2963047834","https://openalex.org/W2963882743","https://openalex.org/W2967515867","https://openalex.org/W2985951359","https://openalex.org/W2997421053","https://openalex.org/W3005530336","https://openalex.org/W3015686580","https://openalex.org/W3034202663","https://openalex.org/W3034727830","https://openalex.org/W3088460296","https://openalex.org/W3095440956","https://openalex.org/W3106778652","https://openalex.org/W3120797281","https://openalex.org/W3125650101","https://openalex.org/W3165835426","https://openalex.org/W3196835757","https://openalex.org/W3205604798","https://openalex.org/W3206868111","https://openalex.org/W3210554596","https://openalex.org/W4212836813","https://openalex.org/W4220813980","https://openalex.org/W4225159440","https://openalex.org/W4285214376","https://openalex.org/W4294170691","https://openalex.org/W4297821751","https://openalex.org/W4297833440","https://openalex.org/W4304087170","https://openalex.org/W4312359729","https://openalex.org/W4312998013","https://openalex.org/W4313144963","https://openalex.org/W4321231469","https://openalex.org/W4378697123","https://openalex.org/W4385895960","https://openalex.org/W4385934203","https://openalex.org/W4387448536","https://openalex.org/W4387968135","https://openalex.org/W4388145471","https://openalex.org/W4389331836","https://openalex.org/W4389934400","https://openalex.org/W4393252676","https://openalex.org/W4402727545","https://openalex.org/W4402754290","https://openalex.org/W4411245246"],"related_works":["https://openalex.org/W2379392295","https://openalex.org/W3160965418","https://openalex.org/W613940353","https://openalex.org/W2320915480","https://openalex.org/W2362990116","https://openalex.org/W2381300099","https://openalex.org/W2714992399","https://openalex.org/W2383812217","https://openalex.org/W2326515389","https://openalex.org/W2491005386"],"abstract_inverted_index":{"Fine-grained":[0],"text-to-image":[1,155,160],"retrieval":[2,102,146,156,161,179],"aims":[3],"to":[4,38,46,64,76,113,117,137],"retrieve":[5],"a":[6,11,59,71,88,106,128,139],"fine-grained":[7,154],"target":[8],"image":[9,22,178],"with":[10],"given":[12],"text":[13,55],"query.":[14],"Existing":[15],"methods":[16],"typically":[17],"assume":[18],"that":[19,132,142],"each":[20],"training":[21],"is":[23,111],"accurately":[24],"depicted":[25],"by":[26,194],"its":[27],"textual":[28,31,81,123],"descriptions.":[29],"However,":[30],"descriptions":[32],"can":[33],"be":[34],"ambiguous":[35],"and":[36,80,101,122],"fail":[37],"depict":[39],"discriminative":[40],"visual":[41,79,121],"details":[42,82],"in":[43],"images,":[44],"leading":[45],"inaccurate":[47],"representation":[48,99,108],"learning.":[49],"To":[50],"alleviate":[51],"the":[52,84,97,134,144,176,185,191],"effects":[53],"of":[54,83,188],"ambiguity,":[56],"we":[57,126],"propose":[58,70],"Multi-Modal":[60],"Reference":[61],"learning":[62,100,109],"framework":[63],"learn":[65,118],"robust":[66],"representations.":[67,124],"We":[68],"first":[69],"multi-modal":[72,90,93,115],"reference":[73,94],"construction":[74],"module":[75,110],"aggregate":[77],"all":[78],"same":[85],"object":[86,135],"into":[87],"comprehensive":[89],"reference.":[91],"The":[92,163],"hence":[95],"facilitates":[96],"subsequent":[98],"similarity":[103,141],"computation.":[104],"Specifically,":[105],"reference-guided":[107],"proposed":[112,164],"use":[114],"references":[116,136],"more":[119],"accurate":[120],"Additionally,":[125],"introduce":[127],"reference-based":[129,140],"refinement":[130],"method":[131,165,183],"employs":[133],"compute":[138],"refines":[143],"initial":[145],"results.":[147],"Extensive":[148],"experiments":[149],"are":[150],"conducted":[151],"on":[152,175],"five":[153],"datasets":[157],"for":[158],"different":[159],"tasks.":[162],"has":[166],"achieved":[167],"superior":[168],"performance":[169],"over":[170],"state-of-the-art":[171],"methods.":[172],"For":[173],"instance,":[174],"text-to-person":[177],"dataset":[180],"RSTPReid,":[181],"our":[182],"achieves":[184],"Rank1":[186],"accuracy":[187],"56.2%,":[189],"surpassing":[190],"recent":[192],"CFine":[193],"5.6%.":[195]},"counts_by_year":[{"year":2026,"cited_by_count":2},{"year":2025,"cited_by_count":4}],"updated_date":"2026-03-20T23:20:44.827607","created_date":"2025-10-10T00:00:00"}
