{"id":"https://openalex.org/W7123360606","doi":"https://doi.org/10.1109/tip.2025.3644175","title":"RAR: Retrieving and Ranking Augmented MLLMs for Visual Recognition","display_name":"RAR: Retrieving and Ranking Augmented MLLMs for Visual Recognition","publication_year":2026,"publication_date":"2026-01-01","ids":{"openalex":"https://openalex.org/W7123360606","doi":"https://doi.org/10.1109/tip.2025.3644175","pmid":"https://pubmed.ncbi.nlm.nih.gov/41525633"},"language":"en","primary_location":{"id":"doi:10.1109/tip.2025.3644175","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tip.2025.3644175","pdf_url":null,"source":{"id":"https://openalex.org/S4210173141","display_name":"IEEE Transactions on Image Processing","issn_l":"1057-7149","issn":["1057-7149","1941-0042"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Image Processing","raw_type":"journal-article"},"type":"article","indexed_in":["crossref","pubmed"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100350070","display_name":"Ziyu Liu","orcid":"https://orcid.org/0009-0005-4103-7824"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Ziyu Liu","raw_affiliation_strings":["Shanghai Jiao Tong University, Shanghai, China"],"affiliations":[{"raw_affiliation_string":"Shanghai Jiao Tong University, Shanghai, China","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103086415","display_name":"Zeyi Sun","orcid":"https://orcid.org/0009-0008-4264-4281"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zeyi Sun","raw_affiliation_strings":["Shanghai Jiao Tong University, Shanghai, China"],"affiliations":[{"raw_affiliation_string":"Shanghai Jiao Tong University, Shanghai, China","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5122852095","display_name":"Yuhang Zang","orcid":null},"institutions":[{"id":"https://openalex.org/I4391012619","display_name":"Shanghai Artificial Intelligence Laboratory","ror":"https://ror.org/03wkvpx79","country_code":null,"type":"facility","lineage":["https://openalex.org/I4391012619"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yuhang Zang","raw_affiliation_strings":["Shanghai AI Laboratory, Shanghai, China"],"affiliations":[{"raw_affiliation_string":"Shanghai AI Laboratory, Shanghai, China","institution_ids":["https://openalex.org/I4391012619"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100317994","display_name":"Wei Li","orcid":"https://orcid.org/0000-0001-7015-7335"},"institutions":[{"id":"https://openalex.org/I4210115515","display_name":"Nanyang Institute of Technology","ror":"https://ror.org/0203c2755","country_code":"CN","type":"education","lineage":["https://openalex.org/I4210115515"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Wei Li","raw_affiliation_strings":["Nanyang Technological University, Nanyang, China"],"affiliations":[{"raw_affiliation_string":"Nanyang Technological University, Nanyang, China","institution_ids":["https://openalex.org/I4210115515"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5122895378","display_name":"Pan Zhang","orcid":null},"institutions":[{"id":"https://openalex.org/I4391012619","display_name":"Shanghai Artificial Intelligence Laboratory","ror":"https://ror.org/03wkvpx79","country_code":null,"type":"facility","lineage":["https://openalex.org/I4391012619"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Pan Zhang","raw_affiliation_strings":["Shanghai AI Laboratory, Shanghai, China"],"affiliations":[{"raw_affiliation_string":"Shanghai AI Laboratory, Shanghai, China","institution_ids":["https://openalex.org/I4391012619"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5055238399","display_name":"Xiaoyi Dong","orcid":"https://orcid.org/0000-0002-4654-835X"},"institutions":[{"id":"https://openalex.org/I4391012619","display_name":"Shanghai Artificial Intelligence Laboratory","ror":"https://ror.org/03wkvpx79","country_code":null,"type":"facility","lineage":["https://openalex.org/I4391012619"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xiaoyi Dong","raw_affiliation_strings":["Shanghai AI Laboratory, Shanghai, China"],"affiliations":[{"raw_affiliation_string":"Shanghai AI Laboratory, Shanghai, China","institution_ids":["https://openalex.org/I4391012619"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5005925571","display_name":"Yuanjun Xiong","orcid":"https://orcid.org/0000-0002-6391-4921"},"institutions":[{"id":"https://openalex.org/I110813580","display_name":"St. Andrews University","ror":"https://ror.org/00nb5xj61","country_code":"US","type":"education","lineage":["https://openalex.org/I110813580"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Yuanjun Xiong","raw_affiliation_strings":["MThreads Inc, Beijing, China"],"affiliations":[{"raw_affiliation_string":"MThreads Inc, Beijing, China","institution_ids":["https://openalex.org/I110813580"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5121132945","display_name":"Dahua Lin","orcid":null},"institutions":[{"id":"https://openalex.org/I177725633","display_name":"Chinese University of Hong Kong","ror":"https://ror.org/00t33hh48","country_code":"HK","type":"education","lineage":["https://openalex.org/I177725633"]}],"countries":["HK"],"is_corresponding":false,"raw_author_name":"Dahua Lin","raw_affiliation_strings":["The Chinese University of Hong Kong, Shatin, China"],"affiliations":[{"raw_affiliation_string":"The Chinese University of Hong Kong, Shatin, China","institution_ids":["https://openalex.org/I177725633"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5122879367","display_name":"Jiaqi Wang","orcid":null},"institutions":[{"id":"https://openalex.org/I4391012619","display_name":"Shanghai Artificial Intelligence Laboratory","ror":"https://ror.org/03wkvpx79","country_code":null,"type":"facility","lineage":["https://openalex.org/I4391012619"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jiaqi Wang","raw_affiliation_strings":["Shanghai AI Laboratory, Shanghai, China"],"affiliations":[{"raw_affiliation_string":"Shanghai AI Laboratory, Shanghai, China","institution_ids":["https://openalex.org/I4391012619"]}]}],"institutions":[],"countries_distinct_count":3,"institutions_distinct_count":9,"corresponding_author_ids":["https://openalex.org/A5100350070"],"corresponding_institution_ids":["https://openalex.org/I183067930"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.08063803,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":91,"max":95},"biblio":{"volume":"35","issue":null,"first_page":"388","last_page":"401"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.7181000113487244,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.7181000113487244,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.13040000200271606,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.030500000342726707,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/boosting","display_name":"Boosting (machine learning)","score":0.6248999834060669},{"id":"https://openalex.org/keywords/cognitive-neuroscience-of-visual-object-recognition","display_name":"Cognitive neuroscience of visual object recognition","score":0.6194999814033508},{"id":"https://openalex.org/keywords/ranking","display_name":"Ranking (information retrieval)","score":0.5952000021934509},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.5110999941825867},{"id":"https://openalex.org/keywords/context","display_name":"Context (archaeology)","score":0.46320000290870667},{"id":"https://openalex.org/keywords/focus","display_name":"Focus (optics)","score":0.45669999718666077},{"id":"https://openalex.org/keywords/noise","display_name":"Noise (video)","score":0.39010000228881836},{"id":"https://openalex.org/keywords/rank","display_name":"Rank (graph theory)","score":0.3785000145435333},{"id":"https://openalex.org/keywords/visualization","display_name":"Visualization","score":0.3693000078201294}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.814300000667572},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6733999848365784},{"id":"https://openalex.org/C46686674","wikidata":"https://www.wikidata.org/wiki/Q466303","display_name":"Boosting (machine learning)","level":2,"score":0.6248999834060669},{"id":"https://openalex.org/C64876066","wikidata":"https://www.wikidata.org/wiki/Q5141226","display_name":"Cognitive neuroscience of visual object recognition","level":3,"score":0.6194999814033508},{"id":"https://openalex.org/C189430467","wikidata":"https://www.wikidata.org/wiki/Q7293293","display_name":"Ranking (information retrieval)","level":2,"score":0.5952000021934509},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.5110999941825867},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.46320000290870667},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.4580000042915344},{"id":"https://openalex.org/C192209626","wikidata":"https://www.wikidata.org/wiki/Q190909","display_name":"Focus (optics)","level":2,"score":0.45669999718666077},{"id":"https://openalex.org/C99498987","wikidata":"https://www.wikidata.org/wiki/Q2210247","display_name":"Noise (video)","level":3,"score":0.39010000228881836},{"id":"https://openalex.org/C164226766","wikidata":"https://www.wikidata.org/wiki/Q7293202","display_name":"Rank (graph theory)","level":2,"score":0.3785000145435333},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.3693000078201294},{"id":"https://openalex.org/C86037889","wikidata":"https://www.wikidata.org/wiki/Q4330127","display_name":"Learning to rank","level":3,"score":0.35659998655319214},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.34700000286102295},{"id":"https://openalex.org/C2776151529","wikidata":"https://www.wikidata.org/wiki/Q3045304","display_name":"Object detection","level":3,"score":0.3312000036239624},{"id":"https://openalex.org/C2781238097","wikidata":"https://www.wikidata.org/wiki/Q175026","display_name":"Object (grammar)","level":2,"score":0.30649998784065247},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3043000102043152},{"id":"https://openalex.org/C95623464","wikidata":"https://www.wikidata.org/wiki/Q1096149","display_name":"Classifier (UML)","level":2,"score":0.30239999294281006},{"id":"https://openalex.org/C75294576","wikidata":"https://www.wikidata.org/wiki/Q5165192","display_name":"Contextual image classification","level":3,"score":0.29589998722076416},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.28299999237060547},{"id":"https://openalex.org/C183322885","wikidata":"https://www.wikidata.org/wiki/Q17007702","display_name":"Context model","level":3,"score":0.2741999924182892},{"id":"https://openalex.org/C116834253","wikidata":"https://www.wikidata.org/wiki/Q2039217","display_name":"Identification (biology)","level":2,"score":0.2711000144481659},{"id":"https://openalex.org/C121687571","wikidata":"https://www.wikidata.org/wiki/Q4677630","display_name":"Activity recognition","level":2,"score":0.2694999873638153},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.2646999955177307},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.2646999955177307},{"id":"https://openalex.org/C2778751112","wikidata":"https://www.wikidata.org/wiki/Q835016","display_name":"Window (computing)","level":2,"score":0.26269999146461487},{"id":"https://openalex.org/C1667742","wikidata":"https://www.wikidata.org/wiki/Q10927554","display_name":"Image retrieval","level":3,"score":0.2540000081062317}],"mesh":[],"locations_count":3,"locations":[{"id":"doi:10.1109/tip.2025.3644175","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tip.2025.3644175","pdf_url":null,"source":{"id":"https://openalex.org/S4210173141","display_name":"IEEE Transactions on Image Processing","issn_l":"1057-7149","issn":["1057-7149","1941-0042"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Image Processing","raw_type":"journal-article"},{"id":"pmid:41525633","is_oa":false,"landing_page_url":"https://pubmed.ncbi.nlm.nih.gov/41525633","pdf_url":null,"source":{"id":"https://openalex.org/S4306525036","display_name":"PubMed","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I1299303238","host_organization_name":"National Institutes of Health","host_organization_lineage":["https://openalex.org/I1299303238"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE transactions on image processing : a publication of the IEEE Signal Processing Society","raw_type":null},{"id":"pmh:oai:dr.ntu.edu.sg:10356/212437","is_oa":false,"landing_page_url":"https://hdl.handle.net/10356/212437","pdf_url":null,"source":{"id":"https://openalex.org/S4306402609","display_name":"DR-NTU (Nanyang Technological University)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I172675005","host_organization_name":"Nanyang Technological University","host_organization_lineage":["https://openalex.org/I172675005"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Journal Article"}],"best_oa_location":null,"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","score":0.8548775911331177,"display_name":"Quality Education"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":32,"referenced_works":["https://openalex.org/W1977295328","https://openalex.org/W2017814585","https://openalex.org/W2047643928","https://openalex.org/W2081580037","https://openalex.org/W2108598243","https://openalex.org/W2138011018","https://openalex.org/W2155904486","https://openalex.org/W2533598788","https://openalex.org/W2738672149","https://openalex.org/W2948672349","https://openalex.org/W2963469388","https://openalex.org/W2964194231","https://openalex.org/W3198377975","https://openalex.org/W4226048133","https://openalex.org/W4285192809","https://openalex.org/W4287887100","https://openalex.org/W4310557340","https://openalex.org/W4312420092","https://openalex.org/W4312424618","https://openalex.org/W4312563428","https://openalex.org/W4312956471","https://openalex.org/W4386075561","https://openalex.org/W4386075997","https://openalex.org/W4386076084","https://openalex.org/W4386076522","https://openalex.org/W4386289268","https://openalex.org/W4386790226","https://openalex.org/W4390871915","https://openalex.org/W4390873481","https://openalex.org/W4390874207","https://openalex.org/W4415124086","https://openalex.org/W4415795395"],"related_works":[],"abstract_inverted_index":{"CLIP":[0,122],"(Contrastive":[1],"Language-Image":[2],"Pre-training)":[3],"uses":[4,150],"contrastive":[5],"learning":[6],"from":[7,52,146],"noise":[8],"image-text":[9],"pairs":[10],"to":[11,48,71,123,152],"excel":[12,42],"at":[13,43],"recognizing":[14],"a":[15,106,117,183,193],"wide":[16],"array":[17],"of":[18,60,76,85,185],"candidates,":[19],"yet":[20],"its":[21],"focus":[22],"on":[23,54,121,198],"broad":[24],"associations":[25],"hinders":[26],"the":[27,58,83,90,133,141,147,156,165,174,210,216],"precision":[28],"in":[29,66,168,196],"distinguishing":[30],"subtle":[31],"differences":[32],"among":[33],"fine-grained":[34,45,100,169,200],"items.":[35],"Conversely,":[36],"Multimodal":[37],"Large":[38],"Language":[39],"Models":[40],"(MLLMs)":[41],"classifying":[44],"categories,":[46],"thanks":[47],"their":[49],"substantial":[50],"knowledge":[51,177],"pre-training":[53],"web-level":[55],"corpora.":[56],"However,":[57],"performance":[59,197],"MLLMs":[61,151],"declines":[62],"with":[63],"an":[64],"increase":[65],"category":[67],"numbers,":[68],"primarily":[69],"due":[70],"growing":[72],"complexity":[73],"and":[74,88,99,125,149,154,209],"constraints":[75],"limited":[77],"context":[78,135],"window":[79],"size.":[80],"To":[81],"synergize":[82],"strengths":[84],"both":[86],"approaches":[87],"enhance":[89],"few-shot/zero-shot":[91],"recognition":[92,170,187,202,207,218],"abilities":[93],"for":[94,112,129],"datasets":[95,214],"characterized":[96],"by":[97],"extensive":[98],"vocabularies,":[101],"this":[102],"paper":[103],"introduces":[104],"RAR,":[105],"Retrieving":[107],"And":[108],"Ranking":[109],"augmented":[110],"method":[111],"MLLMs.":[113],"We":[114],"initially":[115],"establish":[116],"multi-modal":[118],"retriever":[119],"based":[120],"create":[124],"store":[126],"explicit":[127],"memory":[128,148],"different":[130],"categories":[131],"beyond":[132],"immediate":[134],"window.":[136],"During":[137],"inference,":[138],"RAR":[139],"retrieves":[140],"top-":[142],"$k$":[143],"similar":[144],"results":[145],"rank":[153],"make":[155],"final":[157],"predictions.":[158],"Our":[159],"proposed":[160],"approach":[161,191],"not":[162],"only":[163],"addresses":[164],"inherent":[166],"limitations":[167],"but":[171],"also":[172],"preserves":[173],"model's":[175],"comprehensive":[176],"base,":[178],"significantly":[179],"boosting":[180],"accuracy":[181],"across":[182],"range":[184],"vision-language":[186],"tasks.":[188],"Notably,":[189],"our":[190],"demonstrates":[192],"significant":[194],"improvement":[195],"5":[199],"visual":[201],"benchmarks,":[203],"11":[204],"few-shot":[205],"image":[206],"datasets,":[208],"2":[211],"object":[212],"detection":[213],"under":[215],"zero-shot":[217],"setting.":[219]},"counts_by_year":[{"year":2025,"cited_by_count":1}],"updated_date":"2026-04-04T08:04:53.788161","created_date":"2026-01-14T00:00:00"}
