{"id":"https://openalex.org/W7084090305","doi":"https://doi.org/10.48550/arxiv.2509.26330","title":"SQUARE: Semantic Query-Augmented Fusion and Efficient Batch Reranking for Training-free Zero-Shot Composed Image Retrieval","display_name":"SQUARE: Semantic Query-Augmented Fusion and Efficient Batch Reranking for Training-free Zero-Shot Composed Image Retrieval","publication_year":2025,"publication_date":"2025-09-30","ids":{"openalex":"https://openalex.org/W7084090305","doi":"https://doi.org/10.48550/arxiv.2509.26330"},"language":"en","primary_location":{"id":"doi:10.48550/arxiv.2509.26330","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2509.26330","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2509.26330","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Wu, Ren-Di","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Wu, Ren-Di","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Lin, Yu-Yen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lin, Yu-Yen","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":null,"display_name":"Yang, Huei-Fang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Huei-Fang","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":3,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":true,"primary_topic":{"id":"https://openalex.org/T12157","display_name":"Geochemistry and Geologic Mapping","score":0.4284000098705292,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12157","display_name":"Geochemistry and Geologic Mapping","score":0.4284000098705292,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13177","display_name":"Geological and Geophysical Studies","score":0.044199999421834946,"subfield":{"id":"https://openalex.org/subfields/1907","display_name":"Geology"},"field":{"id":"https://openalex.org/fields/19","display_name":"Earth and Planetary Sciences"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13067","display_name":"Geological Modeling and Analysis","score":0.030799999833106995,"subfield":{"id":"https://openalex.org/subfields/1906","display_name":"Geochemistry and Petrology"},"field":{"id":"https://openalex.org/fields/19","display_name":"Earth and Planetary Sciences"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/image-retrieval","display_name":"Image retrieval","score":0.6323000192642212},{"id":"https://openalex.org/keywords/embedding","display_name":"Embedding","score":0.6025999784469604},{"id":"https://openalex.org/keywords/image","display_name":"Image (mathematics)","score":0.4948999881744385},{"id":"https://openalex.org/keywords/visual-word","display_name":"Visual Word","score":0.40849998593330383},{"id":"https://openalex.org/keywords/fusion","display_name":"Fusion","score":0.37599998712539673},{"id":"https://openalex.org/keywords/image-fusion","display_name":"Image fusion","score":0.3758000135421753},{"id":"https://openalex.org/keywords/grid","display_name":"Grid","score":0.3463999927043915},{"id":"https://openalex.org/keywords/simplicity","display_name":"Simplicity","score":0.34130001068115234}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8707000017166138},{"id":"https://openalex.org/C1667742","wikidata":"https://www.wikidata.org/wiki/Q10927554","display_name":"Image retrieval","level":3,"score":0.6323000192642212},{"id":"https://openalex.org/C41608201","wikidata":"https://www.wikidata.org/wiki/Q980509","display_name":"Embedding","level":2,"score":0.6025999784469604},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5357000231742859},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.5105000138282776},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.4948999881744385},{"id":"https://openalex.org/C189391414","wikidata":"https://www.wikidata.org/wiki/Q7936579","display_name":"Visual Word","level":4,"score":0.40849998593330383},{"id":"https://openalex.org/C158525013","wikidata":"https://www.wikidata.org/wiki/Q2593739","display_name":"Fusion","level":2,"score":0.37599998712539673},{"id":"https://openalex.org/C69744172","wikidata":"https://www.wikidata.org/wiki/Q860822","display_name":"Image fusion","level":3,"score":0.3758000135421753},{"id":"https://openalex.org/C187691185","wikidata":"https://www.wikidata.org/wiki/Q2020720","display_name":"Grid","level":2,"score":0.3463999927043915},{"id":"https://openalex.org/C2776372474","wikidata":"https://www.wikidata.org/wiki/Q508291","display_name":"Simplicity","level":2,"score":0.34130001068115234},{"id":"https://openalex.org/C18555067","wikidata":"https://www.wikidata.org/wiki/Q8375051","display_name":"Joint (building)","level":2,"score":0.31619998812675476},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.3100000023841858},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.3005000054836273},{"id":"https://openalex.org/C99016210","wikidata":"https://www.wikidata.org/wiki/Q5488129","display_name":"Query expansion","level":2,"score":0.2888000011444092},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.28519999980926514},{"id":"https://openalex.org/C44291984","wikidata":"https://www.wikidata.org/wiki/Q1074173","display_name":"Question answering","level":2,"score":0.2800000011920929},{"id":"https://openalex.org/C161156560","wikidata":"https://www.wikidata.org/wiki/Q1638872","display_name":"Document retrieval","level":2,"score":0.2799000144004822},{"id":"https://openalex.org/C2777212361","wikidata":"https://www.wikidata.org/wiki/Q5127848","display_name":"Class (philosophy)","level":2,"score":0.2667999863624573},{"id":"https://openalex.org/C2779532271","wikidata":"https://www.wikidata.org/wiki/Q445558","display_name":"Relevance feedback","level":4,"score":0.2637999951839447},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.26249998807907104},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.2614000141620636},{"id":"https://openalex.org/C75165309","wikidata":"https://www.wikidata.org/wiki/Q2258979","display_name":"Search engine indexing","level":2,"score":0.259799987077713}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2509.26330","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2509.26330","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2509.26330","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2509.26330","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Composed":[0],"Image":[1],"Retrieval":[2],"(CIR)":[3],"aims":[4],"to":[5,64,104,133],"retrieve":[6],"target":[7,93],"images":[8],"that":[9,57,159],"preserve":[10],"the":[11,68,76,92,102,107,116,134],"visual":[12,131],"content":[13],"of":[14,91],"a":[15,52,81,149],"reference":[16],"image":[17,128],"while":[18],"incorporating":[19],"user-specified":[20],"textual":[21],"modifications.":[22],"Training-free":[23],"zero-shot":[24],"CIR":[25,172],"(ZS-CIR)":[26],"approaches,":[27],"which":[28,136],"require":[29],"no":[30],"task-specific":[31],"training":[32],"or":[33],"labeled":[34],"data,":[35],"are":[36,124],"highly":[37],"desirable,":[38],"yet":[39],"accurately":[40],"capturing":[41],"user":[42],"intent":[43,109],"remains":[44],"challenging.":[45],"In":[46,67,115],"this":[47],"paper,":[48],"we":[49,74],"present":[50],"SQUARE,":[51,160],"novel":[53],"two-stage":[54],"training-free":[55],"framework":[56],"leverages":[58],"Multimodal":[59],"Large":[60],"Language":[61],"Models":[62],"(MLLMs)":[63],"enhance":[65],"ZS-CIR.":[66],"Semantic":[69],"Query-Augmented":[70],"Fusion":[71],"(SQAF)":[72],"stage,":[73,121],"enrich":[75],"query":[77,103],"embedding":[78],"derived":[79],"from":[80],"vision-language":[82],"model":[83],"(VLM)":[84],"such":[85],"as":[86,126],"CLIP":[87],"with":[88,130,161,180],"MLLM-generated":[89],"captions":[90,96],"image.":[94],"These":[95],"provide":[97],"high-level":[98],"semantic":[99],"guidance,":[100],"enabling":[101],"better":[105],"capture":[106],"user's":[108],"and":[110,152,164],"improve":[111],"global":[112],"retrieval":[113],"quality.":[114],"Efficient":[117],"Batch":[118],"Reranking":[119],"(EBR)":[120],"top-ranked":[122],"candidates":[123],"presented":[125],"an":[127],"grid":[129],"marks":[132],"MLLM,":[135],"performs":[137],"joint":[138],"visual-semantic":[139],"reasoning":[140],"across":[141],"all":[142],"candidates.":[143],"Our":[144],"reranking":[145],"strategy":[146],"operates":[147],"in":[148],"single":[150],"pass":[151],"yields":[153],"more":[154],"accurate":[155],"rankings.":[156],"Experiments":[157],"show":[158],"its":[162,184],"simplicity":[163],"effectiveness,":[165],"delivers":[166],"strong":[167],"performance":[168,178],"on":[169],"four":[170],"standard":[171],"benchmarks.":[173],"Notably,":[174],"it":[175],"maintains":[176],"high":[177],"even":[179],"lightweight":[181],"pre-trained,":[182],"demonstrating":[183],"potential":[185],"applicability.":[186]},"counts_by_year":[],"updated_date":"2025-11-06T06:51:31.235846","created_date":"2025-10-10T00:00:00"}
