{"id":"https://openalex.org/W4404318803","doi":"https://doi.org/10.48550/arxiv.2410.21220","title":"Vision Search Assistant: Empower Vision-Language Models as Multimodal Search Engines","display_name":"Vision Search Assistant: Empower Vision-Language Models as Multimodal Search Engines","publication_year":2024,"publication_date":"2024-10-28","ids":{"openalex":"https://openalex.org/W4404318803","doi":"https://doi.org/10.48550/arxiv.2410.21220"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2410.21220","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2410.21220","pdf_url":"https://arxiv.org/pdf/2410.21220","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2410.21220","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100643814","display_name":"Zhixin Zhang","orcid":"https://orcid.org/0000-0002-4914-1969"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Zhang, Zhixin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100437647","display_name":"Yiyuan Zhang","orcid":"https://orcid.org/0009-0009-8107-4688"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Yiyuan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5011301418","display_name":"Xiaohan Ding","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ding, Xiaohan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5078165161","display_name":"Xiangyu Yue","orcid":"https://orcid.org/0000-0002-6887-2046"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yue, Xiangyu","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5100643814"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T13277","display_name":"Media, Religion, Digital Communication","score":0.8720999956130981,"subfield":{"id":"https://openalex.org/subfields/1211","display_name":"Philosophy"},"field":{"id":"https://openalex.org/fields/12","display_name":"Arts and Humanities"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T13277","display_name":"Media, Religion, Digital Communication","score":0.8720999956130981,"subfield":{"id":"https://openalex.org/subfields/1211","display_name":"Philosophy"},"field":{"id":"https://openalex.org/fields/12","display_name":"Arts and Humanities"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T12798","display_name":"Religious Tourism and Spaces","score":0.8424000144004822,"subfield":{"id":"https://openalex.org/subfields/3305","display_name":"Geography, Planning and Development"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.5492645502090454},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4923664331436157},{"id":"https://openalex.org/keywords/computer-vision","display_name":"Computer vision","score":0.44640469551086426},{"id":"https://openalex.org/keywords/machine-vision","display_name":"Machine vision","score":0.44034257531166077},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.33872708678245544},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.322542667388916}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5492645502090454},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4923664331436157},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.44640469551086426},{"id":"https://openalex.org/C5339829","wikidata":"https://www.wikidata.org/wiki/Q1425977","display_name":"Machine vision","level":2,"score":0.44034257531166077},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.33872708678245544},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.322542667388916}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2410.21220","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2410.21220","pdf_url":"https://arxiv.org/pdf/2410.21220","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},{"id":"doi:10.48550/arxiv.2410.21220","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2410.21220","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2410.21220","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2410.21220","pdf_url":"https://arxiv.org/pdf/2410.21220","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4404318803.pdf","grobid_xml":"https://content.openalex.org/works/W4404318803.grobid-xml"},"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W2772917594","https://openalex.org/W2036807459","https://openalex.org/W2058170566","https://openalex.org/W2755342338","https://openalex.org/W2166024367","https://openalex.org/W3116076068","https://openalex.org/W2229312674","https://openalex.org/W2951359407","https://openalex.org/W2079911747","https://openalex.org/W1969923398"],"abstract_inverted_index":{"Search":[0,97,169],"engines":[1],"enable":[2],"the":[3,29,46,53,66,129,140,148,153,167,173],"retrieval":[4],"of":[5],"unknown":[6],"information":[7,121],"with":[8],"texts.":[9],"However,":[10],"traditional":[11],"methods":[12],"fall":[13],"short":[14],"when":[15,147],"it":[16,59],"comes":[17],"to":[18,52,61,65,86,123,152,181],"understanding":[19,115],"unfamiliar":[20],"visual":[21,114,133],"content,":[22],"such":[23],"as":[24,73],"identifying":[25],"an":[26,57],"object":[27,54],"that":[28,70,102,166],"model":[30,47,141],"has":[31,48],"never":[32],"seen":[33],"before.":[34],"This":[35,110],"challenge":[36],"is":[37,83,150],"particularly":[38],"pronounced":[39],"for":[40],"large":[41],"vision-language":[42],"models":[43,175],"(VLMs):":[44],"if":[45],"not":[49],"been":[50],"exposed":[51],"depicted":[55],"in":[56],"image,":[58],"struggles":[60],"generate":[62],"reliable":[63],"answers":[64],"user's":[67],"question":[68],"regarding":[69],"image.":[71],"Moreover,":[72],"new":[74],"objects":[75],"and":[76,107,117,134,161,176],"events":[77],"continuously":[78],"emerge,":[79],"frequently":[80],"updating":[81],"VLMs":[82,106],"impractical":[84],"due":[85],"heavy":[87],"computational":[88],"burdens.":[89],"To":[90],"address":[91],"this":[92,138],"limitation,":[93],"we":[94],"propose":[95],"Vision":[96,168],"Assistant,":[98],"a":[99],"novel":[100,151],"framework":[101],"facilitates":[103],"collaboration":[104],"between":[105],"web":[108,118],"agents.":[109],"approach":[111],"leverages":[112],"VLMs'":[113],"capabilities":[116],"agents'":[119],"real-time":[120],"access":[122],"perform":[124],"open-world":[125],"Retrieval-Augmented":[126],"Generation":[127],"via":[128],"web.":[130],"By":[131],"integrating":[132],"textual":[135],"representations":[136],"through":[137],"collaboration,":[139],"can":[142,177],"provide":[143],"informed":[144],"responses":[145],"even":[146],"image":[149],"system.":[154],"Extensive":[155],"experiments":[156],"conducted":[157],"on":[158],"both":[159],"open-set":[160],"closed-set":[162],"QA":[163],"benchmarks":[164],"demonstrate":[165],"Assistant":[170],"significantly":[171],"outperforms":[172],"other":[174],"be":[178],"widely":[179],"applied":[180],"existing":[182],"VLMs.":[183]},"counts_by_year":[],"updated_date":"2026-03-10T16:38:18.471706","created_date":"2024-11-14T00:00:00"}
