{"id":"https://openalex.org/W4402982584","doi":"https://doi.org/10.1109/icme57554.2024.10688227","title":"VG-Annotator: Vision-Language Models as Query Annotators for Unsupervised Visual Grounding","display_name":"VG-Annotator: Vision-Language Models as Query Annotators for Unsupervised Visual Grounding","publication_year":2024,"publication_date":"2024-07-15","ids":{"openalex":"https://openalex.org/W4402982584","doi":"https://doi.org/10.1109/icme57554.2024.10688227"},"language":"en","primary_location":{"id":"doi:10.1109/icme57554.2024.10688227","is_oa":false,"landing_page_url":"http://dx.doi.org/10.1109/icme57554.2024.10688227","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2024 IEEE International Conference on Multimedia and Expo (ICME)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5091465907","display_name":"Jiabo Ye","orcid":"https://orcid.org/0009-0009-5451-8984"},"institutions":[{"id":"https://openalex.org/I66867065","display_name":"East China Normal University","ror":"https://ror.org/02n96ep67","country_code":"CN","type":"education","lineage":["https://openalex.org/I66867065"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Jiabo Ye","raw_affiliation_strings":["East China Normal University"],"affiliations":[{"raw_affiliation_string":"East China Normal University","institution_ids":["https://openalex.org/I66867065"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5027249698","display_name":"Junfeng Tian","orcid":"https://orcid.org/0000-0001-6508-6050"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Junfeng Tian","raw_affiliation_strings":["Nyonic.ai"],"affiliations":[{"raw_affiliation_string":"Nyonic.ai","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5083991825","display_name":"Xiaoshan Yang","orcid":"https://orcid.org/0000-0001-5453-9755"},"institutions":[{"id":"https://openalex.org/I4210112150","display_name":"Institute of Automation","ror":"https://ror.org/022c3hy66","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210112150"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xiaoshan Yang","raw_affiliation_strings":["CASIA"],"affiliations":[{"raw_affiliation_string":"CASIA","institution_ids":["https://openalex.org/I4210112150"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5032572801","display_name":"Zhenru Zhang","orcid":null},"institutions":[{"id":"https://openalex.org/I4210095624","display_name":"Alibaba Group (United States)","ror":"https://ror.org/00rn0m335","country_code":"US","type":"company","lineage":["https://openalex.org/I4210095624","https://openalex.org/I45928872"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Zhenru Zhang","raw_affiliation_strings":["Alibaba Group"],"affiliations":[{"raw_affiliation_string":"Alibaba Group","institution_ids":["https://openalex.org/I4210095624"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5045871240","display_name":"Anwen Hu","orcid":null},"institutions":[{"id":"https://openalex.org/I4210095624","display_name":"Alibaba Group (United States)","ror":"https://ror.org/00rn0m335","country_code":"US","type":"company","lineage":["https://openalex.org/I4210095624","https://openalex.org/I45928872"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Anwen Hu","raw_affiliation_strings":["Alibaba Group"],"affiliations":[{"raw_affiliation_string":"Alibaba Group","institution_ids":["https://openalex.org/I4210095624"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5000844861","display_name":"Ming Yan","orcid":"https://orcid.org/0000-0002-4388-6708"},"institutions":[{"id":"https://openalex.org/I4210095624","display_name":"Alibaba Group (United States)","ror":"https://ror.org/00rn0m335","country_code":"US","type":"company","lineage":["https://openalex.org/I4210095624","https://openalex.org/I45928872"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Ming Yan","raw_affiliation_strings":["Alibaba Group"],"affiliations":[{"raw_affiliation_string":"Alibaba Group","institution_ids":["https://openalex.org/I4210095624"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5055621875","display_name":"Ji Zhang","orcid":"https://orcid.org/0000-0003-4799-5220"},"institutions":[{"id":"https://openalex.org/I4210095624","display_name":"Alibaba Group (United States)","ror":"https://ror.org/00rn0m335","country_code":"US","type":"company","lineage":["https://openalex.org/I4210095624","https://openalex.org/I45928872"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Ji Zhang","raw_affiliation_strings":["Alibaba Group"],"affiliations":[{"raw_affiliation_string":"Alibaba Group","institution_ids":["https://openalex.org/I4210095624"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5010540039","display_name":"Liang He","orcid":"https://orcid.org/0000-0002-4723-5486"},"institutions":[{"id":"https://openalex.org/I66867065","display_name":"East China Normal University","ror":"https://ror.org/02n96ep67","country_code":"CN","type":"education","lineage":["https://openalex.org/I66867065"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Liang He","raw_affiliation_strings":["East China Normal University"],"affiliations":[{"raw_affiliation_string":"East China Normal University","institution_ids":["https://openalex.org/I66867065"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5103028717","display_name":"Xin Lin","orcid":"https://orcid.org/0009-0008-4110-8989"},"institutions":[{"id":"https://openalex.org/I66867065","display_name":"East China Normal University","ror":"https://ror.org/02n96ep67","country_code":"CN","type":"education","lineage":["https://openalex.org/I66867065"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xin Lin","raw_affiliation_strings":["East China Normal University"],"affiliations":[{"raw_affiliation_string":"East China Normal University","institution_ids":["https://openalex.org/I66867065"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":9,"corresponding_author_ids":["https://openalex.org/A5091465907"],"corresponding_institution_ids":["https://openalex.org/I66867065"],"apc_list":null,"apc_paid":null,"fwci":0.2624,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.54339623,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":91,"max":95},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"6"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9983999729156494,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9983999729156494,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.9577000141143799,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12031","display_name":"Speech and dialogue systems","score":0.940500020980835,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8319886922836304},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.6522236466407776},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.607714056968689},{"id":"https://openalex.org/keywords/ground","display_name":"Ground","score":0.48887866735458374},{"id":"https://openalex.org/keywords/engineering","display_name":"Engineering","score":0.04291480779647827}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8319886922836304},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6522236466407776},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.607714056968689},{"id":"https://openalex.org/C168993435","wikidata":"https://www.wikidata.org/wiki/Q6501125","display_name":"Ground","level":2,"score":0.48887866735458374},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.04291480779647827},{"id":"https://openalex.org/C119599485","wikidata":"https://www.wikidata.org/wiki/Q43035","display_name":"Electrical engineering","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icme57554.2024.10688227","is_oa":false,"landing_page_url":"http://dx.doi.org/10.1109/icme57554.2024.10688227","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2024 IEEE International Conference on Multimedia and Expo (ICME)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":32,"referenced_works":["https://openalex.org/W1536680647","https://openalex.org/W1773149199","https://openalex.org/W2251512949","https://openalex.org/W2302086703","https://openalex.org/W2489434015","https://openalex.org/W2745461083","https://openalex.org/W2962749469","https://openalex.org/W2962764817","https://openalex.org/W2963109634","https://openalex.org/W2963614783","https://openalex.org/W2963800628","https://openalex.org/W2981663434","https://openalex.org/W2984194315","https://openalex.org/W2987401211","https://openalex.org/W2989176720","https://openalex.org/W3035644209","https://openalex.org/W3094502228","https://openalex.org/W3107094551","https://openalex.org/W3126391825","https://openalex.org/W3169483174","https://openalex.org/W3174966920","https://openalex.org/W3178418424","https://openalex.org/W3179041377","https://openalex.org/W3207127495","https://openalex.org/W4214490042","https://openalex.org/W4225517085","https://openalex.org/W4312351586","https://openalex.org/W4312501707","https://openalex.org/W4313145013","https://openalex.org/W4385574358","https://openalex.org/W4391451889","https://openalex.org/W4395091069"],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2748952813","https://openalex.org/W2021787609","https://openalex.org/W2390279801","https://openalex.org/W2097328689","https://openalex.org/W2358668433","https://openalex.org/W4234899305","https://openalex.org/W4396701345","https://openalex.org/W1537063595","https://openalex.org/W3204019825"],"abstract_inverted_index":{"Visual":[0],"grounding":[1,53,166],"focuses":[2],"on":[3,19,44,65,177],"localizing":[4],"objects":[5,35,71,116],"referred":[6],"to":[7,33,69,94,109,130,144,149],"by":[8,36],"natural":[9,29,118],"language":[10,23,30,73,90,162],"queries.":[11,74,163],"Existing":[12],"fully":[13],"and":[14,72],"weakly":[15],"supervised":[16,141],"methods":[17,62,176],"rely":[18,64],"a":[20,49,125,140],"mass":[21],"of":[22,78,127],"queries":[24,31,172],"for":[25,114],"training.":[26],"However,":[27],"collecting":[28],"corresponding":[32],"specific":[34,115],"annotators":[37],"is":[38],"expensive.":[39],"To":[40,120],"reduce":[41],"the":[42,59,95,146,151,157,170],"reliance":[43],"human-written":[45],"queries,":[46],"we":[47,105,123],"propose":[48],"novel":[50],"unsupervised":[51,61,175],"visual":[52,165],"framework":[54],"named":[55],"VG-Annotator.":[56],"Different":[57],"from":[58,102],"existing":[60],"that":[63,82,156],"manually":[66],"designed":[67],"rules":[68],"link":[70],"The":[75,164],"key":[76],"idea":[77],"VG-Annotator":[79],"lies":[80],"in":[81,117],"vision-language":[83,147],"pre-trained":[84],"(VLP)":[85],"generation":[86],"models":[87,108,148],"can":[88],"be":[89,135],"query":[91],"annotators.":[92],"Thanks":[93],"powerful":[96],"multi-modal":[97,128],"understanding":[98],"ability":[99],"implicitly":[100],"learned":[101],"large-scale":[103],"pre-training,":[104],"consider":[106],"stimulating":[107],"explicitly":[110],"generate":[111],"appropriate":[112],"descriptions":[113],"language.":[119],"this":[121],"end,":[122],"explore":[124],"series":[126],"instructions":[129],"indicate":[131],"which":[132],"object":[133],"should":[134],"described.":[136],"We":[137],"also":[138],"introduce":[139],"fine-tuning":[142],"process":[143],"teach":[145],"follow":[150],"instructions.":[152],"Extensive":[153],"experiments":[154],"show":[155],"proposed":[158],"method":[159],"obtains":[160],"high-quality":[161],"model":[167],"trained":[168],"with":[169],"generated":[171],"outperforms":[173],"state-of-the-art":[174],"five":[178],"widely":[179],"used":[180],"datasets.":[181]},"counts_by_year":[{"year":2025,"cited_by_count":1}],"updated_date":"2025-12-27T23:08:20.325037","created_date":"2025-10-10T00:00:00"}
