{"id":"https://openalex.org/W4403780516","doi":"https://doi.org/10.1145/3664647.3680680","title":"Towards Open-vocabulary HOI Detection with Calibrated Vision-language Models and Locality-aware Queries","display_name":"Towards Open-vocabulary HOI Detection with Calibrated Vision-language Models and Locality-aware Queries","publication_year":2024,"publication_date":"2024-10-26","ids":{"openalex":"https://openalex.org/W4403780516","doi":"https://doi.org/10.1145/3664647.3680680"},"language":"en","primary_location":{"id":"doi:10.1145/3664647.3680680","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3664647.3680680","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 32nd ACM International Conference on Multimedia","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5102522900","display_name":"Zhenhao Yang","orcid":null},"institutions":[{"id":"https://openalex.org/I150229711","display_name":"University of Electronic Science and Technology of China","ror":"https://ror.org/04qr3zq92","country_code":"CN","type":"education","lineage":["https://openalex.org/I150229711"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Zhenhao Yang","raw_affiliation_strings":["University of Electronic Science and Technology of China, Chengdu, China"],"affiliations":[{"raw_affiliation_string":"University of Electronic Science and Technology of China, Chengdu, China","institution_ids":["https://openalex.org/I150229711"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5109753028","display_name":"Xin Liu","orcid":"https://orcid.org/0009-0003-2802-0556"},"institutions":[{"id":"https://openalex.org/I150229711","display_name":"University of Electronic Science and Technology of China","ror":"https://ror.org/04qr3zq92","country_code":"CN","type":"education","lineage":["https://openalex.org/I150229711"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xin Liu","raw_affiliation_strings":["University of Electronic Science and Technology of China, Chengdu, China"],"affiliations":[{"raw_affiliation_string":"University of Electronic Science and Technology of China, Chengdu, China","institution_ids":["https://openalex.org/I150229711"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5065413088","display_name":"Deqiang Ouyang","orcid":"https://orcid.org/0000-0003-2259-886X"},"institutions":[{"id":"https://openalex.org/I158842170","display_name":"Chongqing University","ror":"https://ror.org/023rhb549","country_code":"CN","type":"education","lineage":["https://openalex.org/I158842170"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Deqiang Ouyang","raw_affiliation_strings":["Chongqing University, Chongqing, China"],"affiliations":[{"raw_affiliation_string":"Chongqing University, Chongqing, China","institution_ids":["https://openalex.org/I158842170"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5081385896","display_name":"Guiduo Duan","orcid":"https://orcid.org/0000-0003-4448-6906"},"institutions":[{"id":"https://openalex.org/I150229711","display_name":"University of Electronic Science and Technology of China","ror":"https://ror.org/04qr3zq92","country_code":"CN","type":"education","lineage":["https://openalex.org/I150229711"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Guiduo Duan","raw_affiliation_strings":["University of Electronic Science and Technology of China, Chengdu, China"],"affiliations":[{"raw_affiliation_string":"University of Electronic Science and Technology of China, Chengdu, China","institution_ids":["https://openalex.org/I150229711"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5020073761","display_name":"Dongyang Zhang","orcid":"https://orcid.org/0000-0002-4839-0234"},"institutions":[{"id":"https://openalex.org/I150229711","display_name":"University of Electronic Science and Technology of China","ror":"https://ror.org/04qr3zq92","country_code":"CN","type":"education","lineage":["https://openalex.org/I150229711"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Dongyang Zhang","raw_affiliation_strings":["University of Electronic Science and Technology of China, Chengdu, China"],"affiliations":[{"raw_affiliation_string":"University of Electronic Science and Technology of China, Chengdu, China","institution_ids":["https://openalex.org/I150229711"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100761497","display_name":"Tao He","orcid":"https://orcid.org/0000-0001-8676-7429"},"institutions":[{"id":"https://openalex.org/I150229711","display_name":"University of Electronic Science and Technology of China","ror":"https://ror.org/04qr3zq92","country_code":"CN","type":"education","lineage":["https://openalex.org/I150229711"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Tao He","raw_affiliation_strings":["University of Electronic Science and Technology of China, Chengdu, China"],"affiliations":[{"raw_affiliation_string":"University of Electronic Science and Technology of China, Chengdu, China","institution_ids":["https://openalex.org/I150229711"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5017943466","display_name":"Yuan-Fang Li","orcid":"https://orcid.org/0000-0003-4651-2821"},"institutions":[{"id":"https://openalex.org/I56590836","display_name":"Monash University","ror":"https://ror.org/02bfwt286","country_code":"AU","type":"education","lineage":["https://openalex.org/I56590836"]}],"countries":["AU"],"is_corresponding":false,"raw_author_name":"Yuan-Fang Li","raw_affiliation_strings":["Monash University, Melbourne, Australia"],"affiliations":[{"raw_affiliation_string":"Monash University, Melbourne, Australia","institution_ids":["https://openalex.org/I56590836"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5102522900"],"corresponding_institution_ids":["https://openalex.org/I150229711"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.18445129,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1495","last_page":"1504"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9983000159263611,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9983000159263611,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.9916999936103821,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9847999811172485,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/locality","display_name":"Locality","score":0.8654082417488098},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7921392917633057},{"id":"https://openalex.org/keywords/vocabulary","display_name":"Vocabulary","score":0.7262237071990967},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4375922977924347},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.43294844031333923},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.42670977115631104},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.3449675440788269},{"id":"https://openalex.org/keywords/linguistics","display_name":"Linguistics","score":0.14397719502449036}],"concepts":[{"id":"https://openalex.org/C2779808786","wikidata":"https://www.wikidata.org/wiki/Q6664603","display_name":"Locality","level":2,"score":0.8654082417488098},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7921392917633057},{"id":"https://openalex.org/C2777601683","wikidata":"https://www.wikidata.org/wiki/Q6499736","display_name":"Vocabulary","level":2,"score":0.7262237071990967},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4375922977924347},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.43294844031333923},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.42670977115631104},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.3449675440788269},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.14397719502449036},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3664647.3680680","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3664647.3680680","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 32nd ACM International Conference on Multimedia","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","score":0.7900000214576721,"display_name":"Quality Education"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":35,"referenced_works":["https://openalex.org/W2194775991","https://openalex.org/W2728700534","https://openalex.org/W2746726611","https://openalex.org/W2963351448","https://openalex.org/W2997572149","https://openalex.org/W3049440217","https://openalex.org/W3096609285","https://openalex.org/W3101631402","https://openalex.org/W3109754877","https://openalex.org/W3138516171","https://openalex.org/W3168279596","https://openalex.org/W3173178959","https://openalex.org/W3173859428","https://openalex.org/W3174164794","https://openalex.org/W3195860412","https://openalex.org/W3197395033","https://openalex.org/W4285133744","https://openalex.org/W4312343844","https://openalex.org/W4312446811","https://openalex.org/W4312563428","https://openalex.org/W4312574495","https://openalex.org/W4312848112","https://openalex.org/W4312873085","https://openalex.org/W4312960102","https://openalex.org/W4382240040","https://openalex.org/W4386065740","https://openalex.org/W4386071869","https://openalex.org/W4386072519","https://openalex.org/W4386076271","https://openalex.org/W4386076501","https://openalex.org/W4386472893","https://openalex.org/W4388579765","https://openalex.org/W4390873645","https://openalex.org/W4390874582","https://openalex.org/W4393159464"],"related_works":["https://openalex.org/W1556451512","https://openalex.org/W1555349535","https://openalex.org/W4234091740","https://openalex.org/W4213350282","https://openalex.org/W2230171082","https://openalex.org/W2583128298","https://openalex.org/W2022275305","https://openalex.org/W1604115909","https://openalex.org/W2369125128","https://openalex.org/W2134423494"],"abstract_inverted_index":{"The":[0],"open-vocabulary":[1,100,140,156],"human-object":[2,15,68,141],"interaction":[3,42,142],"(Ov-HOI)":[4],"detection":[5,102,158],"aims":[6],"to":[7,34,38,73],"identify":[8],"both":[9],"base":[10,19,77],"and":[11,153],"novel":[12,67,128],"categories":[13,20,78],"of":[14,66,126],"interactions":[16],"while":[17],"only":[18],"are":[21],"available":[22],"during":[23],"training.":[24],"Existing":[25],"Ov-HOI":[26],"methods":[27,146],"commonly":[28],"leverage":[29],"knowledge":[30],"distilled":[31],"from":[32,112],"CLIP":[33,55,115],"extend":[35],"their":[36,80],"ability":[37],"detect":[39],"previously":[40],"unseen":[41],"categories.":[43],"However,":[44],"our":[45,120,135],"empirical":[46],"observations":[47],"indicate":[48],"that":[49,134],"the":[50,64,76,108,113,117,124,166,170,174,178],"inherent":[51],"noise":[52],"present":[53],"in":[54,139],"has":[56],"a":[57,89],"detrimental":[58],"effect":[59],"on":[60,75,150,165,173],"HOI":[61,101,110,118,157],"prediction.":[62],"Moreover,":[63],"absence":[65],"position":[69],"distributions":[70],"often":[71],"leads":[72],"overfitting":[74],"within":[79],"learned":[81],"queries.":[82],"To":[83],"address":[84],"these":[85],"issues,":[86],"we":[87],"propose":[88],"two-step":[90],"framework":[91],"named,":[92],"CaM-LQ,":[93],"Calibrating":[94],"visual-language":[95],"Models,":[96],"(e.g.,":[97],"CLIP)":[98],"for":[99],"with":[103,161,177],"Locality-aware":[104],"Queries.":[105],"By":[106],"injecting":[107],"fine-grained":[109],"supervision":[111],"calibrated":[114],"into":[116],"decoder,":[119],"model":[121],"can":[122],"achieve":[123],"goal":[125],"predicting":[127],"interactions.":[129],"Extensive":[130],"experimental":[131],"results":[132],"demonstrate":[133],"approach":[136],"performs":[137],"well":[138],"detection,":[143],"surpassing":[144],"state-of-the-art":[145],"across":[147],"multiple":[148],"metrics":[149],"mainstream":[151],"datasets":[152],"showing":[154],"superior":[155],"performance,":[159],"e.g.,":[160],"4.54":[162],"points":[163],"improvement":[164],"HICO-DET":[167],"dataset":[168],"over":[169],"SoTA":[171],"CLIP4HOI":[172],"UV":[175],"task":[176],"same":[179],"backbone":[180],"ResNet-50.":[181]},"counts_by_year":[],"updated_date":"2026-03-27T05:58:40.876381","created_date":"2025-10-10T00:00:00"}
