{"id":"https://openalex.org/W4415540081","doi":"https://doi.org/10.1145/3746027.3755235","title":"From Semantics, Scene to Instance-awareness: Distilling Foundation Model for Open-vocabulary Grounded Situation Recognition","display_name":"From Semantics, Scene to Instance-awareness: Distilling Foundation Model for Open-vocabulary Grounded Situation Recognition","publication_year":2025,"publication_date":"2025-10-25","ids":{"openalex":"https://openalex.org/W4415540081","doi":"https://doi.org/10.1145/3746027.3755235"},"language":"en","primary_location":{"id":"doi:10.1145/3746027.3755235","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3746027.3755235","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 33rd ACM International Conference on Multimedia","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://doi.org/10.1145/3746027.3755235","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5037994542","display_name":"Chen Cai","orcid":"https://orcid.org/0009-0002-7793-5261"},"institutions":[{"id":"https://openalex.org/I165932596","display_name":"National University of Singapore","ror":"https://ror.org/01tgyzw49","country_code":"SG","type":"education","lineage":["https://openalex.org/I165932596"]}],"countries":["SG"],"is_corresponding":true,"raw_author_name":"Chen Cai","raw_affiliation_strings":["National University of Singapore, Singapore, Singapore"],"raw_orcid":"https://orcid.org/0009-0002-7793-5261","affiliations":[{"raw_affiliation_string":"National University of Singapore, Singapore, Singapore","institution_ids":["https://openalex.org/I165932596"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5062422211","display_name":"Tianyi Liu","orcid":"https://orcid.org/0000-0002-6705-7808"},"institutions":[{"id":"https://openalex.org/I172675005","display_name":"Nanyang Technological University","ror":"https://ror.org/02e7b5302","country_code":"SG","type":"education","lineage":["https://openalex.org/I172675005"]}],"countries":["SG"],"is_corresponding":false,"raw_author_name":"Tianyi Liu","raw_affiliation_strings":["Nanyang Technological University, Singapore, Singapore"],"raw_orcid":"https://orcid.org/0000-0002-6705-7808","affiliations":[{"raw_affiliation_string":"Nanyang Technological University, Singapore, Singapore","institution_ids":["https://openalex.org/I172675005"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5034411608","display_name":"Jianjun Gao","orcid":"https://orcid.org/0009-0004-9137-2869"},"institutions":[{"id":"https://openalex.org/I172675005","display_name":"Nanyang Technological University","ror":"https://ror.org/02e7b5302","country_code":"SG","type":"education","lineage":["https://openalex.org/I172675005"]}],"countries":["SG"],"is_corresponding":false,"raw_author_name":"Jianjun Gao","raw_affiliation_strings":["Nanyang Technological University, Singapore, Singapore"],"raw_orcid":"https://orcid.org/0009-0004-9137-2869","affiliations":[{"raw_affiliation_string":"Nanyang Technological University, Singapore, Singapore","institution_ids":["https://openalex.org/I172675005"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101735168","display_name":"Wenyang Liu","orcid":"https://orcid.org/0009-0004-3226-0920"},"institutions":[{"id":"https://openalex.org/I172675005","display_name":"Nanyang Technological University","ror":"https://ror.org/02e7b5302","country_code":"SG","type":"education","lineage":["https://openalex.org/I172675005"]}],"countries":["SG"],"is_corresponding":false,"raw_author_name":"Wenyang Liu","raw_affiliation_strings":["Nanyang Technological University, Singapore, Singapore"],"raw_orcid":"https://orcid.org/0009-0004-3226-0920","affiliations":[{"raw_affiliation_string":"Nanyang Technological University, Singapore, Singapore","institution_ids":["https://openalex.org/I172675005"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102012979","display_name":"Kejun Wu","orcid":"https://orcid.org/0000-0001-9859-9573"},"institutions":[{"id":"https://openalex.org/I47720641","display_name":"Huazhong University of Science and Technology","ror":"https://ror.org/00p991c53","country_code":"CN","type":"education","lineage":["https://openalex.org/I47720641"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Kejun Wu","raw_affiliation_strings":["EIC, Huazhong University of Science and Technology, Wuhan, China"],"raw_orcid":"https://orcid.org/0000-0001-9859-9573","affiliations":[{"raw_affiliation_string":"EIC, Huazhong University of Science and Technology, Wuhan, China","institution_ids":["https://openalex.org/I47720641"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Ruoyu Wang","orcid":"https://orcid.org/0009-0000-6749-0305"},"institutions":[{"id":"https://openalex.org/I172675005","display_name":"Nanyang Technological University","ror":"https://ror.org/02e7b5302","country_code":"SG","type":"education","lineage":["https://openalex.org/I172675005"]}],"countries":["SG"],"is_corresponding":false,"raw_author_name":"Ruoyu Wang","raw_affiliation_strings":["Nanyang Technological University, Singapore, Singapore"],"raw_orcid":"https://orcid.org/0009-0000-6749-0305","affiliations":[{"raw_affiliation_string":"Nanyang Technological University, Singapore, Singapore","institution_ids":["https://openalex.org/I172675005"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100784213","display_name":"Yi Wang","orcid":"https://orcid.org/0000-0001-8659-4724"},"institutions":[{"id":"https://openalex.org/I14243506","display_name":"Hong Kong Polytechnic University","ror":"https://ror.org/0030zas98","country_code":"HK","type":"education","lineage":["https://openalex.org/I14243506"]}],"countries":["HK"],"is_corresponding":false,"raw_author_name":"Yi Wang","raw_affiliation_strings":["EEE, The Hong Kong Polytechnic University, Hong Kong SAR, China"],"raw_orcid":"https://orcid.org/0000-0001-8659-4724","affiliations":[{"raw_affiliation_string":"EEE, The Hong Kong Polytechnic University, Hong Kong SAR, China","institution_ids":["https://openalex.org/I14243506"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5040257549","display_name":"Soo Chin Liew","orcid":"https://orcid.org/0000-0001-8342-4682"},"institutions":[{"id":"https://openalex.org/I165932596","display_name":"National University of Singapore","ror":"https://ror.org/01tgyzw49","country_code":"SG","type":"education","lineage":["https://openalex.org/I165932596"]}],"countries":["SG"],"is_corresponding":false,"raw_author_name":"Soo Chin Liew","raw_affiliation_strings":["National University of Singapore, Singapore, Singapore"],"raw_orcid":"https://orcid.org/0000-0001-8342-4682","affiliations":[{"raw_affiliation_string":"National University of Singapore, Singapore, Singapore","institution_ids":["https://openalex.org/I165932596"]}]}],"institutions":[],"countries_distinct_count":3,"institutions_distinct_count":8,"corresponding_author_ids":["https://openalex.org/A5037994542"],"corresponding_institution_ids":["https://openalex.org/I165932596"],"apc_list":null,"apc_paid":null,"fwci":0.971,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.81045273,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":95,"max":98},"biblio":{"volume":null,"issue":null,"first_page":"392","last_page":"401"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9987999796867371,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9987999796867371,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12031","display_name":"Speech and dialogue systems","score":0.987500011920929,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9853000044822693,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/generalization","display_name":"Generalization","score":0.7802000045776367},{"id":"https://openalex.org/keywords/exploit","display_name":"Exploit","score":0.6717000007629395},{"id":"https://openalex.org/keywords/construct","display_name":"Construct (python library)","score":0.6547999978065491},{"id":"https://openalex.org/keywords/foundation","display_name":"Foundation (evidence)","score":0.5928000211715698},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.5771999955177307},{"id":"https://openalex.org/keywords/perception","display_name":"Perception","score":0.5386000275611877},{"id":"https://openalex.org/keywords/generator","display_name":"Generator (circuit theory)","score":0.5073000192642212},{"id":"https://openalex.org/keywords/grounded-theory","display_name":"Grounded theory","score":0.4871000051498413}],"concepts":[{"id":"https://openalex.org/C177148314","wikidata":"https://www.wikidata.org/wiki/Q170084","display_name":"Generalization","level":2,"score":0.7802000045776367},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6847000122070312},{"id":"https://openalex.org/C165696696","wikidata":"https://www.wikidata.org/wiki/Q11287","display_name":"Exploit","level":2,"score":0.6717000007629395},{"id":"https://openalex.org/C2780801425","wikidata":"https://www.wikidata.org/wiki/Q5164392","display_name":"Construct (python library)","level":2,"score":0.6547999978065491},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5945000052452087},{"id":"https://openalex.org/C2780966255","wikidata":"https://www.wikidata.org/wiki/Q5474306","display_name":"Foundation (evidence)","level":2,"score":0.5928000211715698},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.5885000228881836},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.5771999955177307},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.5386000275611877},{"id":"https://openalex.org/C2780992000","wikidata":"https://www.wikidata.org/wiki/Q17016113","display_name":"Generator (circuit theory)","level":3,"score":0.5073000192642212},{"id":"https://openalex.org/C156325361","wikidata":"https://www.wikidata.org/wiki/Q1152864","display_name":"Grounded theory","level":3,"score":0.4871000051498413},{"id":"https://openalex.org/C2779916870","wikidata":"https://www.wikidata.org/wiki/Q14467155","display_name":"Gaze","level":2,"score":0.48030000925064087},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.44190001487731934},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3849000036716461},{"id":"https://openalex.org/C158154518","wikidata":"https://www.wikidata.org/wiki/Q7310970","display_name":"Relevance (law)","level":2,"score":0.36149999499320984},{"id":"https://openalex.org/C162307627","wikidata":"https://www.wikidata.org/wiki/Q204833","display_name":"Enhanced Data Rates for GSM Evolution","level":2,"score":0.3411000072956085},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.3366999924182892},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.32659998536109924},{"id":"https://openalex.org/C113843644","wikidata":"https://www.wikidata.org/wiki/Q901882","display_name":"Interface (matter)","level":4,"score":0.29899999499320984},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.2906000018119812},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.27880001068115234},{"id":"https://openalex.org/C2778137410","wikidata":"https://www.wikidata.org/wiki/Q2732820","display_name":"Government (linguistics)","level":2,"score":0.27790001034736633},{"id":"https://openalex.org/C86034646","wikidata":"https://www.wikidata.org/wiki/Q474311","display_name":"Semantic gap","level":4,"score":0.2635999917984009},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.2533000111579895},{"id":"https://openalex.org/C22367795","wikidata":"https://www.wikidata.org/wiki/Q7625208","display_name":"Structured prediction","level":2,"score":0.25099998712539673}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1145/3746027.3755235","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3746027.3755235","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 33rd ACM International Conference on Multimedia","raw_type":"proceedings-article"},{"id":"pmh:oai:dr.ntu.edu.sg:10356/202561","is_oa":false,"landing_page_url":"https://hdl.handle.net/10356/202561","pdf_url":null,"source":{"id":"https://openalex.org/S4306402609","display_name":"DR-NTU (Nanyang Technological University)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I172675005","host_organization_name":"Nanyang Technological University","host_organization_lineage":["https://openalex.org/I172675005"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":null,"raw_type":"Conference Paper"}],"best_oa_location":{"id":"doi:10.1145/3746027.3755235","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3746027.3755235","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 33rd ACM International Conference on Multimedia","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":13,"referenced_works":["https://openalex.org/W3096609285","https://openalex.org/W3096682293","https://openalex.org/W3109754877","https://openalex.org/W4312651322","https://openalex.org/W4385571870","https://openalex.org/W4390873744","https://openalex.org/W4390874575","https://openalex.org/W4393148771","https://openalex.org/W4393159464","https://openalex.org/W4396606321","https://openalex.org/W4396736086","https://openalex.org/W4401485877","https://openalex.org/W4402783842"],"related_works":[],"abstract_inverted_index":{"Recent":[0],"Multimodal":[1,79,160],"Large":[2],"Language":[3],"Models":[4],"(MLLMs)":[5],"exhibit":[6],"strong":[7],"zero-shot":[8,62],"abilities":[9],"but":[10],"struggle":[11],"with":[12,134,150],"complex":[13],"Grounded":[14,70],"Situation":[15,71],"Recognition":[16,72],"(GSR)":[17],"and":[18,38,61,105,127,130,141,168,199,202,223,226],"are":[19,144],"resource-intensive":[20],"for":[21,188],"edge":[22],"device":[23],"deployment.":[24],"Meanwhile,":[25],"conventional":[26],"GSR":[27,55],"models":[28],"often":[29],"lack":[30],"generalization":[31,60,189],"ability,":[32],"falling":[33],"short":[34],"in":[35,206],"recognizing":[36],"unseen":[37,103,200,224,230],"rare":[39,110,207],"situations.":[40,111],"In":[41],"this":[42],"paper,":[43],"we":[44,77],"exploit":[45],"transferring":[46],"knowledge":[47,91,176],"from":[48,92,153],"a":[49,53,84,185],"teacher":[50,156],"MLLM":[51,155],"to":[52,57,101,124,147],"small":[54],"model":[56,100],"enhance":[58],"its":[59],"abilities,":[63],"thereby":[64],"introducing":[65],"the":[66,93,97,113,118,154,158,173,180,195,213,233],"task":[67],"of":[68,109],"Open-vocabulary":[69],"(Ov-GSR).":[73],"To":[74],"achieve":[75],"this,":[76],"propose":[78],"Interactive":[80],"Prompt":[81],"Distillation":[82],"(MIPD),":[83],"novel":[85],"framework":[86,115],"that":[87,190],"distills":[88],"enriched":[89,133],"multimodal":[90,170,175],"foundation":[94,187],"model,":[95,183],"enabling":[96],"student":[98,181],"Ov-GSR":[99,182],"recognize":[102],"situations":[104],"be":[106],"better":[107],"aware":[108],"Specifically,":[112],"MIPD":[114,211],"first":[116],"leverages":[117],"LLM-based":[119],"Judgmental":[120],"Rationales":[121],"Generator":[122],"(JRG)":[123],"construct":[125],"positive":[126],"negative":[128],"glimpse":[129],"gaze":[131],"rationales":[132,149],"contextual":[135],"semantic":[136],"information.":[137],"The":[138],"proposed":[139],"scene-aware":[140],"instance-perception":[142],"prompts":[143],"then":[145],"introduced":[146],"align":[148],"visual":[151],"information":[152],"via":[157],"Negative-Guided":[159],"Prompting":[161],"Alignment":[162],"(NMPA)":[163],"module,":[164],"effectively":[165],"capturing":[166],"holistic":[167],"perceptual":[169],"knowledge.":[171],"Finally,":[172],"aligned":[174],"is":[177],"distilled":[178],"into":[179],"providing":[184],"stronger":[186],"enhances":[191],"situation":[192],"understanding,":[193],"bridges":[194],"gap":[196],"between":[197],"seen":[198],"scenarios,":[201],"mitigates":[203],"prediction":[204],"bias":[205],"cases.":[208],"We":[209],"evaluate":[210],"on":[212,220,232],"refined":[214],"Ov-SWiG":[215],"dataset,":[216],"achieving":[217],"superior":[218],"performance":[219],"seen,":[221],"rare,":[222],"situations,":[225],"further":[227],"demonstrate":[228],"improved":[229],"detection":[231],"HICO-DET":[234],"dataset.":[235]},"counts_by_year":[{"year":2026,"cited_by_count":1}],"updated_date":"2026-06-02T09:04:35.204637","created_date":"2025-10-25T00:00:00"}
