{"id":"https://openalex.org/W4410086715","doi":"https://doi.org/10.1109/tcds.2025.3566649","title":"Exploring Grounding Abilities in Vision-Language Models Through Contextual Perception","display_name":"Exploring Grounding Abilities in Vision-Language Models Through Contextual Perception","publication_year":2025,"publication_date":"2025-05-05","ids":{"openalex":"https://openalex.org/W4410086715","doi":"https://doi.org/10.1109/tcds.2025.3566649"},"language":"en","primary_location":{"id":"doi:10.1109/tcds.2025.3566649","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tcds.2025.3566649","pdf_url":null,"source":{"id":"https://openalex.org/S2488537894","display_name":"IEEE Transactions on Cognitive and Developmental Systems","issn_l":"2379-8920","issn":["2379-8920","2379-8939"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Cognitive and Developmental Systems","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5110543464","display_name":"Wei Xu","orcid":"https://orcid.org/0009-0001-8568-5680"},"institutions":[{"id":"https://openalex.org/I163340411","display_name":"Hohai University","ror":"https://ror.org/01wd4xt90","country_code":"CN","type":"education","lineage":["https://openalex.org/I163340411"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Wei Xu","raw_affiliation_strings":["College of Artificial Intelligence and Automation, Hohai University, Changzhou, China"],"raw_orcid":"https://orcid.org/0009-0001-8568-5680","affiliations":[{"raw_affiliation_string":"College of Artificial Intelligence and Automation, Hohai University, Changzhou, China","institution_ids":["https://openalex.org/I163340411"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5091518967","display_name":"Tianfei Zhou","orcid":"https://orcid.org/0000-0001-5475-1473"},"institutions":[{"id":"https://openalex.org/I125839683","display_name":"Beijing Institute of Technology","ror":"https://ror.org/01skt4w74","country_code":"CN","type":"education","lineage":["https://openalex.org/I125839683","https://openalex.org/I890469752"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Tianfei Zhou","raw_affiliation_strings":["Department of Computer Science, Beijing Institute of Technology, Beijing, China"],"raw_orcid":"https://orcid.org/0000-0001-5475-1473","affiliations":[{"raw_affiliation_string":"Department of Computer Science, Beijing Institute of Technology, Beijing, China","institution_ids":["https://openalex.org/I125839683"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Taoyuan Zhang","orcid":"https://orcid.org/0009-0006-3054-3905"},"institutions":[{"id":"https://openalex.org/I200753234","display_name":"De Anza College","ror":"https://ror.org/01dfsmx55","country_code":"US","type":"education","lineage":["https://openalex.org/I200753234"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Taoyuan Zhang","raw_affiliation_strings":["De Anza College, Cupertino, CA, USA"],"raw_orcid":"https://orcid.org/0009-0006-3054-3905","affiliations":[{"raw_affiliation_string":"De Anza College, Cupertino, CA, USA","institution_ids":["https://openalex.org/I200753234"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5080670840","display_name":"Jie Li","orcid":"https://orcid.org/0000-0002-2977-8559"},"institutions":[{"id":"https://openalex.org/I4210112069","display_name":"Theranostics (New Zealand)","ror":"https://ror.org/02wykja76","country_code":"NZ","type":"company","lineage":["https://openalex.org/I4210112069"]},{"id":"https://openalex.org/I83519826","display_name":"Nanjing Medical University","ror":"https://ror.org/059gcgy73","country_code":"CN","type":"education","lineage":["https://openalex.org/I83519826"]}],"countries":["CN","NZ"],"is_corresponding":false,"raw_author_name":"Jie Li","raw_affiliation_strings":["Engineering Research Center of Intelligent Theranostics Technology and Instruments, Ministry of Education, Nanjing, China","Engineering Research Center of Intelligent Theranostics Technology and Instruments, Ministry of Education, Nanjing Medical University, Nanjing, China"],"raw_orcid":"https://orcid.org/0000-0002-2977-8559","affiliations":[{"raw_affiliation_string":"Engineering Research Center of Intelligent Theranostics Technology and Instruments, Ministry of Education, Nanjing, China","institution_ids":["https://openalex.org/I4210112069"]},{"raw_affiliation_string":"Engineering Research Center of Intelligent Theranostics Technology and Instruments, Ministry of Education, Nanjing Medical University, Nanjing, China","institution_ids":["https://openalex.org/I83519826"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5048975243","display_name":"Peiyin Chen","orcid":"https://orcid.org/0000-0001-8636-526X"},"institutions":[{"id":"https://openalex.org/I163340411","display_name":"Hohai University","ror":"https://ror.org/01wd4xt90","country_code":"CN","type":"education","lineage":["https://openalex.org/I163340411"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Peiyin Chen","raw_affiliation_strings":["College of Artificial Intelligence and Automation, Hohai University, Changzhou, China"],"raw_orcid":"https://orcid.org/0000-0001-8636-526X","affiliations":[{"raw_affiliation_string":"College of Artificial Intelligence and Automation, Hohai University, Changzhou, China","institution_ids":["https://openalex.org/I163340411"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5076812698","display_name":"Jia Pan","orcid":"https://orcid.org/0000-0001-9003-2054"},"institutions":[{"id":"https://openalex.org/I889458895","display_name":"University of Hong Kong","ror":"https://ror.org/02zhqgq86","country_code":"HK","type":"education","lineage":["https://openalex.org/I889458895"]}],"countries":["HK"],"is_corresponding":false,"raw_author_name":"Jia Pan","raw_affiliation_strings":["Department of Computer Science, University of Hong Kong, Hong Kong, China"],"raw_orcid":"https://orcid.org/0000-0001-9003-2054","affiliations":[{"raw_affiliation_string":"Department of Computer Science, University of Hong Kong, Hong Kong, China","institution_ids":["https://openalex.org/I889458895"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100359300","display_name":"Xiaofeng Liu","orcid":"https://orcid.org/0000-0003-1310-6739"},"institutions":[{"id":"https://openalex.org/I163340411","display_name":"Hohai University","ror":"https://ror.org/01wd4xt90","country_code":"CN","type":"education","lineage":["https://openalex.org/I163340411"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xiaofeng Liu","raw_affiliation_strings":["College of Artificial Intelligence and Automation, Hohai University, Changzhou, China"],"raw_orcid":"https://orcid.org/0000-0003-1310-6739","affiliations":[{"raw_affiliation_string":"College of Artificial Intelligence and Automation, Hohai University, Changzhou, China","institution_ids":["https://openalex.org/I163340411"]}]}],"institutions":[],"countries_distinct_count":4,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5110543464"],"corresponding_institution_ids":["https://openalex.org/I163340411"],"apc_list":null,"apc_paid":null,"fwci":1.1332,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.77387611,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":96,"max":98},"biblio":{"volume":"17","issue":"6","first_page":"1461","last_page":"1473"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9059000015258789,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9059000015258789,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8068004250526428},{"id":"https://openalex.org/keywords/perception","display_name":"Perception","score":0.680799663066864},{"id":"https://openalex.org/keywords/ground","display_name":"Ground","score":0.5756851434707642},{"id":"https://openalex.org/keywords/human\u2013computer-interaction","display_name":"Human\u2013computer interaction","score":0.4471582770347595},{"id":"https://openalex.org/keywords/cognitive-science","display_name":"Cognitive science","score":0.4124971032142639},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.3883543312549591},{"id":"https://openalex.org/keywords/cognitive-psychology","display_name":"Cognitive psychology","score":0.36336493492126465},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.34460949897766113},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.33971208333969116},{"id":"https://openalex.org/keywords/psychology","display_name":"Psychology","score":0.13666999340057373}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8068004250526428},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.680799663066864},{"id":"https://openalex.org/C168993435","wikidata":"https://www.wikidata.org/wiki/Q6501125","display_name":"Ground","level":2,"score":0.5756851434707642},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.4471582770347595},{"id":"https://openalex.org/C188147891","wikidata":"https://www.wikidata.org/wiki/Q147638","display_name":"Cognitive science","level":1,"score":0.4124971032142639},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3883543312549591},{"id":"https://openalex.org/C180747234","wikidata":"https://www.wikidata.org/wiki/Q23373","display_name":"Cognitive psychology","level":1,"score":0.36336493492126465},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.34460949897766113},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.33971208333969116},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.13666999340057373},{"id":"https://openalex.org/C169760540","wikidata":"https://www.wikidata.org/wiki/Q207011","display_name":"Neuroscience","level":1,"score":0.0},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/tcds.2025.3566649","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tcds.2025.3566649","pdf_url":null,"source":{"id":"https://openalex.org/S2488537894","display_name":"IEEE Transactions on Cognitive and Developmental Systems","issn_l":"2379-8920","issn":["2379-8920","2379-8939"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Cognitive and Developmental Systems","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G1774469180","display_name":null,"funder_award_id":"2018AAA0100803","funder_id":"https://openalex.org/F4320335777","funder_display_name":"National Key Research and Development Program of China"},{"id":"https://openalex.org/G1831201353","display_name":null,"funder_award_id":"62203150","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G3822855674","display_name":null,"funder_award_id":"BZ2024061","funder_id":"https://openalex.org/F4320327777","funder_display_name":"Jiangsu Provincial Key Research and Development Program"},{"id":"https://openalex.org/G4241470453","display_name":null,"funder_award_id":"62406102","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G4823643176","display_name":null,"funder_award_id":"62276090","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G7452483644","display_name":null,"funder_award_id":"BE2022160","funder_id":"https://openalex.org/F4320327777","funder_display_name":"Jiangsu Provincial Key Research and Development Program"}],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"},{"id":"https://openalex.org/F4320327777","display_name":"Jiangsu Provincial Key Research and Development Program","ror":null},{"id":"https://openalex.org/F4320335777","display_name":"National Key Research and Development Program of China","ror":null}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":48,"referenced_works":["https://openalex.org/W1976546217","https://openalex.org/W2111868822","https://openalex.org/W2120321538","https://openalex.org/W2152161678","https://openalex.org/W2159564241","https://openalex.org/W2251512949","https://openalex.org/W2799215407","https://openalex.org/W2883820570","https://openalex.org/W2895403383","https://openalex.org/W2931882224","https://openalex.org/W2955813853","https://openalex.org/W3035066680","https://openalex.org/W3119381934","https://openalex.org/W3160447289","https://openalex.org/W4285060105","https://openalex.org/W4309674289","https://openalex.org/W4312709198","https://openalex.org/W4365131747","https://openalex.org/W4382465657","https://openalex.org/W4383097638","https://openalex.org/W4383108457","https://openalex.org/W4386057783","https://openalex.org/W4386066126","https://openalex.org/W4386075493","https://openalex.org/W4386075561","https://openalex.org/W4389520756","https://openalex.org/W4390871819","https://openalex.org/W4390873481","https://openalex.org/W4390874575","https://openalex.org/W4390874582","https://openalex.org/W4391407102","https://openalex.org/W4394828156","https://openalex.org/W4396505862","https://openalex.org/W4399053202","https://openalex.org/W4401056676","https://openalex.org/W4401414087","https://openalex.org/W4401770219","https://openalex.org/W4402727137","https://openalex.org/W4402727764","https://openalex.org/W4402754134","https://openalex.org/W4402890475","https://openalex.org/W4402951641","https://openalex.org/W4403923795","https://openalex.org/W4404612908","https://openalex.org/W4404722477","https://openalex.org/W4405786114","https://openalex.org/W4407097799","https://openalex.org/W4410536678"],"related_works":["https://openalex.org/W2021787609","https://openalex.org/W1537063595","https://openalex.org/W2097328689","https://openalex.org/W4234899305","https://openalex.org/W2379604501","https://openalex.org/W2373854414","https://openalex.org/W2574906695","https://openalex.org/W2522183581","https://openalex.org/W2954371137","https://openalex.org/W2120744156"],"abstract_inverted_index":{"Vision":[0,57],"language":[1,75],"models":[2],"(VLMs)":[3],"have":[4],"demonstrated":[5],"strong":[6,161],"general":[7],"capabilities":[8,51],"and":[9,19,71,77,99,103,121,166,184],"achieved":[10],"great":[11],"success":[12],"in":[13,92,134,154,163],"areas":[14],"such":[15],"as":[16],"image":[17],"understanding":[18],"reasoning.":[20],"Visual":[21],"prompts":[22],"enhance":[23],"the":[24,49,78,142,145],"focus":[25],"of":[26,52,73,81,125,186],"VLMs":[27,82],"on":[28,144],"designated":[29],"areas,":[30],"but":[31],"their":[32],"fine-grained":[33],"grounding":[34,50,79,152],"has":[35,42,66,131],"not":[36],"been":[37],"fully":[38],"developed.":[39],"Recent":[40],"research":[41],"used":[43],"Set-of-Mark":[44,110],"(SoM)":[45],"approach":[46,194],"to":[47,88,141],"unleash":[48],"Generative":[53],"Pre-trained":[54],"Transformer-4":[55],"with":[56,68],"(GPT-4V),":[58],"achieving":[59],"significant":[60],"benchmark":[61],"performance.":[62],"However,":[63],"SoM":[64],"still":[65],"problems":[67],"label":[69],"offset":[70],"hallucination":[72],"vision":[74],"models,":[76],"ability":[80],"remains":[83],"limited,":[84],"making":[85],"it":[86,159],"challenging":[87],"handle":[89],"complex":[90,164],"scenarios":[91],"human-robot":[93,197],"interaction.":[94],"To":[95],"address":[96],"these":[97],"limitations":[98],"provide":[100],"more":[101],"accurate":[102],"less":[104],"hallucinatory":[105],"results,":[106],"we":[107,149],"propose":[108],"Contextual":[109],"(ConSoM),":[111],"a":[112,173,191],"new":[113],"SoM-based":[114],"prompting":[115],"mechanism":[116],"that":[117,129],"leverages":[118],"dual-image":[119],"inputs":[120],"contextual":[122],"semantic":[123],"information":[124],"images.":[126],"Experiments":[127],"demonstrate":[128],"ConSoM":[130,190],"distinct":[132],"advantages":[133],"visual":[135],"grounding,":[136],"improving":[137],"by":[138],"11%":[139],"compared":[140],"baseline":[143],"dataset":[146],"Refcocog.":[147],"Furthermore,":[148],"evaluated":[150],"ConSoM\u2019s":[151],"abilities":[153],"five":[155],"indoor":[156],"scenarios,":[157],"where":[158],"exhibited":[160],"robustness":[162],"environments":[165],"under":[167],"occlusion":[168],"conditions.":[169],"We":[170],"also":[171],"introduced":[172],"scalable":[174],"annotation":[175],"method":[176],"for":[177,195],"pixel-level":[178],"question-answering":[179],"dataset.":[180],"The":[181],"accuracy,":[182],"scalability,":[183],"depth":[185],"world":[187],"knowledge":[188],"make":[189],"highly":[192],"effective":[193],"future":[196],"interactions.":[198]},"counts_by_year":[{"year":2026,"cited_by_count":1}],"updated_date":"2026-03-27T05:58:40.876381","created_date":"2025-10-10T00:00:00"}
