{"id":"https://openalex.org/W7156905782","doi":"https://doi.org/10.48550/arxiv.2604.24036","title":"Robust Grounding with MLLMs Against Occlusion and Small Objects via Language-Guided Semantic Cues","display_name":"Robust Grounding with MLLMs Against Occlusion and Small Objects via Language-Guided Semantic Cues","publication_year":2026,"publication_date":"2026-04-27","ids":{"openalex":"https://openalex.org/W7156905782","doi":"https://doi.org/10.48550/arxiv.2604.24036"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.24036","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.24036","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.24036","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5134800262","display_name":"Beomchan Park","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Park, Beomchan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5106418046","display_name":"Seongho Kim","orcid":"https://orcid.org/0009-0000-5803-6629"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kim, Seongho","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134758509","display_name":"Hyunjun Kim","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kim, Hyunjun","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134789722","display_name":"Sungjune Park","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Park, Sungjune","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5134791197","display_name":"Yong Man Ro","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ro, Yong Man","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":5,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9003999829292297,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9003999829292297,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.02199999988079071,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.015200000256299973,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/robustness","display_name":"Robustness (evolution)","score":0.6345999836921692},{"id":"https://openalex.org/keywords/semantics","display_name":"Semantics (computer science)","score":0.5680999755859375},{"id":"https://openalex.org/keywords/object","display_name":"Object (grammar)","score":0.5250999927520752},{"id":"https://openalex.org/keywords/pipeline","display_name":"Pipeline (software)","score":0.4361000061035156},{"id":"https://openalex.org/keywords/ground","display_name":"Ground","score":0.43309998512268066},{"id":"https://openalex.org/keywords/cognitive-neuroscience-of-visual-object-recognition","display_name":"Cognitive neuroscience of visual object recognition","score":0.4108000099658966},{"id":"https://openalex.org/keywords/sensory-cue","display_name":"Sensory cue","score":0.337799996137619}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7366999983787537},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6557000279426575},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.6345999836921692},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.5680999755859375},{"id":"https://openalex.org/C2781238097","wikidata":"https://www.wikidata.org/wiki/Q175026","display_name":"Object (grammar)","level":2,"score":0.5250999927520752},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.5006999969482422},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.49380001425743103},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.4361000061035156},{"id":"https://openalex.org/C168993435","wikidata":"https://www.wikidata.org/wiki/Q6501125","display_name":"Ground","level":2,"score":0.43309998512268066},{"id":"https://openalex.org/C64876066","wikidata":"https://www.wikidata.org/wiki/Q5141226","display_name":"Cognitive neuroscience of visual object recognition","level":3,"score":0.4108000099658966},{"id":"https://openalex.org/C111370547","wikidata":"https://www.wikidata.org/wiki/Q7451120","display_name":"Sensory cue","level":2,"score":0.337799996137619},{"id":"https://openalex.org/C117978034","wikidata":"https://www.wikidata.org/wiki/Q5422192","display_name":"Extractor","level":2,"score":0.28940001130104065},{"id":"https://openalex.org/C2780103172","wikidata":"https://www.wikidata.org/wiki/Q1309721","display_name":"Visual Objects","level":3,"score":0.288100004196167},{"id":"https://openalex.org/C198942812","wikidata":"https://www.wikidata.org/wiki/Q496618","display_name":"Semantic property","level":2,"score":0.27309998869895935},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.26660001277923584},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.26440000534057617},{"id":"https://openalex.org/C3019973339","wikidata":"https://www.wikidata.org/wiki/Q899523","display_name":"Object based","level":3,"score":0.25859999656677246},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.2578999996185303},{"id":"https://openalex.org/C130318100","wikidata":"https://www.wikidata.org/wiki/Q2268914","display_name":"Semantic similarity","level":2,"score":0.25270000100135803}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.24036","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.24036","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.24036","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.24036","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"While":[0],"Multimodal":[1],"Large":[2],"Language":[3],"Models":[4],"(MLLMs)":[5],"have":[6],"enhanced":[7],"grounding":[8,36,135],"capabilities":[9],"in":[10,15,137],"general":[11],"scenes,":[12],"their":[13],"robustness":[14],"crowded":[16,138],"scenes":[17,21],"remains":[18],"underexplored.":[19],"Crowded":[20],"entail":[22],"visual":[23,88,116],"challenges":[24],"(i.e.,":[25],"occlusion":[26],"and":[27,34,47,124],"small":[28],"objects),":[29],"which":[30],"impair":[31],"object":[32,49,120],"semantics":[33],"degrade":[35],"performance.":[37],"In":[38,51],"contrast,":[39],"language":[40],"expressions":[41],"are":[42,111],"immune":[43],"to":[44,80,102,118],"such":[45,63],"degradation":[46],"preserve":[48],"semantics.":[50,121],"light":[52],"of":[53,84,90],"these":[54,96],"observations,":[55],"we":[56],"propose":[57],"a":[58,75],"novel":[59],"method":[60],"that":[61,127],"overcomes":[62],"constraints":[64],"by":[65],"leveraging":[66],"Language-Guided":[67],"Semantic":[68,76],"Cues":[69],"(LGSCs).":[70],"Specifically,":[71],"our":[72],"approach":[73],"introduces":[74],"Cue":[77],"Extractor":[78],"(SCE)":[79],"derive":[81],"semantic":[82,107],"cues":[83,97],"objects":[85],"from":[86],"the":[87,114],"pipeline":[89,117],"an":[91,131],"MLLM.":[92],"We":[93],"then":[94],"guide":[95],"using":[98],"corresponding":[99],"text":[100],"embeddings":[101],"produce":[103],"LGSCs":[104,129],"as":[105],"linguistic":[106],"priors.":[108],"Subsequently,":[109],"they":[110],"reintegrated":[112],"into":[113,130],"original":[115],"refine":[119],"Extensive":[122],"experiments":[123],"analyses":[125],"demonstrate":[126],"incorporating":[128],"MLLM":[132],"effectively":[133],"improves":[134],"accuracy":[136],"scenes.":[139]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-04-29T00:00:00"}
