{"id":"https://openalex.org/W7162417772","doi":"https://doi.org/10.48550/arxiv.2605.25901","title":"AgentGrounder: Zero-Shot 3D Visual Pointcloud Grounding using Multimodal Language Models","display_name":"AgentGrounder: Zero-Shot 3D Visual Pointcloud Grounding using Multimodal Language Models","publication_year":2026,"publication_date":"2026-05-25","ids":{"openalex":"https://openalex.org/W7162417772","doi":"https://doi.org/10.48550/arxiv.2605.25901"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.25901","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.25901","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.25901","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5137080478","display_name":"Cuong Huynh","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Huynh, Cuong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134884536","display_name":"Maxim Popov","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Popov, Maxim","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5120190625","display_name":"Denis Gridusov","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gridusov, Denis","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5134925788","display_name":"Sergey Kolyubin","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kolyubin, Sergey","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":4,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.972599983215332,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.972599983215332,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10709","display_name":"Social Robot Interaction and HRI","score":0.0024999999441206455,"subfield":{"id":"https://openalex.org/subfields/3207","display_name":"Social Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.002400000113993883,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/leverage","display_name":"Leverage (statistics)","score":0.5310999751091003},{"id":"https://openalex.org/keywords/segmentation","display_name":"Segmentation","score":0.5027999877929688},{"id":"https://openalex.org/keywords/rendering","display_name":"Rendering (computer graphics)","score":0.4957999885082245},{"id":"https://openalex.org/keywords/visual-reasoning","display_name":"Visual reasoning","score":0.49480000138282776},{"id":"https://openalex.org/keywords/visualization","display_name":"Visualization","score":0.35269999504089355},{"id":"https://openalex.org/keywords/point-cloud","display_name":"Point cloud","score":0.3497999906539917},{"id":"https://openalex.org/keywords/bounding-overwatch","display_name":"Bounding overwatch","score":0.33219999074935913},{"id":"https://openalex.org/keywords/point","display_name":"Point (geometry)","score":0.31779998540878296},{"id":"https://openalex.org/keywords/matching","display_name":"Matching (statistics)","score":0.3160000145435333},{"id":"https://openalex.org/keywords/natural-language","display_name":"Natural language","score":0.31279999017715454}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7883999943733215},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.618399977684021},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.5616999864578247},{"id":"https://openalex.org/C153083717","wikidata":"https://www.wikidata.org/wiki/Q6535263","display_name":"Leverage (statistics)","level":2,"score":0.5310999751091003},{"id":"https://openalex.org/C89600930","wikidata":"https://www.wikidata.org/wiki/Q1423946","display_name":"Segmentation","level":2,"score":0.5027999877929688},{"id":"https://openalex.org/C205711294","wikidata":"https://www.wikidata.org/wiki/Q176953","display_name":"Rendering (computer graphics)","level":2,"score":0.4957999885082245},{"id":"https://openalex.org/C2777508537","wikidata":"https://www.wikidata.org/wiki/Q7936620","display_name":"Visual reasoning","level":2,"score":0.49480000138282776},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.35269999504089355},{"id":"https://openalex.org/C131979681","wikidata":"https://www.wikidata.org/wiki/Q1899648","display_name":"Point cloud","level":2,"score":0.3497999906539917},{"id":"https://openalex.org/C63584917","wikidata":"https://www.wikidata.org/wiki/Q333286","display_name":"Bounding overwatch","level":2,"score":0.33219999074935913},{"id":"https://openalex.org/C28719098","wikidata":"https://www.wikidata.org/wiki/Q44946","display_name":"Point (geometry)","level":2,"score":0.31779998540878296},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.3160000145435333},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.31279999017715454},{"id":"https://openalex.org/C2781238097","wikidata":"https://www.wikidata.org/wiki/Q175026","display_name":"Object (grammar)","level":2,"score":0.3118000030517578},{"id":"https://openalex.org/C124504099","wikidata":"https://www.wikidata.org/wiki/Q56933","display_name":"Image segmentation","level":3,"score":0.3089999854564667},{"id":"https://openalex.org/C44291984","wikidata":"https://www.wikidata.org/wiki/Q1074173","display_name":"Question answering","level":2,"score":0.30250000953674316},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.29910001158714294},{"id":"https://openalex.org/C100609095","wikidata":"https://www.wikidata.org/wiki/Q1335050","display_name":"Embodied cognition","level":2,"score":0.29440000653266907},{"id":"https://openalex.org/C2776151529","wikidata":"https://www.wikidata.org/wiki/Q3045304","display_name":"Object detection","level":3,"score":0.2939999997615814},{"id":"https://openalex.org/C64876066","wikidata":"https://www.wikidata.org/wiki/Q5141226","display_name":"Cognitive neuroscience of visual object recognition","level":3,"score":0.28619998693466187},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.28519999980926514},{"id":"https://openalex.org/C108882727","wikidata":"https://www.wikidata.org/wiki/Q2991685","display_name":"Solid modeling","level":2,"score":0.27900001406669617},{"id":"https://openalex.org/C77618280","wikidata":"https://www.wikidata.org/wiki/Q1155772","display_name":"Scheme (mathematics)","level":2,"score":0.272599995136261},{"id":"https://openalex.org/C178253425","wikidata":"https://www.wikidata.org/wiki/Q162668","display_name":"Visual perception","level":3,"score":0.271699994802475},{"id":"https://openalex.org/C2776863239","wikidata":"https://www.wikidata.org/wiki/Q7936601","display_name":"Visual hull","level":3,"score":0.26919999718666077},{"id":"https://openalex.org/C3019007443","wikidata":"https://www.wikidata.org/wiki/Q568742","display_name":"3d model","level":2,"score":0.26739999651908875},{"id":"https://openalex.org/C147037132","wikidata":"https://www.wikidata.org/wiki/Q6865426","display_name":"Minimum bounding box","level":3,"score":0.2639000117778778},{"id":"https://openalex.org/C109950114","wikidata":"https://www.wikidata.org/wiki/Q4464732","display_name":"3D reconstruction","level":2,"score":0.26190000772476196},{"id":"https://openalex.org/C92757383","wikidata":"https://www.wikidata.org/wiki/Q382497","display_name":"Affine transformation","level":2,"score":0.2581000030040741},{"id":"https://openalex.org/C168993435","wikidata":"https://www.wikidata.org/wiki/Q6501125","display_name":"Ground","level":2,"score":0.2524000108242035}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.25901","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.25901","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.25901","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.25901","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"display_name":"Quality Education","score":0.7426087260246277,"id":"https://metadata.un.org/sdg/4"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"3D":[0,17,54,62,75,89,103,225],"Visual":[1],"Grounding":[2],"(3DVG)":[3],"is":[4,142,229],"an":[5,84,93,108],"essential":[6],"capability":[7],"for":[8,223],"embodied":[9],"AI,":[10],"requiring":[11],"agents":[12],"to":[13,91],"localize":[14],"objects":[15],"in":[16,183],"scenes":[18],"based":[19],"on":[20,36,69,130,169,189,193,200],"natural":[21],"language":[22],"descriptions.":[23],"Recent":[24],"zero-shot":[25,61,175],"methods":[26],"leverage":[27],"2D":[28],"vision-language":[29],"models":[30],"(LVLMs).":[31],"However,":[32],"they":[33],"often":[34],"rely":[35],"existing":[37],"sets":[38],"of":[39],"multi-view":[40],"images":[41],"and":[42,48,106,126,156,171,177,191,213,220],"struggle":[43],"with":[44,98,145,164,195],"the":[45,121],"limited":[46],"semantic":[47,101],"spatial":[49],"details":[50],"provided":[51],"by":[52,160],"standard":[53],"segmentation":[55],"tools.":[56],"We":[57,167],"present":[58],"$\\textbf{AgentGrounder}$,":[59],"a":[60,80,174,196,218],"visual":[63,134,215],"grounding":[64],"framework":[65],"that":[66,87,112,207],"operates":[67],"directly":[68],"colored":[70],"point":[71],"clouds":[72],"without":[73],"task-specific":[74],"training.":[76],"Our":[77,227],"approach":[78],"follows":[79],"two-stage":[81],"design:":[82],"(1)":[83],"offline":[85],"stage":[86],"applies":[88],"model":[90],"build":[92],"Object":[94],"Lookup":[95],"Table":[96],"(OLT)":[97],"instance":[99],"IDs,":[100],"labels,":[102],"bounding":[104],"boxes;":[105],"(2)":[107],"online":[109],"tool-driven":[110],"agent":[111],"decomposes":[113],"each":[114],"query,":[115],"retrieves":[116],"only":[117],"relevant":[118],"candidates":[119],"from":[120],"OLT,":[122],"performs":[123],"geometric":[124,211],"scoring,":[125],"triggers":[127],"image":[128],"rendering":[129],"demand":[131],"when":[132],"additional":[133],"evidence":[135],"(e.g.,":[136],"color,":[137],"material,":[138],"or":[139],"viewpoint-sensitive":[140],"cues)":[141],"required.":[143],"Compared":[144],"fixed":[146],"anchor-target":[147],"matching":[148,154],"pipelines,":[149],"this":[150],"design":[151],"reduces":[152],"cascading":[153],"errors":[155],"improves":[157],"context-window":[158],"efficiency":[159],"avoiding":[161],"prompts":[162],"overloaded":[163],"irrelevant":[165],"objects.":[166],"evaluate":[168],"ScanRefer":[170,190],"Nr3D":[172,201],"under":[173],"setting":[176],"observe":[178],"consistent":[179],"improvements":[180],"over":[181],"SeeGround":[182],"our":[184],"setup,":[185],"including":[186],"+2.5%":[187],"Acc@0.5":[188],"+6.3%":[192,198],"Nr3D,":[194],"notable":[197],"gain":[199],"view-independent":[202],"queries.":[203],"These":[204],"results":[205],"show":[206],"combining":[208],"selective":[209],"retrieval,":[210],"reasoning,":[212],"adaptive":[214],"inspection":[216],"yields":[217],"practical":[219],"robust":[221],"foundation":[222],"open-vocabulary":[224],"grounding.":[226],"code":[228],"available":[230],"at":[231],"https://github.com/be2rlab/AgentGrounder.":[232]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-05-27T00:00:00"}
