{"id":"https://openalex.org/W4403381960","doi":"https://doi.org/10.48550/arxiv.2410.08209","title":"Emergent Visual Grounding in Large Multimodal Models Without Grounding Supervision","display_name":"Emergent Visual Grounding in Large Multimodal Models Without Grounding Supervision","publication_year":2024,"publication_date":"2024-10-10","ids":{"openalex":"https://openalex.org/W4403381960","doi":"https://doi.org/10.48550/arxiv.2410.08209"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2410.08209","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2410.08209","pdf_url":"https://arxiv.org/pdf/2410.08209","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by-sa","license_id":"https://openalex.org/licenses/cc-by-sa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2410.08209","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Cao, Shengcao","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Cao, Shengcao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5112987393","display_name":"L. Gui","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gui, Liang-Yan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5102952938","display_name":"Yu-Xiong Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Yu-Xiong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":3,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T13018","display_name":"Seismology and Earthquake Studies","score":0.7669000029563904,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T13018","display_name":"Seismology and Earthquake Studies","score":0.7669000029563904,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/ground","display_name":"Ground","score":0.8869079351425171},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.4512863755226135},{"id":"https://openalex.org/keywords/pixel","display_name":"Pixel","score":0.42207956314086914},{"id":"https://openalex.org/keywords/engineering","display_name":"Engineering","score":0.2929248809814453},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.2524740695953369},{"id":"https://openalex.org/keywords/electrical-engineering","display_name":"Electrical engineering","score":0.2346077859401703}],"concepts":[{"id":"https://openalex.org/C168993435","wikidata":"https://www.wikidata.org/wiki/Q6501125","display_name":"Ground","level":2,"score":0.8869079351425171},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.4512863755226135},{"id":"https://openalex.org/C160633673","wikidata":"https://www.wikidata.org/wiki/Q355198","display_name":"Pixel","level":2,"score":0.42207956314086914},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.2929248809814453},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.2524740695953369},{"id":"https://openalex.org/C119599485","wikidata":"https://www.wikidata.org/wiki/Q43035","display_name":"Electrical engineering","level":1,"score":0.2346077859401703}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2410.08209","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2410.08209","pdf_url":"https://arxiv.org/pdf/2410.08209","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by-sa","license_id":"https://openalex.org/licenses/cc-by-sa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"doi:10.48550/arxiv.2410.08209","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2410.08209","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2410.08209","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2410.08209","pdf_url":"https://arxiv.org/pdf/2410.08209","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by-sa","license_id":"https://openalex.org/licenses/cc-by-sa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G2926720356","display_name":null,"funder_award_id":"32799","funder_id":"https://openalex.org/F4320332299","funder_display_name":"National Institute of Food and Agriculture"},{"id":"https://openalex.org/G539587745","display_name":null,"funder_award_id":"2106825","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"},{"id":"https://openalex.org/G5620962805","display_name":null,"funder_award_id":"67021","funder_id":"https://openalex.org/F4320332299","funder_display_name":"National Institute of Food and Agriculture"},{"id":"https://openalex.org/G6770826516","display_name":null,"funder_award_id":"2020-67021-32799","funder_id":"https://openalex.org/F4320332299","funder_display_name":"National Institute of Food and Agriculture"}],"funders":[{"id":"https://openalex.org/F4320306076","display_name":"National Science Foundation","ror":"https://ror.org/021nxhr62"},{"id":"https://openalex.org/F4320310598","display_name":"Amazon Web Services","ror":"https://ror.org/04mv4n011"},{"id":"https://openalex.org/F4320312143","display_name":"National Centre for Supercomputing Applications","ror":"https://ror.org/03r10zj06"},{"id":"https://openalex.org/F4320315934","display_name":"Toyota Research Institute","ror":null},{"id":"https://openalex.org/F4320332299","display_name":"National Institute of Food and Agriculture","ror":"https://ror.org/05qx3fv49"},{"id":"https://openalex.org/F4320337377","display_name":"Office of Advanced Cyberinfrastructure","ror":"https://ror.org/04nh1dc89"}],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4403381960.pdf","grobid_xml":"https://content.openalex.org/works/W4403381960.grobid-xml"},"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2748952813","https://openalex.org/W2021787609","https://openalex.org/W2390279801","https://openalex.org/W1537063595","https://openalex.org/W2097328689","https://openalex.org/W2358668433","https://openalex.org/W4234899305","https://openalex.org/W4396701345","https://openalex.org/W2379604501"],"abstract_inverted_index":{"Current":[0],"large":[1],"multimodal":[2],"models":[3],"(LMMs)":[4],"face":[5],"challenges":[6],"in":[7,39,42],"grounding,":[8,53],"which":[9,59],"requires":[10],"the":[11,22,35,73,89,97,105,159],"model":[12,162],"to":[13,17,21,66,71,88],"relate":[14],"language":[15],"components":[16],"visual":[18,84,92,130],"entities.":[19],"Contrary":[20],"common":[23],"practice":[24],"that":[25,34],"fine-tunes":[26],"LMMs":[27,43,65,137],"with":[28,96,135],"additional":[29],"grounding":[30,36,47,74,136,147,156],"supervision,":[31,157],"we":[32,54,76,143],"find":[33],"ability":[37],"can":[38],"fact":[40],"emerge":[41],"trained":[44,95],"without":[45,154],"explicit":[46],"supervision.":[48,100],"To":[49],"reveal":[50],"this":[51],"emerging":[52],"introduce":[55],"an":[56,79],"\"attend-and-segment\"":[57],"method":[58],"leverages":[60],"attention":[61],"maps":[62],"from":[63],"standard":[64,90],"perform":[67],"pixel-level":[68],"segmentation.":[69],"Furthermore,":[70],"enhance":[72],"ability,":[75],"propose":[77],"DIFFLMM,":[78],"LMM":[80],"utilizing":[81],"a":[82,145],"diffusion-based":[83],"encoder,":[85,93],"as":[86],"opposed":[87],"CLIP":[91],"and":[94,107,119,128,138],"same":[98],"weak":[99],"Without":[101],"being":[102],"constrained":[103],"by":[104],"biases":[106],"limited":[108],"scale":[109],"of":[110],"grounding-specific":[111,127],"supervision":[112],"data,":[113],"our":[114],"approach":[115],"is":[116],"more":[117],"generalizable":[118],"scalable.":[120],"We":[121],"achieve":[122,144],"competitive":[123],"performance":[124],"on":[125,150],"both":[126],"general":[129],"question":[131],"answering":[132],"benchmarks,":[133],"compared":[134],"generalist":[139],"LMMs,":[140],"respectively.":[141],"Notably,":[142],"44.2":[146],"mask":[148],"recall":[149],"grounded":[151],"conversation":[152],"generation":[153],"any":[155],"outperforming":[158],"extensively":[160],"supervised":[161],"GLaMM.":[163],"Project":[164],"page:":[165],"https://GroundLMM-ICCV.github.io.":[166]},"counts_by_year":[],"updated_date":"2026-05-09T13:55:54.758798","created_date":"2024-10-14T00:00:00"}
