{"id":"https://openalex.org/W4402716166","doi":"https://doi.org/10.1109/cvpr52733.2024.01227","title":"ViP-LLaVA: Making Large Multimodal Models Understand Arbitrary Visual Prompts","display_name":"ViP-LLaVA: Making Large Multimodal Models Understand Arbitrary Visual Prompts","publication_year":2024,"publication_date":"2024-06-16","ids":{"openalex":"https://openalex.org/W4402716166","doi":"https://doi.org/10.1109/cvpr52733.2024.01227"},"language":"en","primary_location":{"id":"doi:10.1109/cvpr52733.2024.01227","is_oa":false,"landing_page_url":"https://doi.org/10.1109/cvpr52733.2024.01227","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2024 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5081591501","display_name":"Mu Cai","orcid":"https://orcid.org/0009-0008-7967-9752"},"institutions":[{"id":"https://openalex.org/I135310074","display_name":"University of Wisconsin\u2013Madison","ror":"https://ror.org/01y2jtd41","country_code":"US","type":"education","lineage":["https://openalex.org/I135310074"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Mu Cai","raw_affiliation_strings":["University of Wisconsin-Madison"],"affiliations":[{"raw_affiliation_string":"University of Wisconsin-Madison","institution_ids":["https://openalex.org/I135310074"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100448802","display_name":"Haotian Liu","orcid":"https://orcid.org/0000-0003-2872-4740"},"institutions":[{"id":"https://openalex.org/I135310074","display_name":"University of Wisconsin\u2013Madison","ror":"https://ror.org/01y2jtd41","country_code":"US","type":"education","lineage":["https://openalex.org/I135310074"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Haotian Liu","raw_affiliation_strings":["University of Wisconsin-Madison"],"affiliations":[{"raw_affiliation_string":"University of Wisconsin-Madison","institution_ids":["https://openalex.org/I135310074"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5021827101","display_name":"Siva Karthik Mustikovela","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Siva Karthik Mustikovela","raw_affiliation_strings":["Cruise LLC"],"affiliations":[{"raw_affiliation_string":"Cruise LLC","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5067267836","display_name":"Gregory P. Meyer","orcid":"https://orcid.org/0000-0001-9444-5577"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gregory P. Meyer","raw_affiliation_strings":["Cruise LLC"],"affiliations":[{"raw_affiliation_string":"Cruise LLC","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5079961895","display_name":"Yuning Chai","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yuning Chai","raw_affiliation_strings":["Cruise LLC"],"affiliations":[{"raw_affiliation_string":"Cruise LLC","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5077824676","display_name":"Dennis Park","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Dennis Park","raw_affiliation_strings":["Cruise LLC"],"affiliations":[{"raw_affiliation_string":"Cruise LLC","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5072079475","display_name":"Yong Jae Lee","orcid":"https://orcid.org/0000-0001-9863-1270"},"institutions":[{"id":"https://openalex.org/I135310074","display_name":"University of Wisconsin\u2013Madison","ror":"https://ror.org/01y2jtd41","country_code":"US","type":"education","lineage":["https://openalex.org/I135310074"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Yong Jae Lee","raw_affiliation_strings":["University of Wisconsin-Madison"],"affiliations":[{"raw_affiliation_string":"University of Wisconsin-Madison","institution_ids":["https://openalex.org/I135310074"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5081591501"],"corresponding_institution_ids":["https://openalex.org/I135310074"],"apc_list":null,"apc_paid":null,"fwci":11.0217,"has_fulltext":false,"cited_by_count":42,"citation_normalized_percentile":{"value":0.99073417,"is_in_top_1_percent":true,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":99,"max":100},"biblio":{"volume":null,"issue":null,"first_page":"12914","last_page":"12923"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":0.9966999888420105,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.9936000108718872,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6638783812522888},{"id":"https://openalex.org/keywords/human\u2013computer-interaction","display_name":"Human\u2013computer interaction","score":0.5325464606285095},{"id":"https://openalex.org/keywords/computer-graphics","display_name":"Computer graphics (images)","score":0.4167737066745758},{"id":"https://openalex.org/keywords/visualization","display_name":"Visualization","score":0.4153191149234772},{"id":"https://openalex.org/keywords/computer-vision","display_name":"Computer vision","score":0.39251670241355896},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.36102062463760376}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6638783812522888},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.5325464606285095},{"id":"https://openalex.org/C121684516","wikidata":"https://www.wikidata.org/wiki/Q7600677","display_name":"Computer graphics (images)","level":1,"score":0.4167737066745758},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.4153191149234772},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.39251670241355896},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.36102062463760376}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/cvpr52733.2024.01227","is_oa":false,"landing_page_url":"https://doi.org/10.1109/cvpr52733.2024.01227","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2024 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":37,"referenced_works":["https://openalex.org/W1773149199","https://openalex.org/W1956340063","https://openalex.org/W2034966370","https://openalex.org/W2277195237","https://openalex.org/W2489434015","https://openalex.org/W2962749469","https://openalex.org/W2963115613","https://openalex.org/W2969876226","https://openalex.org/W2990138404","https://openalex.org/W2998356391","https://openalex.org/W3034727271","https://openalex.org/W3035688398","https://openalex.org/W3173909648","https://openalex.org/W4236965008","https://openalex.org/W4287586989","https://openalex.org/W4312960937","https://openalex.org/W4322718191","https://openalex.org/W4365606129","https://openalex.org/W4366330503","https://openalex.org/W4366850747","https://openalex.org/W4376312115","https://openalex.org/W4382490555","https://openalex.org/W4384268538","https://openalex.org/W4385644400","https://openalex.org/W4385645323","https://openalex.org/W4386185600","https://openalex.org/W4387294588","https://openalex.org/W4387436689","https://openalex.org/W4387596621","https://openalex.org/W4388482320","https://openalex.org/W4390873481","https://openalex.org/W4401024018","https://openalex.org/W6730250122","https://openalex.org/W6766904570","https://openalex.org/W6779473860","https://openalex.org/W6791353385","https://openalex.org/W6851607685"],"related_works":["https://openalex.org/W2058170566","https://openalex.org/W2755342338","https://openalex.org/W2772917594","https://openalex.org/W2775347418","https://openalex.org/W2166024367","https://openalex.org/W3116076068","https://openalex.org/W2229312674","https://openalex.org/W2951359407","https://openalex.org/W2079911747","https://openalex.org/W1969923398"],"abstract_inverted_index":{"While":[0],"existing":[1],"large":[2],"vision-language":[3],"multimodal":[4,47],"models":[5,124],"focus":[6],"on":[7,101],"whole":[8],"image":[9],"understanding,":[10],"there":[11],"is":[12],"a":[13,33,45,72,116],"prominent":[14],"gap":[15],"in":[16,125,135],"achieving":[17],"region-specific":[18],"comprehension.":[19],"Current":[20],"approaches":[21],"that":[22],"use":[23],"textual":[24],"coordinates":[25],"or":[26,76],"spatial":[27],"encodings":[28],"often":[29],"fail":[30],"to":[31,59,119],"provide":[32],"user-friendly":[34],"interface":[35],"for":[36,93],"visual":[37,54,84,127],"prompting.":[38],"To":[39],"address":[40],"this":[41,136],"challenge,":[42],"we":[43,113],"introduce":[44],"novel":[46],"model":[48,67,141],"capable":[49],"of":[50,123],"decoding":[51],"arbitrary":[52],"(free-form)":[53],"prompts.":[55],"This":[56],"allows":[57],"users":[58],"intuitively":[60],"mark":[61],"images":[62],"and":[63,107,140],"interact":[64],"with":[65],"the":[66,87,91,121],"using":[68],"natural":[69],"cues":[70],"like":[71,104],"\u201cred":[73],"bounding":[74],"box\u201d":[75],"\u201cpointed":[77],"arrow\u201d.":[78],"Our":[79],"simple":[80],"design":[81],"directly":[82],"overlays":[83],"markers":[85],"onto":[86],"RGB":[88],"image,":[89],"eliminating":[90],"need":[92],"complex":[94],"region":[95],"encodings,":[96],"yet":[97],"achieves":[98],"state-of-the-art":[99],"performance":[100],"region-understanding":[102],"tasks":[103],"Visual7W,":[105],"PointQA,":[106],"Visual":[108],"Commonsense":[109],"Reasoning":[110],"benchmark.":[111],"Furthermore,":[112],"present":[114],"ViP-Bench,":[115],"comprehensive":[117],"benchmark":[118],"assess":[120],"capability":[122],"understanding":[126],"prompts":[128],"across":[129],"multiple":[130],"dimensions,":[131],"enabling":[132],"future":[133],"research":[134],"domain.":[137],"Code,":[138],"data,":[139],"are":[142],"publicly":[143],"available.":[144]},"counts_by_year":[{"year":2026,"cited_by_count":2},{"year":2025,"cited_by_count":29},{"year":2024,"cited_by_count":11}],"updated_date":"2025-12-21T01:58:51.020947","created_date":"2025-10-10T00:00:00"}
