{"id":"https://openalex.org/W4402754165","doi":"https://doi.org/10.1109/cvpr52733.2024.00916","title":"Visual Program Distillation: Distilling Tools and Programmatic Reasoning into Vision-Language Models","display_name":"Visual Program Distillation: Distilling Tools and Programmatic Reasoning into Vision-Language Models","publication_year":2024,"publication_date":"2024-06-16","ids":{"openalex":"https://openalex.org/W4402754165","doi":"https://doi.org/10.1109/cvpr52733.2024.00916"},"language":"en","primary_location":{"id":"doi:10.1109/cvpr52733.2024.00916","is_oa":false,"landing_page_url":"https://doi.org/10.1109/cvpr52733.2024.00916","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2024 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5069782842","display_name":"Yushi Hu","orcid":"https://orcid.org/0000-0002-7540-2413"},"institutions":[{"id":"https://openalex.org/I1291425158","display_name":"Google (United States)","ror":"https://ror.org/00njsd438","country_code":"US","type":"company","lineage":["https://openalex.org/I1291425158","https://openalex.org/I4210128969"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Yushi Hu","raw_affiliation_strings":["Google Research"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Google Research","institution_ids":["https://openalex.org/I1291425158"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5080565982","display_name":"Otilia Stretcu","orcid":"https://orcid.org/0000-0001-7141-2916"},"institutions":[{"id":"https://openalex.org/I1291425158","display_name":"Google (United States)","ror":"https://ror.org/00njsd438","country_code":"US","type":"company","lineage":["https://openalex.org/I1291425158","https://openalex.org/I4210128969"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Otilia Stretcu","raw_affiliation_strings":["Google Research"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Google Research","institution_ids":["https://openalex.org/I1291425158"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5024393012","display_name":"Chun-Ta Lu","orcid":"https://orcid.org/0000-0001-8573-4975"},"institutions":[{"id":"https://openalex.org/I1291425158","display_name":"Google (United States)","ror":"https://ror.org/00njsd438","country_code":"US","type":"company","lineage":["https://openalex.org/I1291425158","https://openalex.org/I4210128969"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Chun-Ta Lu","raw_affiliation_strings":["Google Research"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Google Research","institution_ids":["https://openalex.org/I1291425158"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5110385376","display_name":"Krishnamurthy Viswanathan","orcid":null},"institutions":[{"id":"https://openalex.org/I1291425158","display_name":"Google (United States)","ror":"https://ror.org/00njsd438","country_code":"US","type":"company","lineage":["https://openalex.org/I1291425158","https://openalex.org/I4210128969"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Krishnamurthy Viswanathan","raw_affiliation_strings":["Google Research"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Google Research","institution_ids":["https://openalex.org/I1291425158"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101848808","display_name":"Kenji Hata","orcid":"https://orcid.org/0000-0002-3119-582X"},"institutions":[{"id":"https://openalex.org/I1291425158","display_name":"Google (United States)","ror":"https://ror.org/00njsd438","country_code":"US","type":"company","lineage":["https://openalex.org/I1291425158","https://openalex.org/I4210128969"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Kenji Hata","raw_affiliation_strings":["Google Research"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Google Research","institution_ids":["https://openalex.org/I1291425158"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5009947623","display_name":"Enming Luo","orcid":"https://orcid.org/0000-0002-6887-1094"},"institutions":[{"id":"https://openalex.org/I1291425158","display_name":"Google (United States)","ror":"https://ror.org/00njsd438","country_code":"US","type":"company","lineage":["https://openalex.org/I1291425158","https://openalex.org/I4210128969"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Enming Luo","raw_affiliation_strings":["Google Research"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Google Research","institution_ids":["https://openalex.org/I1291425158"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5032451496","display_name":"Ranjay Krishna","orcid":"https://orcid.org/0000-0001-8784-2531"},"institutions":[{"id":"https://openalex.org/I201448701","display_name":"University of Washington","ror":"https://ror.org/00cvxb145","country_code":"US","type":"education","lineage":["https://openalex.org/I201448701"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Ranjay Krishna","raw_affiliation_strings":["University of Washington"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"University of Washington","institution_ids":["https://openalex.org/I201448701"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5017633018","display_name":"Ariel Fuxman","orcid":"https://orcid.org/0009-0003-6760-997X"},"institutions":[{"id":"https://openalex.org/I1291425158","display_name":"Google (United States)","ror":"https://ror.org/00njsd438","country_code":"US","type":"company","lineage":["https://openalex.org/I1291425158","https://openalex.org/I4210128969"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Ariel Fuxman","raw_affiliation_strings":["Google Research"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Google Research","institution_ids":["https://openalex.org/I1291425158"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":8,"corresponding_author_ids":["https://openalex.org/A5069782842"],"corresponding_institution_ids":["https://openalex.org/I1291425158"],"apc_list":null,"apc_paid":null,"fwci":5.5824,"has_fulltext":false,"cited_by_count":24,"citation_normalized_percentile":{"value":0.97011088,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":98,"max":100},"biblio":{"volume":null,"issue":null,"first_page":"9590","last_page":"9601"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9624999761581421,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.9513000249862671,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6666034460067749},{"id":"https://openalex.org/keywords/distillation","display_name":"Distillation","score":0.648552417755127},{"id":"https://openalex.org/keywords/visual-language","display_name":"Visual language","score":0.5014712810516357},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4181002676486969},{"id":"https://openalex.org/keywords/programming-language","display_name":"Programming language","score":0.3930639624595642},{"id":"https://openalex.org/keywords/linguistics","display_name":"Linguistics","score":0.10693696141242981},{"id":"https://openalex.org/keywords/chemistry","display_name":"Chemistry","score":0.07123208045959473}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6666034460067749},{"id":"https://openalex.org/C204030448","wikidata":"https://www.wikidata.org/wiki/Q101017","display_name":"Distillation","level":2,"score":0.648552417755127},{"id":"https://openalex.org/C2780878386","wikidata":"https://www.wikidata.org/wiki/Q1659648","display_name":"Visual language","level":2,"score":0.5014712810516357},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4181002676486969},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.3930639624595642},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.10693696141242981},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.07123208045959473},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C178790620","wikidata":"https://www.wikidata.org/wiki/Q11351","display_name":"Organic chemistry","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/cvpr52733.2024.00916","is_oa":false,"landing_page_url":"https://doi.org/10.1109/cvpr52733.2024.00916","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2024 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":92,"referenced_works":["https://openalex.org/W1522301498","https://openalex.org/W2277195237","https://openalex.org/W2560730294","https://openalex.org/W2606722458","https://openalex.org/W2947312908","https://openalex.org/W2952122856","https://openalex.org/W2963518342","https://openalex.org/W2968124245","https://openalex.org/W2979382951","https://openalex.org/W2990138404","https://openalex.org/W3004268082","https://openalex.org/W3042080800","https://openalex.org/W3168867926","https://openalex.org/W3199693760","https://openalex.org/W4221143046","https://openalex.org/W4221166856","https://openalex.org/W4224308101","https://openalex.org/W4225323055","https://openalex.org/W4226352076","https://openalex.org/W4282919422","https://openalex.org/W4283208789","https://openalex.org/W4292779060","https://openalex.org/W4296406182","https://openalex.org/W4297816851","https://openalex.org/W4303649020","https://openalex.org/W4304195432","https://openalex.org/W4304697829","https://openalex.org/W4309953112","https://openalex.org/W4312846625","https://openalex.org/W4318718936","https://openalex.org/W4320165837","https://openalex.org/W4320458302","https://openalex.org/W4320561490","https://openalex.org/W4320722432","https://openalex.org/W4321485193","https://openalex.org/W4327526607","https://openalex.org/W4353113046","https://openalex.org/W4361866031","https://openalex.org/W4366330503","https://openalex.org/W4366566341","https://openalex.org/W4366850747","https://openalex.org/W4376312115","https://openalex.org/W4378498608","https://openalex.org/W4378506904","https://openalex.org/W4378942562","https://openalex.org/W4380993086","https://openalex.org/W4381249600","https://openalex.org/W4382490555","https://openalex.org/W4382491206","https://openalex.org/W4384918448","https://openalex.org/W4385569780","https://openalex.org/W4385570707","https://openalex.org/W4385571011","https://openalex.org/W4385572016","https://openalex.org/W4385572634","https://openalex.org/W4385573341","https://openalex.org/W4385965642","https://openalex.org/W4386065691","https://openalex.org/W4386185600","https://openalex.org/W4386655779","https://openalex.org/W4387294588","https://openalex.org/W4387561532","https://openalex.org/W4387688013","https://openalex.org/W4387723654","https://openalex.org/W4387801640","https://openalex.org/W4388482029","https://openalex.org/W4388555312","https://openalex.org/W4388716391","https://openalex.org/W4389523832","https://openalex.org/W4390872499","https://openalex.org/W4390872747","https://openalex.org/W4390873312","https://openalex.org/W4390874575","https://openalex.org/W4392637287","https://openalex.org/W4402727764","https://openalex.org/W4402754134","https://openalex.org/W6631190155","https://openalex.org/W6755935031","https://openalex.org/W6767279747","https://openalex.org/W6778883912","https://openalex.org/W6810447287","https://openalex.org/W6811467201","https://openalex.org/W6839091850","https://openalex.org/W6845478269","https://openalex.org/W6845718872","https://openalex.org/W6846254642","https://openalex.org/W6849177959","https://openalex.org/W6849257913","https://openalex.org/W6850204008","https://openalex.org/W6851592950","https://openalex.org/W6852567072","https://openalex.org/W6857614378"],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2748952813","https://openalex.org/W2390279801","https://openalex.org/W2358668433","https://openalex.org/W4396701345","https://openalex.org/W2376932109","https://openalex.org/W2001405890","https://openalex.org/W4396696052","https://openalex.org/W1924502949","https://openalex.org/W4246664336"],"abstract_inverted_index":{"Solving":[0],"complex":[1,106,188],"visual":[2,107],"tasks":[3,35,108],"such":[4,34],"as":[5],"\u201cWho":[6],"invented":[7],"the":[8,12,69,116,150,166],"musical":[9],"instrument":[10],"on":[11,217],"right?\u201d":[13],"involves":[14],"a":[15,37,99,110,137,146,158],"composition":[16],"of":[17,104,119,149],"skills:":[18],"understanding":[19],"space,":[20],"recognizing":[21],"instruments,":[22],"and":[23,63,84,133,174,197,213],"also":[24,205,224],"retrieving":[25],"prior":[26,182],"knowledge.":[27],"Recent":[28],"work":[29],"shows":[30],"promise":[31],"by":[32,121],"decomposing":[33],"using":[36,122],"large":[38],"language":[39,147],"model":[40,101,210],"(LLM)":[41],"into":[42,145,157],"an":[43,93],"executable":[44],"program":[45,144],"that":[46,97,163,207,221],"invokes":[47],"specialized":[48,70],"vision":[49,189],"models.":[50],"However,":[51],"generated":[52],"programs":[53],"are":[54,64,130,154],"error-prone:":[55],"they":[56,76],"omit":[57],"necessary":[58],"steps,":[59,152],"include":[60],"spurious":[61],"ones,":[62],"unable":[65],"to":[66,124,135,169,228],"recover":[67],"when":[68],"models":[71],"give":[72],"incor-rect":[73],"outputs.":[74],"Moreover,":[75],"require":[77],"loading":[78],"multiple":[79,126],"models,":[80],"incurring":[81],"high":[82],"latency":[83],"computation":[85],"costs.":[86],"We":[87],"propose":[88],"Visual":[89],"Program":[90],"Distillation":[91],"(VPD),":[92],"instruction":[94],"tuning":[95],"framework":[96],"produces":[98],"vision-language":[100],"(VLM)":[102],"ca-pable":[103],"solving":[105],"with":[109,202,231],"single":[111],"forward":[112],"pass.":[113],"VPD":[114,164,208,222],"distills":[115],"reasoning":[117,151],"ability":[118,168],"LLMs":[120],"them":[123],"sample":[125],"candidate":[127],"programs,":[128],"which":[129,153],"then":[131,155],"executed":[132],"verified":[134],"identify":[136],"correct":[138,143],"one.":[139],"It":[140],"translates":[141],"each":[142],"description":[148],"distilled":[156],"VLM.":[159],"Exten-sive":[160],"experiments":[161,216],"show":[162],"improves":[165,209],"VLM's":[167],"count,":[170],"understand":[171],"spatial":[172],"relations,":[173],"reason":[175],"compositionally.":[176],"Our":[177],"VPD-trained":[178],"PaLI-X":[179],"outperforms":[180],"all":[181],"VLMs,":[183],"achieving":[184],"state-of-the-art":[185],"performance":[186],"across":[187],"tasks,":[190],"including":[191],"MMBench,":[192],"OK-VQA,":[193],"A-OKVQA,":[194],"TallyQA,":[195],"POPE,":[196],"Hateful":[198],"Memes.":[199],"An":[200],"evaluation":[201],"human":[203],"annotators":[204],"confirms":[206],"response":[211],"factuality":[212],"consistency.":[214],"Finally,":[215],"content":[218],"moderation":[219],"demonstrate":[220],"is":[223],"helpful":[225],"for":[226],"adaptation":[227],"real-world":[229],"applications":[230],"limited":[232],"data.":[233]},"counts_by_year":[{"year":2026,"cited_by_count":3},{"year":2025,"cited_by_count":15},{"year":2024,"cited_by_count":6}],"updated_date":"2026-05-05T08:41:31.759640","created_date":"2025-10-10T00:00:00"}
