{"id":"https://openalex.org/W4402982181","doi":"https://doi.org/10.1109/icme57554.2024.10687511","title":"COCO is \u201cALL\u201d You Need for Visual Instruction Fine-tuning","display_name":"COCO is \u201cALL\u201d You Need for Visual Instruction Fine-tuning","publication_year":2024,"publication_date":"2024-07-15","ids":{"openalex":"https://openalex.org/W4402982181","doi":"https://doi.org/10.1109/icme57554.2024.10687511"},"language":"en","primary_location":{"id":"doi:10.1109/icme57554.2024.10687511","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icme57554.2024.10687511","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2024 IEEE International Conference on Multimedia and Expo (ICME)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5114236992","display_name":"Xiaotian Han","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Xiaotian Han","raw_affiliation_strings":["ByteDance Inc"],"affiliations":[{"raw_affiliation_string":"ByteDance Inc","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5079640653","display_name":"Yiqi Wang","orcid":"https://orcid.org/0009-0002-4989-3674"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yiqi Wang","raw_affiliation_strings":["ByteDance Inc"],"affiliations":[{"raw_affiliation_string":"ByteDance Inc","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5107662289","display_name":"Bohan Zhai","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bohan Zhai","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5107662290","display_name":"Quanzeng You","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Quanzeng You","raw_affiliation_strings":["ByteDance Inc"],"affiliations":[{"raw_affiliation_string":"ByteDance Inc","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5025387416","display_name":"Hongxia Yang","orcid":"https://orcid.org/0000-0002-3100-3082"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hongxia Yang","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5114236992"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":6.5336,"has_fulltext":false,"cited_by_count":3,"citation_normalized_percentile":{"value":0.96559671,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":90,"max":96},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"5"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T13412","display_name":"Education and Technology Integration","score":0.2955000102519989,"subfield":{"id":"https://openalex.org/subfields/3304","display_name":"Education"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T13412","display_name":"Education and Technology Integration","score":0.2955000102519989,"subfield":{"id":"https://openalex.org/subfields/3304","display_name":"Education"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/coco","display_name":"Coco","score":0.8514835238456726},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6223351359367371},{"id":"https://openalex.org/keywords/computer-graphics","display_name":"Computer graphics (images)","score":0.32278069853782654},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.28444361686706543}],"concepts":[{"id":"https://openalex.org/C21780288","wikidata":"https://www.wikidata.org/wiki/Q5139731","display_name":"Coco","level":2,"score":0.8514835238456726},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6223351359367371},{"id":"https://openalex.org/C121684516","wikidata":"https://www.wikidata.org/wiki/Q7600677","display_name":"Computer graphics (images)","level":1,"score":0.32278069853782654},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.28444361686706543}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icme57554.2024.10687511","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icme57554.2024.10687511","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2024 IEEE International Conference on Multimedia and Expo (ICME)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":56,"referenced_works":["https://openalex.org/W1956340063","https://openalex.org/W2250384498","https://openalex.org/W2549599535","https://openalex.org/W2560730294","https://openalex.org/W2597425697","https://openalex.org/W2803686446","https://openalex.org/W2904565150","https://openalex.org/W2947312908","https://openalex.org/W2952316487","https://openalex.org/W2962749469","https://openalex.org/W2963518342","https://openalex.org/W2963622213","https://openalex.org/W2963991868","https://openalex.org/W2988326850","https://openalex.org/W2988617410","https://openalex.org/W3135279933","https://openalex.org/W4253707770","https://openalex.org/W4312846625","https://openalex.org/W4381802186","https://openalex.org/W4395091069","https://openalex.org/W6634232107","https://openalex.org/W6638145986","https://openalex.org/W6639102338","https://openalex.org/W6639432524","https://openalex.org/W6694395031","https://openalex.org/W6727260379","https://openalex.org/W6747353509","https://openalex.org/W6755935031","https://openalex.org/W6786556762","https://openalex.org/W6809646742","https://openalex.org/W6810334672","https://openalex.org/W6846007759","https://openalex.org/W6849177959","https://openalex.org/W6850071225","https://openalex.org/W6850140740","https://openalex.org/W6851513886","https://openalex.org/W6851938174","https://openalex.org/W6851950068","https://openalex.org/W6853116092","https://openalex.org/W6853264062","https://openalex.org/W6853838016","https://openalex.org/W6853875005","https://openalex.org/W6854093740","https://openalex.org/W6854347851","https://openalex.org/W6854866820","https://openalex.org/W6855297460","https://openalex.org/W6855350031","https://openalex.org/W6855685237","https://openalex.org/W6856032414","https://openalex.org/W6857162426","https://openalex.org/W6858247760","https://openalex.org/W6858268588","https://openalex.org/W6858330485","https://openalex.org/W6858900395","https://openalex.org/W6859336385","https://openalex.org/W6859543883"],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2748952813","https://openalex.org/W2360747338","https://openalex.org/W2793156604","https://openalex.org/W4394823007","https://openalex.org/W2971820882","https://openalex.org/W3186475645","https://openalex.org/W1992728043","https://openalex.org/W4383372125","https://openalex.org/W3091853197"],"abstract_inverted_index":{"Multi-modal":[0],"Large":[1],"Language":[2],"Models":[3],"(MLLMs)":[4],"are":[5,141,200],"increasingly":[6],"prominent":[7],"in":[8,125,169,245],"the":[9,36,83,147,160,181,220],"field":[10],"of":[11,82,149,183],"artificial":[12],"intelligence.":[13],"Visual":[14],"instruction":[15,32,197],"fine-tuning":[16,40],"(IFT)":[17],"is":[18,35,80,156],"a":[19,51,166,212],"vital":[20],"process":[21],"for":[22,62,68,204],"aligning":[23],"MLLMs\u2019":[24],"output":[25],"with":[26,57,98,115,136,192,216,224,234],"user\u2019s":[27],"intentions.":[28],"High-quality":[29],"and":[30,65,76,132,194,202,248],"diversified":[31],"following":[33,198],"data":[34],"key":[37],"to":[38,45,120,145,159],"this":[39,99,116,184,208],"process.":[41],"Recent":[42],"studies":[43],"propose":[44],"construct":[46,77],"visual":[47,69],"IFT":[48,91,171,214],"datasets":[49,56,92,172,191],"through":[50],"multifaceted":[52],"approach:":[53],"transforming":[54],"existing":[55],"rule-based":[58],"templates,":[59],"employing":[60],"GPT-4":[61],"rewriting":[63],"annotations,":[64],"utilizing":[66],"GPT-4V":[67],"dataset":[70,117,222],"pseudo-labeling.":[71],"LLaVA-1.5":[72],"adopted":[73],"similar":[74],"approach":[75],"LLaVA-Instruct,":[78],"which":[79],"one":[81],"simplest,":[84],"most":[85,89],"widely":[86],"used,":[87],"yet":[88],"effective":[90],"today.":[93],"Notably,":[94],"when":[95,232],"properly":[96,124],"fine-tuned":[97,233],"dataset,":[100,162,215,236],"MLLMs":[101,205,237],"can":[102],"achieve":[103,238],"state-of-the-art":[104],"performance":[105,240],"on":[106,241],"several":[107],"benchmarks.":[108],"However,":[109],"we":[110,210],"noticed":[111],"that":[112,190,231],"models":[113],"trained":[114],"often":[118],"struggle":[119],"follow":[121],"user":[122],"instructions":[123],"multi-round":[126,249],"dialog.":[127],"In":[128,207],"addition,":[129],"tradition":[130],"caption":[131],"VQA":[133,178],"evaluation":[134,139,243],"benchmarks,":[135],"their":[137],"closed-form":[138],"structure,":[140],"not":[142,157],"fully":[143],"equipped":[144],"assess":[146],"capabilities":[148],"modern":[150],"open-ended":[151,242],"generative":[152],"MLLMs.":[153],"This":[154],"problem":[155],"unique":[158],"LLaVA-Instruct":[161],"but":[163],"may":[164,186],"be":[165],"potential":[167],"issue":[168,185],"all":[170],"constructed":[173],"from":[174,219],"image":[175],"captioning":[176],"or":[177],"sources,":[179],"though":[180],"extent":[182],"vary.":[187],"We":[188],"argue":[189],"diverse":[193,226],"high-quality":[195],"detailed":[196],"annotations":[199],"essential":[201],"adequate":[203],"IFT.":[206],"work,":[209],"establish":[211],"new":[213],"images":[217],"sourced":[218],"COCO":[221],"along":[223],"more":[225],"instructions.":[227],"Our":[228],"experiments":[229],"show":[230],"proposed":[235],"better":[239],"benchmarks":[244],"both":[246],"single-round":[247],"dialog":[250],"setting.":[251]},"counts_by_year":[{"year":2025,"cited_by_count":2},{"year":2024,"cited_by_count":1}],"updated_date":"2025-12-22T23:10:17.713674","created_date":"2025-10-10T00:00:00"}
