{"id":"https://openalex.org/W4407209004","doi":"https://doi.org/10.48550/arxiv.2502.02740","title":"Vision-Language Model Dialog Games for Self-Improvement","display_name":"Vision-Language Model Dialog Games for Self-Improvement","publication_year":2025,"publication_date":"2025-02-04","ids":{"openalex":"https://openalex.org/W4407209004","doi":"https://doi.org/10.48550/arxiv.2502.02740"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2502.02740","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2502.02740","pdf_url":"https://arxiv.org/pdf/2502.02740","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2502.02740","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5024779340","display_name":"Ksenia Konyushkova","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Konyushkova, Ksenia","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5065295907","display_name":"Christos Kaplanis","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kaplanis, Christos","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5007394281","display_name":"Serkan Cabi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cabi, Serkan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5008260665","display_name":"Misha Denil","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Denil, Misha","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":4,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10456","display_name":"Multi-Agent Systems and Negotiation","score":0.842199981212616,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10456","display_name":"Multi-Agent Systems and Negotiation","score":0.842199981212616,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10215","display_name":"Semantic Web and Ontologies","score":0.836899995803833,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12031","display_name":"Speech and dialogue systems","score":0.7508000135421753,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/dialog-box","display_name":"Dialog box","score":0.884626030921936},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6448110938072205},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4586820900440216},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.4553632140159607},{"id":"https://openalex.org/keywords/human\u2013computer-interaction","display_name":"Human\u2013computer interaction","score":0.399976909160614},{"id":"https://openalex.org/keywords/linguistics","display_name":"Linguistics","score":0.3204527795314789},{"id":"https://openalex.org/keywords/world-wide-web","display_name":"World Wide Web","score":0.15439754724502563}],"concepts":[{"id":"https://openalex.org/C173853756","wikidata":"https://www.wikidata.org/wiki/Q86915","display_name":"Dialog box","level":2,"score":0.884626030921936},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6448110938072205},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4586820900440216},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.4553632140159607},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.399976909160614},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.3204527795314789},{"id":"https://openalex.org/C136764020","wikidata":"https://www.wikidata.org/wiki/Q466","display_name":"World Wide Web","level":1,"score":0.15439754724502563},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2502.02740","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2502.02740","pdf_url":"https://arxiv.org/pdf/2502.02740","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"doi:10.48550/arxiv.2502.02740","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2502.02740","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2502.02740","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2502.02740","pdf_url":"https://arxiv.org/pdf/2502.02740","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W2098987383","https://openalex.org/W2417260800","https://openalex.org/W1596203174","https://openalex.org/W2117933979","https://openalex.org/W2283130723","https://openalex.org/W103938586","https://openalex.org/W2104718772","https://openalex.org/W4233992201","https://openalex.org/W337810568","https://openalex.org/W3204019825"],"abstract_inverted_index":{"The":[0],"increasing":[1],"demand":[2],"for":[3,29,49,106],"high-quality,":[4],"diverse":[5],"training":[6],"data":[7,71,121],"poses":[8],"a":[9,23,40,56],"significant":[10],"bottleneck":[11],"in":[12,39,87,112],"advancing":[13],"vision-language":[14],"models":[15],"(VLMs).":[16],"This":[17,101],"paper":[18],"presents":[19],"VLM":[20],"Dialog":[21],"Games,":[22],"novel":[24],"and":[25,62,79],"scalable":[26],"self-improvement":[27],"framework":[28],"VLMs.":[30],"Our":[31],"approach":[32],"leverages":[33],"self-play":[34],"between":[35],"two":[36],"agents":[37],"engaged":[38],"goal-oriented":[41],"play":[42],"centered":[43],"around":[44],"image":[45],"identification.":[46],"By":[47],"filtering":[48],"successful":[50],"game":[51,93],"interactions,":[52],"we":[53],"automatically":[54],"curate":[55],"high-quality":[57,119],"dataset":[58],"of":[59],"interleaved":[60],"images":[61],"text.":[63],"We":[64],"demonstrate":[65],"that":[66],"fine-tuning":[67],"on":[68,76],"this":[69,95],"synthetic":[70],"leads":[72],"to":[73,91],"performance":[74],"gains":[75],"downstream":[77],"tasks":[78],"generalises":[80],"across":[81],"datasets.":[82],"Moreover,":[83],"as":[84],"the":[85,88,104,118],"improvements":[86],"model":[89],"lead":[90],"better":[92],"play,":[94],"procedure":[96],"can":[97],"be":[98],"applied":[99],"iteratively.":[100],"work":[102],"paves":[103],"way":[105],"self-improving":[107],"VLMs,":[108],"with":[109],"potential":[110],"applications":[111],"various":[113],"real-world":[114],"scenarios":[115],"especially":[116],"when":[117],"multimodal":[120],"is":[122],"scarce.":[123]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-10T00:00:00"}
