{"id":"https://openalex.org/W7131115545","doi":"https://doi.org/10.1109/iccvw69036.2025.00654","title":"SCRAMBLe: Enhancing Multimodal LLM Compositionality with Synthetic Preference Data","display_name":"SCRAMBLe: Enhancing Multimodal LLM Compositionality with Synthetic Preference Data","publication_year":2025,"publication_date":"2025-10-19","ids":{"openalex":"https://openalex.org/W7131115545","doi":"https://doi.org/10.1109/iccvw69036.2025.00654"},"language":null,"primary_location":{"id":"doi:10.1109/iccvw69036.2025.00654","is_oa":false,"landing_page_url":"https://doi.org/10.1109/iccvw69036.2025.00654","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE/CVF International Conference on Computer Vision Workshops (ICCVW)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5102893907","display_name":"Samarth Mishra","orcid":"https://orcid.org/0000-0003-3425-2647"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Samarth Mishra","raw_affiliation_strings":["Boston University"],"affiliations":[{"raw_affiliation_string":"Boston University","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126595972","display_name":"Kate Saenko","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kate Saenko","raw_affiliation_strings":["Boston University"],"affiliations":[{"raw_affiliation_string":"Boston University","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5126651045","display_name":"Venkatesh Saligrama","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Venkatesh Saligrama","raw_affiliation_strings":["Boston University"],"affiliations":[{"raw_affiliation_string":"Boston University","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5102893907"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.88567518,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"6292","last_page":"6302"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.43689998984336853,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.43689998984336853,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.2248000055551529,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12031","display_name":"Speech and dialogue systems","score":0.022600000724196434,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/principle-of-compositionality","display_name":"Principle of compositionality","score":0.7257999777793884},{"id":"https://openalex.org/keywords/preference","display_name":"Preference","score":0.5777999758720398},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.5315999984741211},{"id":"https://openalex.org/keywords/synthetic-data","display_name":"Synthetic data","score":0.5178999900817871},{"id":"https://openalex.org/keywords/question-answering","display_name":"Question answering","score":0.46779999136924744},{"id":"https://openalex.org/keywords/code","display_name":"Code (set theory)","score":0.44369998574256897},{"id":"https://openalex.org/keywords/visual-reasoning","display_name":"Visual reasoning","score":0.40720000863075256},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.3582000136375427}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.748199999332428},{"id":"https://openalex.org/C121375916","wikidata":"https://www.wikidata.org/wiki/Q936559","display_name":"Principle of compositionality","level":2,"score":0.7257999777793884},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6711999773979187},{"id":"https://openalex.org/C2781249084","wikidata":"https://www.wikidata.org/wiki/Q908656","display_name":"Preference","level":2,"score":0.5777999758720398},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.5315999984741211},{"id":"https://openalex.org/C160920958","wikidata":"https://www.wikidata.org/wiki/Q7662746","display_name":"Synthetic data","level":2,"score":0.5178999900817871},{"id":"https://openalex.org/C44291984","wikidata":"https://www.wikidata.org/wiki/Q1074173","display_name":"Question answering","level":2,"score":0.46779999136924744},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.44369998574256897},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.4375999867916107},{"id":"https://openalex.org/C2777508537","wikidata":"https://www.wikidata.org/wiki/Q7936620","display_name":"Visual reasoning","level":2,"score":0.40720000863075256},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.39739999175071716},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.3582000136375427},{"id":"https://openalex.org/C48372109","wikidata":"https://www.wikidata.org/wiki/Q3913","display_name":"Binary number","level":2,"score":0.3452000021934509},{"id":"https://openalex.org/C48103436","wikidata":"https://www.wikidata.org/wiki/Q599031","display_name":"State (computer science)","level":2,"score":0.33649998903274536},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.32919999957084656},{"id":"https://openalex.org/C146849305","wikidata":"https://www.wikidata.org/wiki/Q370766","display_name":"Ground truth","level":2,"score":0.3188000023365021},{"id":"https://openalex.org/C37335422","wikidata":"https://www.wikidata.org/wiki/Q6888134","display_name":"Model-based reasoning","level":3,"score":0.27489998936653137},{"id":"https://openalex.org/C43126263","wikidata":"https://www.wikidata.org/wiki/Q128751","display_name":"Source code","level":2,"score":0.2745000123977661},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.2669000029563904},{"id":"https://openalex.org/C125411270","wikidata":"https://www.wikidata.org/wiki/Q18653","display_name":"Encoding (memory)","level":2,"score":0.257999986410141},{"id":"https://openalex.org/C2983448237","wikidata":"https://www.wikidata.org/wiki/Q1078276","display_name":"Language understanding","level":2,"score":0.2572999894618988},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.2524999976158142}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/iccvw69036.2025.00654","is_oa":false,"landing_page_url":"https://doi.org/10.1109/iccvw69036.2025.00654","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE/CVF International Conference on Computer Vision Workshops (ICCVW)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":24,"referenced_works":["https://openalex.org/W1861492603","https://openalex.org/W2768846366","https://openalex.org/W2962897543","https://openalex.org/W3090449556","https://openalex.org/W3105604018","https://openalex.org/W3153469116","https://openalex.org/W3173220247","https://openalex.org/W3198377975","https://openalex.org/W4312261477","https://openalex.org/W4312784228","https://openalex.org/W4312933868","https://openalex.org/W4385572962","https://openalex.org/W4385573684","https://openalex.org/W4386071700","https://openalex.org/W4386072101","https://openalex.org/W4386076015","https://openalex.org/W4389519483","https://openalex.org/W4389523929","https://openalex.org/W4390872508","https://openalex.org/W4390872526","https://openalex.org/W4390872842","https://openalex.org/W4402727624","https://openalex.org/W4402727764","https://openalex.org/W4403601051"],"related_works":[],"abstract_inverted_index":{"Compositionality,":[0],"or":[1],"correctly":[2],"recognizing":[3],"scenes":[4],"as":[5,26,155,157],"compositions":[6,33],"of":[7,21,107],"atomic":[8],"visual":[9,193],"concepts,":[10],"remains":[11],"difficult":[12],"for":[13,47,91,115,198],"multimodal":[14],"large":[15],"language":[16,152],"models":[17,71,203],"(MLLMs).":[18],"Even":[19],"state":[20],"the":[22,88],"art":[23],"MLLMs":[24,51,108,119],"such":[25,49,77],"GPT-4o":[27],"can":[28,72,144],"make":[29],"mistakes":[30],"in":[31,69,125],"distinguishing":[32],"like":[34],"\u201cdog":[35],"chasing":[36,40],"cat\u201d":[37],"vs":[38],"\u201ccat":[39],"dog\u201d.":[41],"While":[42],"on":[43,120,162,176,190],"Winoground,":[44],"a":[45,61,82,95,126,168],"benchmark":[46],"measuring":[48],"reasoning,":[50],"have":[52],"made":[53],"signif-icant":[54],"progress,":[55],"they":[56],"are":[57],"still":[58],"far":[59],"from":[60,130,178],"human's":[62],"performance.":[63],"We":[64,100],"show":[65],"that":[66],"compositional":[67,139],"reasoning":[68,140],"these":[70,137],"be":[73],"improved":[74],"by":[75,188],"elucidating":[76],"concepts":[78],"via":[79],"data,":[80],"where":[81],"model":[83,174],"is":[84,209],"trained":[85],"to":[86,180,184],"prefer":[87],"correct":[89],"caption":[90],"an":[92,113],"image":[93],"over":[94],"close":[96],"but":[97,159],"incorrect":[98],"one.":[99],"introduce":[101],"SCRAMBLe:":[102],"Synthetic":[103],"Compositional":[104],"Reasoning":[105],"Augmentation":[106],"with":[109,201],"Binary":[110],"preference":[111,116,122],"Learning,":[112],"approach":[114],"tuning":[117],"open-weight":[118],"synthetic":[121,206],"data":[123],"generated":[124],"fully":[127],"automated":[128],"manner":[129],"existing":[131],"image-caption":[132],"data.":[133],"SCRAMBLe":[134,171,199],"holistically":[135],"improves":[136,175],"MLLMs'":[138],"capabilities":[141],"which":[142],"we":[143],"see":[145],"through":[146],"significant":[147,160],"improvements":[148,161],"across":[149],"multiple":[150],"vision":[151],"composition-ality":[153],"benchmarks,":[154],"well":[156],"smaller":[158],"general":[163,192],"question":[164,194],"answering":[165,195],"tasks.":[166,196],"As":[167],"sneak":[169],"peek,":[170],"tuned":[172,202],"Molmo-7B":[173],"Winoground":[177],"49.5%":[179],"54.8%":[181],"(best":[182],"reported":[183],"date),":[185],"while":[186],"improving":[187],"1%":[189],"more":[191],"Code":[197],"along":[200],"and":[204],"our":[205],"training":[207],"dataset":[208],"available":[210],"at":[211],"https://github.com/samarth4149/SCRAMBLe.":[212]},"counts_by_year":[],"updated_date":"2026-02-25T06:17:34.324206","created_date":"2026-02-24T00:00:00"}
