{"id":"https://openalex.org/W4403570065","doi":"https://doi.org/10.48550/arxiv.2410.09733","title":"MMCOMPOSITION: Revisiting the Compositionality of Pre-trained Vision-Language Models","display_name":"MMCOMPOSITION: Revisiting the Compositionality of Pre-trained Vision-Language Models","publication_year":2024,"publication_date":"2024-10-13","ids":{"openalex":"https://openalex.org/W4403570065","doi":"https://doi.org/10.48550/arxiv.2410.09733"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2410.09733","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2410.09733","pdf_url":"https://arxiv.org/pdf/2410.09733","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":null},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2410.09733","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5102027020","display_name":"Hang Hua","orcid":"https://orcid.org/0000-0002-5441-5776"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Hua, Hang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103041876","display_name":"Yunlong Tang","orcid":"https://orcid.org/0009-0000-8119-3155"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tang, Yunlong","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5039596114","display_name":"Ziyun Zeng","orcid":"https://orcid.org/0000-0002-5903-4462"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zeng, Ziyun","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103187717","display_name":"Liangliang Cao","orcid":"https://orcid.org/0000-0003-0900-1512"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cao, Liangliang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5009867415","display_name":"Zhengyuan Yang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Zhengyuan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103028641","display_name":"Hangfeng He","orcid":"https://orcid.org/0000-0001-5136-1218"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"He, Hangfeng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5064805926","display_name":"Chenliang Xu","orcid":"https://orcid.org/0000-0002-2183-822X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xu, Chenliang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5055469774","display_name":"Jiebo Luo","orcid":"https://orcid.org/0000-0002-4516-9729"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Luo, Jiebo","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":8,"corresponding_author_ids":["https://openalex.org/A5102027020"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T13277","display_name":"Media, Religion, Digital Communication","score":0.7724999785423279,"subfield":{"id":"https://openalex.org/subfields/1211","display_name":"Philosophy"},"field":{"id":"https://openalex.org/fields/12","display_name":"Arts and Humanities"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T13277","display_name":"Media, Religion, Digital Communication","score":0.7724999785423279,"subfield":{"id":"https://openalex.org/subfields/1211","display_name":"Philosophy"},"field":{"id":"https://openalex.org/fields/12","display_name":"Arts and Humanities"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/principle-of-compositionality","display_name":"Principle of compositionality","score":0.9435705542564392},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.4914301633834839},{"id":"https://openalex.org/keywords/linguistics","display_name":"Linguistics","score":0.4578271508216858},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.4577639698982239},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.42329296469688416},{"id":"https://openalex.org/keywords/psychology","display_name":"Psychology","score":0.35244834423065186},{"id":"https://openalex.org/keywords/cognitive-science","display_name":"Cognitive science","score":0.33178332448005676},{"id":"https://openalex.org/keywords/cognitive-psychology","display_name":"Cognitive psychology","score":0.3282385468482971},{"id":"https://openalex.org/keywords/philosophy","display_name":"Philosophy","score":0.15388983488082886}],"concepts":[{"id":"https://openalex.org/C121375916","wikidata":"https://www.wikidata.org/wiki/Q936559","display_name":"Principle of compositionality","level":2,"score":0.9435705542564392},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.4914301633834839},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.4578271508216858},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.4577639698982239},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.42329296469688416},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.35244834423065186},{"id":"https://openalex.org/C188147891","wikidata":"https://www.wikidata.org/wiki/Q147638","display_name":"Cognitive science","level":1,"score":0.33178332448005676},{"id":"https://openalex.org/C180747234","wikidata":"https://www.wikidata.org/wiki/Q23373","display_name":"Cognitive psychology","level":1,"score":0.3282385468482971},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.15388983488082886}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2410.09733","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2410.09733","pdf_url":"https://arxiv.org/pdf/2410.09733","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":null},{"id":"doi:10.48550/arxiv.2410.09733","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2410.09733","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2410.09733","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2410.09733","pdf_url":"https://arxiv.org/pdf/2410.09733","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":null},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W4251923961","https://openalex.org/W1526190050","https://openalex.org/W2477397717","https://openalex.org/W2500457737","https://openalex.org/W2010490241","https://openalex.org/W4287025733","https://openalex.org/W4317425742","https://openalex.org/W3192500523","https://openalex.org/W4241382148","https://openalex.org/W2097744609"],"abstract_inverted_index":{"The":[0],"advent":[1],"of":[2,18,46,58,76,146,173],"large":[3],"Vision-Language":[4],"Models":[5],"(VLMs)":[6],"has":[7],"significantly":[8],"advanced":[9],"multimodal":[10],"understanding,":[11],"enabling":[12],"more":[13],"sophisticated":[14],"and":[15,20,28,34,54,61,79,89,102,121,142,161,179,181,190],"accurate":[16],"integration":[17],"visual":[19,31,60],"textual":[21,62],"information":[22],"across":[23,104],"various":[24],"tasks,":[25],"including":[26],"image":[27],"video":[29],"captioning,":[30],"question":[32],"answering,":[33],"cross-modal":[35],"retrieval.":[36],"Despite":[37],"VLMs'":[38,124],"superior":[39],"capabilities,":[40],"researchers":[41],"lack":[42],"a":[43,68,95,115,131],"comprehensive":[44],"understanding":[45,103],"their":[47],"compositionality":[48,71,93,145,154],"--":[49],"the":[50,74,144,147,157,164,171],"ability":[51,97],"to":[52,133,156,183],"understand":[53],"produce":[55],"novel":[56,116],"combinations":[57],"known":[59],"components.":[63],"Prior":[64],"benchmarks":[65],"provide":[66],"only":[67],"relatively":[69],"rough":[70],"evaluation":[72],"from":[73],"perspectives":[75],"objects,":[77],"relations,":[78],"attributes":[80],"while":[81],"neglecting":[82],"deeper":[83],"reasoning":[84,101],"about":[85],"object":[86],"interactions,":[87],"counting,":[88],"complex":[90],"compositions.":[91],"However,":[92],"is":[94],"critical":[96],"that":[98],"facilitates":[99],"coherent":[100],"modalities":[105],"for":[106,119,185],"VLMs.":[107,149],"To":[108],"address":[109],"this":[110],"limitation,":[111],"we":[112,139,151,162],"propose":[113],"MMCOMPOSITION,":[114,138],"human-annotated":[117],"benchmark":[118,128],"comprehensively":[120],"accurately":[122],"evaluating":[123],"compositionality.":[125],"Our":[126,167],"proposed":[127],"serves":[129],"as":[130],"complement":[132],"these":[134],"earlier":[135],"works.":[136],"With":[137],"can":[140],"quantify":[141],"explore":[143],"mainstream":[148],"Surprisingly,":[150],"find":[152],"GPT-4o's":[153],"inferior":[155],"best":[158],"open-source":[159],"model,":[160],"analyze":[163],"underlying":[165],"reasons.":[166],"experimental":[168],"analysis":[169],"reveals":[170],"limitations":[172],"VLMs":[174],"in":[175,187],"fine-grained":[176],"compositional":[177],"perception":[178],"reasoning,":[180],"points":[182],"areas":[184],"improvement":[186],"VLM":[188],"design":[189],"training.":[191],"Resources":[192],"available":[193],"at:":[194],"https://hanghuacs.github.io/MMComposition/":[195]},"counts_by_year":[{"year":2025,"cited_by_count":1}],"updated_date":"2025-11-06T06:51:31.235846","created_date":"2025-10-10T00:00:00"}
