{"id":"https://openalex.org/W4415535474","doi":"https://doi.org/10.1145/3746027.3755032","title":"Decoupled Global-Local Alignment for Improving Compositional Understanding","display_name":"Decoupled Global-Local Alignment for Improving Compositional Understanding","publication_year":2025,"publication_date":"2025-10-25","ids":{"openalex":"https://openalex.org/W4415535474","doi":"https://doi.org/10.1145/3746027.3755032"},"language":null,"primary_location":{"id":"doi:10.1145/3746027.3755032","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3746027.3755032","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 33rd ACM International Conference on Multimedia","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5114093996","display_name":"Xiaoxing Hu","orcid":"https://orcid.org/0009-0009-2186-1276"},"institutions":[{"id":"https://openalex.org/I125839683","display_name":"Beijing Institute of Technology","ror":"https://ror.org/01skt4w74","country_code":"CN","type":"education","lineage":["https://openalex.org/I125839683","https://openalex.org/I890469752"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Xiaoxing Hu","raw_affiliation_strings":["School of Information and Electronics, Beijing Institute of Technology, Beijing, China"],"affiliations":[{"raw_affiliation_string":"School of Information and Electronics, Beijing Institute of Technology, Beijing, China","institution_ids":["https://openalex.org/I125839683"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5120130915","display_name":"Kaicheng Yang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kaicheng Yang","raw_affiliation_strings":["DeepGlint, Beijing, China"],"affiliations":[{"raw_affiliation_string":"DeepGlint, Beijing, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100636886","display_name":"Jun Wang","orcid":"https://orcid.org/0000-0001-9980-1112"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jun Wang","raw_affiliation_strings":["DeepGlint, Beijing, China"],"affiliations":[{"raw_affiliation_string":"DeepGlint, Beijing, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5120130916","display_name":"Haoran Xu","orcid":"https://orcid.org/0000-0003-2091-6158"},"institutions":[{"id":"https://openalex.org/I4210113369","display_name":"Microsoft Research Asia (China)","ror":"https://ror.org/0300m5276","country_code":"CN","type":"company","lineage":["https://openalex.org/I1290206253","https://openalex.org/I4210113369"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Haoran Xu","raw_affiliation_strings":["Microsoft, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Microsoft, Beijing, China","institution_ids":["https://openalex.org/I4210113369"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5120130917","display_name":"Ziyong Feng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ziyong Feng","raw_affiliation_strings":["DeepGlint, Beijing, China"],"affiliations":[{"raw_affiliation_string":"DeepGlint, Beijing, China","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5079551552","display_name":"Yupei Wang","orcid":"https://orcid.org/0000-0002-9771-6229"},"institutions":[{"id":"https://openalex.org/I125839683","display_name":"Beijing Institute of Technology","ror":"https://ror.org/01skt4w74","country_code":"CN","type":"education","lineage":["https://openalex.org/I125839683","https://openalex.org/I890469752"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yupei Wang","raw_affiliation_strings":["School of Information and Electronic, Beijing Institute of Technology, Beijing, China"],"affiliations":[{"raw_affiliation_string":"School of Information and Electronic, Beijing Institute of Technology, Beijing, China","institution_ids":["https://openalex.org/I125839683"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5114093996"],"corresponding_institution_ids":["https://openalex.org/I125839683"],"apc_list":null,"apc_paid":null,"fwci":2.3568,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.91566903,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":91,"max":95},"biblio":{"volume":null,"issue":null,"first_page":"3251","last_page":"3260"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10906","display_name":"AI-based Problem Solving and Planning","score":0.9523000121116638,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10906","display_name":"AI-based Problem Solving and Planning","score":0.9523000121116638,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10824","display_name":"Image Retrieval and Classification Techniques","score":0.9435999989509583,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11948","display_name":"Machine Learning in Materials Science","score":0.9077000021934509,"subfield":{"id":"https://openalex.org/subfields/2505","display_name":"Materials Chemistry"},"field":{"id":"https://openalex.org/fields/25","display_name":"Materials Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/leverage","display_name":"Leverage (statistics)","score":0.6276999711990356},{"id":"https://openalex.org/keywords/encoder","display_name":"Encoder","score":0.5698999762535095},{"id":"https://openalex.org/keywords/embedding","display_name":"Embedding","score":0.4781999886035919},{"id":"https://openalex.org/keywords/contrast","display_name":"Contrast (vision)","score":0.4458000063896179},{"id":"https://openalex.org/keywords/construct","display_name":"Construct (python library)","score":0.4332999885082245},{"id":"https://openalex.org/keywords/constraint","display_name":"Constraint (computer-aided design)","score":0.42899999022483826}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7421000003814697},{"id":"https://openalex.org/C153083717","wikidata":"https://www.wikidata.org/wiki/Q6535263","display_name":"Leverage (statistics)","level":2,"score":0.6276999711990356},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.5698999762535095},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5008999705314636},{"id":"https://openalex.org/C41608201","wikidata":"https://www.wikidata.org/wiki/Q980509","display_name":"Embedding","level":2,"score":0.4781999886035919},{"id":"https://openalex.org/C2776502983","wikidata":"https://www.wikidata.org/wiki/Q690182","display_name":"Contrast (vision)","level":2,"score":0.4458000063896179},{"id":"https://openalex.org/C2780801425","wikidata":"https://www.wikidata.org/wiki/Q5164392","display_name":"Construct (python library)","level":2,"score":0.4332999885082245},{"id":"https://openalex.org/C2776036281","wikidata":"https://www.wikidata.org/wiki/Q48769818","display_name":"Constraint (computer-aided design)","level":2,"score":0.42899999022483826},{"id":"https://openalex.org/C7149132","wikidata":"https://www.wikidata.org/wiki/Q1377840","display_name":"Forgetting","level":2,"score":0.34389999508857727},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.31369999051094055},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.29910001158714294},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.2858999967575073},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.28380000591278076},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.28279998898506165},{"id":"https://openalex.org/C89611455","wikidata":"https://www.wikidata.org/wiki/Q6804646","display_name":"Mechanism (biology)","level":2,"score":0.2743000090122223},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.25360000133514404}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3746027.3755032","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3746027.3755032","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 33rd ACM International Conference on Multimedia","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G884765218","display_name":null,"funder_award_id":"62301046","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"}],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":1,"referenced_works":["https://openalex.org/W2533598788"],"related_works":[],"abstract_inverted_index":{"Contrastive":[0],"Language-Image":[1],"Pre-training":[2],"(CLIP)":[3],"has":[4],"achieved":[5],"success":[6],"on":[7],"multiple":[8],"downstream":[9],"tasks":[10,194],"by":[11,57],"aligning":[12,111],"image":[13],"and":[14,34,177,191],"text":[15],"modalities.":[16],"However,":[17],"the":[18,52,66,94,97,107,112,128,135,150,172,196,199],"nature":[19],"of":[20,96,130,138,154,198],"global":[21,40,108],"contrastive":[22],"learning":[23,152],"limits":[24],"CLIP's":[25],"ability":[26],"to":[27,44,159,182],"comprehend":[28],"compositional":[29,46,83,145,192],"concepts,":[30],"such":[31],"as":[32],"relations":[33],"attributes.":[35],"Although":[36],"recent":[37],"studies":[38],"employ":[39],"hard":[41],"negative":[42,61,164],"samples":[43,62],"improve":[45,144],"understanding,":[47,146],"these":[48],"methods":[49],"significantly":[50],"compromise":[51],"model's":[53,98],"inherent":[54,99],"general":[55,90,190],"capabilities":[56],"forcibly":[58],"distancing":[59],"textual":[60],"from":[63,122],"images":[64],"in":[65,89],"embedding":[67],"space.":[68],"To":[69,92,143],"overcome":[70],"this":[71],"limitation,":[72],"we":[73,101,147,170],"introduce":[74],"a":[75,103,117],"Decoupled":[76],"Global-Local":[77],"Alignment":[78],"(DeGLA)":[79],"framework":[80],"that":[81],"improves":[82],"understanding":[84],"while":[85],"substantially":[86],"mitigating":[87],"losses":[88],"capabilities.":[91],"optimize":[93],"retention":[95],"capabilities,":[100],"incorporate":[102],"self-distillation":[104],"mechanism":[105],"within":[106],"alignment":[109],"process,":[110],"learnable":[113],"image-text":[114],"encoder":[115],"with":[116],"frozen":[118],"teacher":[119],"model":[120],"derived":[121],"an":[123],"exponential":[124],"moving":[125],"average.":[126],"Under":[127],"constraint":[129],"self-distillation,":[131],"it":[132],"effectively":[133],"mitigates":[134],"catastrophic":[136],"forgetting":[137],"pretrained":[139],"knowledge":[140],"during":[141],"fine-tuning.":[142],"first":[148],"leverage":[149],"in-context":[151],"capability":[153],"Large":[155],"Language":[156],"Models":[157],"(LLMs)":[158],"construct":[160],"about":[161],"2M":[162],"high-quality":[163],"captions":[165],"across":[166,188],"five":[167],"types.":[168],"Subsequently,":[169],"propose":[171],"Image-Grounded":[173],"Contrast":[174,179],"(IGC)":[175],"loss":[176,181],"Text-Grounded":[178],"(TGC)":[180],"enhance":[183],"vision-language":[184],"compositionally.":[185],"Experimental":[186],"results":[187],"both":[189],"reasoning":[193],"validate":[195],"effectiveness":[197],"DeGLA":[200],"framework.":[201],"Our":[202],"code":[203],"is":[204],"released":[205],"at":[206],"https://github.com/xiaoxing2001/DeGLA.":[207]},"counts_by_year":[{"year":2025,"cited_by_count":1}],"updated_date":"2026-04-09T08:11:56.329763","created_date":"2025-10-25T00:00:00"}
