{"id":"https://openalex.org/W4415535493","doi":"https://doi.org/10.1145/3746027.3755028","title":"Modal Symbiosis: Variational Alignment Unveils New Horizons in Multimodal Representation Learning","display_name":"Modal Symbiosis: Variational Alignment Unveils New Horizons in Multimodal Representation Learning","publication_year":2025,"publication_date":"2025-10-25","ids":{"openalex":"https://openalex.org/W4415535493","doi":"https://doi.org/10.1145/3746027.3755028"},"language":null,"primary_location":{"id":"doi:10.1145/3746027.3755028","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3746027.3755028","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 33rd ACM International Conference on Multimedia","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5113071950","display_name":"Zeyan Li","orcid":null},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Zeyan Li","raw_affiliation_strings":["Shanghai Jiao Tong University, Shanghai, China"],"affiliations":[{"raw_affiliation_string":"Shanghai Jiao Tong University, Shanghai, China","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103703049","display_name":"Cankun Guo","orcid":null},"institutions":[{"id":"https://openalex.org/I141649914","display_name":"Nanchang University","ror":"https://ror.org/042v6xz23","country_code":"CN","type":"education","lineage":["https://openalex.org/I141649914"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Cankun Guo","raw_affiliation_strings":["Nanchang University, Nanchang, China"],"affiliations":[{"raw_affiliation_string":"Nanchang University, Nanchang, China","institution_ids":["https://openalex.org/I141649914"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5095956092","display_name":"Yin Tang","orcid":"https://orcid.org/0000-0001-7693-7543"},"institutions":[{"id":"https://openalex.org/I159948400","display_name":"Jinan University","ror":"https://ror.org/02xe5ns62","country_code":"CN","type":"education","lineage":["https://openalex.org/I159948400"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yin Tang","raw_affiliation_strings":["Jinan University, Guangzhou, China"],"affiliations":[{"raw_affiliation_string":"Jinan University, Guangzhou, China","institution_ids":["https://openalex.org/I159948400"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5113071950"],"corresponding_institution_ids":["https://openalex.org/I183067930"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.29496958,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1210","last_page":"1219"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.9969000220298767,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9954000115394592,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.6539999842643738},{"id":"https://openalex.org/keywords/modal","display_name":"Modal","score":0.6449999809265137},{"id":"https://openalex.org/keywords/generalization","display_name":"Generalization","score":0.6444000005722046},{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.6190000176429749},{"id":"https://openalex.org/keywords/projection","display_name":"Projection (relational algebra)","score":0.5238000154495239},{"id":"https://openalex.org/keywords/feature-vector","display_name":"Feature vector","score":0.4779999852180481},{"id":"https://openalex.org/keywords/space","display_name":"Space (punctuation)","score":0.4645000100135803},{"id":"https://openalex.org/keywords/mode","display_name":"Mode (computer interface)","score":0.42579999566078186},{"id":"https://openalex.org/keywords/modality","display_name":"Modality (human\u2013computer interaction)","score":0.41130000352859497}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6657000184059143},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.6539999842643738},{"id":"https://openalex.org/C71139939","wikidata":"https://www.wikidata.org/wiki/Q910194","display_name":"Modal","level":2,"score":0.6449999809265137},{"id":"https://openalex.org/C177148314","wikidata":"https://www.wikidata.org/wiki/Q170084","display_name":"Generalization","level":2,"score":0.6444000005722046},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.6190000176429749},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6176000237464905},{"id":"https://openalex.org/C57493831","wikidata":"https://www.wikidata.org/wiki/Q3134666","display_name":"Projection (relational algebra)","level":2,"score":0.5238000154495239},{"id":"https://openalex.org/C83665646","wikidata":"https://www.wikidata.org/wiki/Q42139305","display_name":"Feature vector","level":2,"score":0.4779999852180481},{"id":"https://openalex.org/C2778572836","wikidata":"https://www.wikidata.org/wiki/Q380933","display_name":"Space (punctuation)","level":2,"score":0.4645000100135803},{"id":"https://openalex.org/C48677424","wikidata":"https://www.wikidata.org/wiki/Q6888088","display_name":"Mode (computer interface)","level":2,"score":0.42579999566078186},{"id":"https://openalex.org/C2780226545","wikidata":"https://www.wikidata.org/wiki/Q6888030","display_name":"Modality (human\u2013computer interaction)","level":2,"score":0.41130000352859497},{"id":"https://openalex.org/C126042441","wikidata":"https://www.wikidata.org/wiki/Q1324888","display_name":"Frame (networking)","level":2,"score":0.38449999690055847},{"id":"https://openalex.org/C59404180","wikidata":"https://www.wikidata.org/wiki/Q17013334","display_name":"Feature learning","level":2,"score":0.36730000376701355},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.3546999990940094},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.3425999879837036},{"id":"https://openalex.org/C2780910867","wikidata":"https://www.wikidata.org/wiki/Q1952416","display_name":"Multimodality","level":2,"score":0.3303000032901764},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.3255000114440918},{"id":"https://openalex.org/C153874254","wikidata":"https://www.wikidata.org/wiki/Q115542","display_name":"Canonical correlation","level":2,"score":0.313400000333786},{"id":"https://openalex.org/C2779227376","wikidata":"https://www.wikidata.org/wiki/Q6505497","display_name":"Layer (electronics)","level":2,"score":0.30660000443458557},{"id":"https://openalex.org/C18555067","wikidata":"https://www.wikidata.org/wiki/Q8375051","display_name":"Joint (building)","level":2,"score":0.2930000126361847},{"id":"https://openalex.org/C111335779","wikidata":"https://www.wikidata.org/wiki/Q3454686","display_name":"Reduction (mathematics)","level":2,"score":0.2815000116825104},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.27869999408721924},{"id":"https://openalex.org/C2777311342","wikidata":"https://www.wikidata.org/wiki/Q578801","display_name":"Shared space","level":3,"score":0.27549999952316284},{"id":"https://openalex.org/C33954974","wikidata":"https://www.wikidata.org/wiki/Q486494","display_name":"Sensor fusion","level":2,"score":0.26249998807907104},{"id":"https://openalex.org/C2778744346","wikidata":"https://www.wikidata.org/wiki/Q1152224","display_name":"Distinctive feature","level":2,"score":0.257099986076355},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.2529999911785126},{"id":"https://openalex.org/C19768560","wikidata":"https://www.wikidata.org/wiki/Q320727","display_name":"Dependency (UML)","level":2,"score":0.2515000104904175}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3746027.3755028","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3746027.3755028","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 33rd ACM International Conference on Multimedia","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G1083226793","display_name":null,"funder_award_id":"62272198, 62276277","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"}],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":17,"referenced_works":["https://openalex.org/W1861492603","https://openalex.org/W1933349210","https://openalex.org/W2185175083","https://openalex.org/W2277195237","https://openalex.org/W2560647685","https://openalex.org/W2619383789","https://openalex.org/W2886641317","https://openalex.org/W2963341956","https://openalex.org/W2963588172","https://openalex.org/W2966182616","https://openalex.org/W3035333188","https://openalex.org/W3035635319","https://openalex.org/W3090449556","https://openalex.org/W3153675281","https://openalex.org/W3168433561","https://openalex.org/W4312910992","https://openalex.org/W4394606408"],"related_works":[],"abstract_inverted_index":{"Multimodal":[0],"models":[1,150],"integrate":[2],"visual,":[3],"textual,":[4],"and":[5,21,76,125,142],"other":[6,78],"data":[7],"to":[8,160],"achieve":[9],"human-like":[10],"understanding,":[11],"but":[12],"this":[13,46],"fusion":[14],"creates":[15],"a":[16,56,61,90,99,110],"conflict":[17],"between":[18],"cross-modal":[19,80,140,166],"alignment":[20],"modality-specific":[22,74,105],"expertise.The":[23],"pursuit":[24],"of":[25],"unified":[26],"feature":[27],"spaces":[28],"often":[29],"undermines":[30],"specialized":[31],"knowledge":[32],"in":[33,41,165],"individual":[34],"modalities,":[35],"as":[36],"shown":[37],"by":[38,83],"performance":[39,134],"drops":[40],"unimodal":[42,152],"tasks.":[43,153],"To":[44],"resolve":[45],"contradiction,":[47],"we":[48,88],"propose":[49],"VAMP":[50],"(Variational":[51],"Alignment":[52],"with":[53],"Modality":[54],"Preservation),":[55],"novel":[57],"multimodal":[58],"framework":[59],"featuring":[60],"Dynamic":[62],"Feature":[63],"Diversion":[64],"mechanism":[65],"that":[66,95,114],"partitions":[67],"modal":[68],"representations":[69],"into":[70,98],"two":[71],"components-one":[72],"preserving":[73,104],"expertise":[75],"the":[77,161],"enabling":[79],"alignment.":[81,167],"Inspired":[82],"Variational":[84],"Canonical":[85],"Correlation":[86],"Analysis,":[87],"introduce":[89],"shared":[91],"space":[92,102],"projection":[93],"layer":[94],"maps":[96],"features":[97],"common":[100],"representational":[101],"while":[103,146],"characteristics.":[106],"We":[107],"further":[108],"implement":[109],"Progressive":[111],"Training":[112],"Strategy":[113],"sequentially":[115],"freezes":[116],"different":[117],"components":[118],"before":[119],"full":[120],"fine-tuning,":[121],"preventing":[122],"mode":[123],"collapse":[124],"enhancing":[126],"generalization":[127],"capabilities.":[128],"Experimental":[129],"results":[130],"demonstrate":[131],"VAMP's":[132],"significant":[133],"improvements":[135],"across":[136],"zero-shot":[137],"image":[138],"classification,":[139],"retrieval,":[141],"visual":[143],"question":[144],"answering,":[145],"simultaneously":[147],"outperforming":[148],"baseline":[149],"on":[151],"This":[154],"research":[155],"provides":[156],"an":[157],"engineered":[158],"solution":[159],"''knowledge":[162],"dilution''":[163],"problem":[164]},"counts_by_year":[],"updated_date":"2026-04-09T08:11:56.329763","created_date":"2025-10-25T00:00:00"}
