{"id":"https://openalex.org/W7157223345","doi":"https://doi.org/10.48550/arxiv.2604.22823","title":"PivotMerge: Bridging Heterogeneous Multimodal Pre-training via Post-Alignment Model Merging","display_name":"PivotMerge: Bridging Heterogeneous Multimodal Pre-training via Post-Alignment Model Merging","publication_year":2026,"publication_date":"2026-04-18","ids":{"openalex":"https://openalex.org/W7157223345","doi":"https://doi.org/10.48550/arxiv.2604.22823"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.22823","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.22823","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.22823","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5058918933","display_name":"Zibo Shao","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Shao, Zibo","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5050573340","display_name":"Baochen Xiong","orcid":"https://orcid.org/0009-0005-9799-2898"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xiong, Baochen","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134757392","display_name":"Xiaoshan Yang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Xiaoshan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134777297","display_name":"Yaguang Song","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Song, Yaguang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5001453355","display_name":"Qimeng Zhang","orcid":"https://orcid.org/0000-0003-4679-7919"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Qimeng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134783332","display_name":"Haifeng Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Haifeng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5134751040","display_name":"Changsheng Xu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xu, Changsheng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5058918933"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.7411999702453613,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.7411999702453613,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.09470000118017197,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.042899999767541885,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/bridging","display_name":"Bridging (networking)","score":0.7282000184059143},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.6086999773979187},{"id":"https://openalex.org/keywords/construct","display_name":"Construct (python library)","score":0.5921000242233276},{"id":"https://openalex.org/keywords/generalization","display_name":"Generalization","score":0.5723000168800354},{"id":"https://openalex.org/keywords/core","display_name":"Core (optical fiber)","score":0.4194999933242798},{"id":"https://openalex.org/keywords/exploit","display_name":"Exploit","score":0.3292999863624573},{"id":"https://openalex.org/keywords/decomposition","display_name":"Decomposition","score":0.3181999921798706},{"id":"https://openalex.org/keywords/semantic-mapping","display_name":"Semantic mapping","score":0.31630000472068787}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8109999895095825},{"id":"https://openalex.org/C174348530","wikidata":"https://www.wikidata.org/wiki/Q188635","display_name":"Bridging (networking)","level":2,"score":0.7282000184059143},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.6086999773979187},{"id":"https://openalex.org/C2780801425","wikidata":"https://www.wikidata.org/wiki/Q5164392","display_name":"Construct (python library)","level":2,"score":0.5921000242233276},{"id":"https://openalex.org/C177148314","wikidata":"https://www.wikidata.org/wiki/Q170084","display_name":"Generalization","level":2,"score":0.5723000168800354},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4399000108242035},{"id":"https://openalex.org/C2164484","wikidata":"https://www.wikidata.org/wiki/Q5170150","display_name":"Core (optical fiber)","level":2,"score":0.4194999933242798},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.350600004196167},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.34619998931884766},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.3379000127315521},{"id":"https://openalex.org/C165696696","wikidata":"https://www.wikidata.org/wiki/Q11287","display_name":"Exploit","level":2,"score":0.3292999863624573},{"id":"https://openalex.org/C124681953","wikidata":"https://www.wikidata.org/wiki/Q339062","display_name":"Decomposition","level":2,"score":0.3181999921798706},{"id":"https://openalex.org/C2775955345","wikidata":"https://www.wikidata.org/wiki/Q7449071","display_name":"Semantic mapping","level":2,"score":0.31630000472068787},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.2971999943256378},{"id":"https://openalex.org/C135641252","wikidata":"https://www.wikidata.org/wiki/Q738567","display_name":"Multimodal interaction","level":2,"score":0.2935999929904938},{"id":"https://openalex.org/C168167062","wikidata":"https://www.wikidata.org/wiki/Q1117970","display_name":"Component (thermodynamics)","level":2,"score":0.2930999994277954},{"id":"https://openalex.org/C2778180026","wikidata":"https://www.wikidata.org/wiki/Q18378163","display_name":"Semantic heterogeneity","level":4,"score":0.2881999909877777},{"id":"https://openalex.org/C45493050","wikidata":"https://www.wikidata.org/wiki/Q7884934","display_name":"Unified Model","level":2,"score":0.2824999988079071},{"id":"https://openalex.org/C2777212361","wikidata":"https://www.wikidata.org/wiki/Q5127848","display_name":"Class (philosophy)","level":2,"score":0.2775999903678894},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.26930001378059387},{"id":"https://openalex.org/C67186912","wikidata":"https://www.wikidata.org/wiki/Q367664","display_name":"Data modeling","level":2,"score":0.2667999863624573},{"id":"https://openalex.org/C90312973","wikidata":"https://www.wikidata.org/wiki/Q7449052","display_name":"Semantic data model","level":2,"score":0.2574999928474426}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.22823","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.22823","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.22823","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.22823","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Multimodal":[0],"Large":[1],"Language":[2],"Models":[3],"(MLLMs)":[4],"rely":[5],"on":[6,47,182,197],"multimodal":[7,101,199],"pre-training":[8,52,63],"over":[9],"diverse":[10],"data":[11,118],"sources,":[12],"where":[13,112,128],"different":[14,117,129],"datasets":[15],"often":[16],"induce":[17],"complementary":[18,34],"cross-modal":[19,68,95,136,149],"alignment":[20,96,125,163,184],"capabilities.":[21],"Model":[22],"merging":[23,43,89,146,179,191],"provides":[24],"a":[25,37,77,144],"cost-effective":[26],"mechanism":[27],"for":[28,148,193],"integrating":[29],"multiple":[30,198],"expert":[31],"MLLMs":[32],"with":[33],"strengths":[35],"into":[36,76],"unified":[38,78],"model.":[39],"However,":[40],"existing":[41,206],"model":[42],"research":[44],"mainly":[45],"focuses":[46],"post-finetuning":[48],"scenarios,":[49],"leaving":[50],"the":[51,59,87],"stage":[53],"largely":[54],"unexplored.":[55],"We":[56,186],"argue":[57],"that":[58,202],"core":[60],"of":[61],"MLLM":[62],"lies":[64],"in":[65],"establishing":[66],"effective":[67],"alignment,":[69],"which":[70,91,160,176],"bridges":[71],"visual":[72],"and":[73,123,131,158,168,172,211],"textual":[74],"representations":[75],"semantic":[79],"space.":[80],"Motivated":[81],"by":[82],"this":[83],"insight,":[84],"we":[85,141],"introduce":[86],"post-alignment":[88,145,190],"task,":[90],"aims":[92],"to":[93,135],"integrate":[94],"capabilities":[97],"learned":[98,115],"from":[99,116,165],"heterogeneous":[100],"pre-training.":[102],"This":[103],"setting":[104],"introduces":[105],"two":[106,153],"key":[107,154],"challenges:":[108],"cross-domain":[109],"parameter":[110,113],"interference,":[111],"updates":[114],"distributions":[119],"conflict":[120],"during":[121],"merging,":[122],"layer-wise":[124],"contribution":[126],"disparity,":[127],"layers":[130],"projectors":[132],"contribute":[133],"unevenly":[134],"alignment.":[137],"To":[138],"address":[139],"them,":[140],"propose":[142],"\\textbf{PivotMerge},":[143],"framework":[147],"projectors.":[150],"PivotMerge":[151,203],"incorporates":[152],"components:":[155],"Shared-space":[156],"Decomposition":[157],"Filtering,":[159],"disentangles":[161],"shared":[162],"patterns":[164],"domain-specific":[166],"variations":[167],"suppresses":[169],"conflicting":[170],"directions,":[171],"Alignment-guided":[173],"Layer-wise":[174],"Merging,":[175],"assigns":[177],"layer-specific":[178],"weights":[180],"based":[181],"differing":[183],"contributions.":[185],"construct":[187],"systematic":[188],"CC12M-based":[189],"scenarios":[192],"evaluation.":[194],"Extensive":[195],"experiments":[196],"benchmarks":[200],"show":[201],"consistently":[204],"outperforms":[205],"baselines,":[207],"demonstrating":[208],"its":[209],"effectiveness":[210],"generalization":[212],"ability.":[213]},"counts_by_year":[],"updated_date":"2026-04-29T06:16:36.941037","created_date":"2026-04-29T00:00:00"}
