{"id":"https://openalex.org/W4400612084","doi":"https://doi.org/10.48550/arxiv.2407.08699","title":"Mitigating Catastrophic Forgetting in Language Transfer via Model Merging","display_name":"Mitigating Catastrophic Forgetting in Language Transfer via Model Merging","publication_year":2024,"publication_date":"2024-07-11","ids":{"openalex":"https://openalex.org/W4400612084","doi":"https://doi.org/10.48550/arxiv.2407.08699"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2407.08699","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2407.08699","pdf_url":"https://arxiv.org/pdf/2407.08699","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":null},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2407.08699","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5101939785","display_name":"Anton Alexandrov","orcid":"https://orcid.org/0009-0003-8147-7125"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Alexandrov, Anton","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5051227985","display_name":"Veselin Raychev","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Raychev, Veselin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5063192710","display_name":"Mark Niklas M\u00fcller","orcid":"https://orcid.org/0000-0002-2496-6542"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"M\u00fcller, Mark Niklas","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100383735","display_name":"Ce Zhang","orcid":"https://orcid.org/0000-0003-1284-7279"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Ce","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5069901599","display_name":"Martin Vechev","orcid":"https://orcid.org/0000-0002-0054-9568"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Vechev, Martin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5053947885","display_name":"Kristina Toutanova","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Toutanova, Kristina","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5101939785"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9958999752998352,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9958999752998352,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9952999949455261,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/forgetting","display_name":"Forgetting","score":0.7899576425552368},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.5029491782188416},{"id":"https://openalex.org/keywords/transfer","display_name":"Transfer (computing)","score":0.4982430934906006},{"id":"https://openalex.org/keywords/cognitive-psychology","display_name":"Cognitive psychology","score":0.24332624673843384},{"id":"https://openalex.org/keywords/psychology","display_name":"Psychology","score":0.2200911045074463},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.07054638862609863}],"concepts":[{"id":"https://openalex.org/C7149132","wikidata":"https://www.wikidata.org/wiki/Q1377840","display_name":"Forgetting","level":2,"score":0.7899576425552368},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5029491782188416},{"id":"https://openalex.org/C2776175482","wikidata":"https://www.wikidata.org/wiki/Q1195816","display_name":"Transfer (computing)","level":2,"score":0.4982430934906006},{"id":"https://openalex.org/C180747234","wikidata":"https://www.wikidata.org/wiki/Q23373","display_name":"Cognitive psychology","level":1,"score":0.24332624673843384},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.2200911045074463},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.07054638862609863}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2407.08699","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2407.08699","pdf_url":"https://arxiv.org/pdf/2407.08699","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":null},{"id":"doi:10.48550/arxiv.2407.08699","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2407.08699","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2407.08699","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2407.08699","pdf_url":"https://arxiv.org/pdf/2407.08699","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":null},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4400612084.pdf"},"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2748952813","https://openalex.org/W4289718052","https://openalex.org/W2164121020","https://openalex.org/W2145559838","https://openalex.org/W2905319430","https://openalex.org/W3116498279","https://openalex.org/W4287549553","https://openalex.org/W4310285384","https://openalex.org/W3183027292"],"abstract_inverted_index":{"As":[0],"open-weight":[1],"large":[2],"language":[3,30],"models":[4,24],"(LLMs)":[5],"achieve":[6],"ever":[7],"more":[8],"impressive":[9],"performances":[10],"across":[11,141],"a":[12,59,71],"wide":[13],"range":[14],"of":[15,38,47,73,96],"tasks":[16],"in":[17,109],"English,":[18],"practitioners":[19],"aim":[20],"to":[21,25,133],"adapt":[22],"these":[23],"different":[26,142],"languages.":[27],"However,":[28],"such":[29],"adaptation":[31,61],"is":[32,79],"often":[33],"accompanied":[34],"by":[35,55],"catastrophic":[36],"forgetting":[37,95,123],"the":[39,45,48,74,82,97,104],"base":[40],"model's":[41],"capabilities,":[42],"severely":[43],"limiting":[44],"usefulness":[46],"resulting":[49],"model.":[50],"We":[51,107],"address":[52],"this":[53,85],"issue":[54],"proposing":[56],"Branch-and-Merge":[57],"(BaM),":[58],"new":[60],"method":[62],"based":[63,80],"on":[64,70,81,103,114],"iteratively":[65],"merging":[66],"multiple":[67],"models,":[68],"fine-tuned":[69],"subset":[72],"available":[75],"training":[76],"data.":[77],"BaM":[78,119],"insight":[83],"that":[84,118],"yields":[86],"lower":[87],"magnitude":[88],"but":[89],"higher":[90],"quality":[91],"weight":[92],"changes,":[93],"reducing":[94],"source":[98],"domain":[99,130],"while":[100,124],"maintaining":[101],"learning":[102],"target":[105,129],"domain.":[106],"demonstrate":[108],"an":[110],"extensive":[111],"empirical":[112],"study":[113],"Bulgarian":[115],"and":[116,138],"German":[117],"can":[120],"significantly":[121],"reduce":[122],"matching":[125],"or":[126],"even":[127],"improving":[128],"performance":[131],"compared":[132],"both":[134],"standard":[135],"continued":[136],"pretraining":[137],"instruction":[139],"finetuning":[140],"model":[143],"architectures.":[144]},"counts_by_year":[],"updated_date":"2025-11-06T06:51:31.235846","created_date":"2025-10-10T00:00:00"}
