{"id":"https://openalex.org/W7161256997","doi":"https://doi.org/10.48550/arxiv.2605.14366","title":"Reinforcement Learning with Semantic Rewards Enables Low-Resource Language Expansion without Alignment Tax","display_name":"Reinforcement Learning with Semantic Rewards Enables Low-Resource Language Expansion without Alignment Tax","publication_year":2026,"publication_date":"2026-05-14","ids":{"openalex":"https://openalex.org/W7161256997","doi":"https://doi.org/10.48550/arxiv.2605.14366"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.14366","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.14366","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.14366","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5121319865","display_name":"Zeli Su","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Su, Zeli","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136190521","display_name":"Ziyin Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Ziyin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136220022","display_name":"Zhou Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Zhou","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136214581","display_name":"Xuexian Song","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Song, Xuexian","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136219478","display_name":"Zhankai Xu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xu, Zhankai","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5008993283","display_name":"Longfei Zheng","orcid":"https://orcid.org/0000-0003-3604-2598"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zheng, Longfei","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136191994","display_name":"Xiaolu Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Xiaolu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136254399","display_name":"Rong Fu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Fu, Rong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136240902","display_name":"Guixian Xu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xu, Guixian","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5136191875","display_name":"Wentao Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Wentao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":10,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.21649999916553497,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.21649999916553497,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.16949999332427979,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.06800000369548798,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/reinforcement-learning","display_name":"Reinforcement learning","score":0.6402000188827515},{"id":"https://openalex.org/keywords/headline","display_name":"Headline","score":0.5426999926567078},{"id":"https://openalex.org/keywords/bridging","display_name":"Bridging (networking)","score":0.49720001220703125},{"id":"https://openalex.org/keywords/forgetting","display_name":"Forgetting","score":0.42669999599456787},{"id":"https://openalex.org/keywords/machine-translation","display_name":"Machine translation","score":0.41499999165534973},{"id":"https://openalex.org/keywords/safer","display_name":"SAFER","score":0.38119998574256897},{"id":"https://openalex.org/keywords/semantic-similarity","display_name":"Semantic similarity","score":0.37279999256134033},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.3716999888420105}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.753000020980835},{"id":"https://openalex.org/C97541855","wikidata":"https://www.wikidata.org/wiki/Q830687","display_name":"Reinforcement learning","level":2,"score":0.6402000188827515},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5910999774932861},{"id":"https://openalex.org/C2778689934","wikidata":"https://www.wikidata.org/wiki/Q1313396","display_name":"Headline","level":2,"score":0.5426999926567078},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.5213000178337097},{"id":"https://openalex.org/C174348530","wikidata":"https://www.wikidata.org/wiki/Q188635","display_name":"Bridging (networking)","level":2,"score":0.49720001220703125},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.42820000648498535},{"id":"https://openalex.org/C7149132","wikidata":"https://www.wikidata.org/wiki/Q1377840","display_name":"Forgetting","level":2,"score":0.42669999599456787},{"id":"https://openalex.org/C203005215","wikidata":"https://www.wikidata.org/wiki/Q79798","display_name":"Machine translation","level":2,"score":0.41499999165534973},{"id":"https://openalex.org/C2776654903","wikidata":"https://www.wikidata.org/wiki/Q2601463","display_name":"SAFER","level":2,"score":0.38119998574256897},{"id":"https://openalex.org/C130318100","wikidata":"https://www.wikidata.org/wiki/Q2268914","display_name":"Semantic similarity","level":2,"score":0.37279999256134033},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.3716999888420105},{"id":"https://openalex.org/C90312973","wikidata":"https://www.wikidata.org/wiki/Q7449052","display_name":"Semantic data model","level":2,"score":0.3280999958515167},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.3231000006198883},{"id":"https://openalex.org/C197914299","wikidata":"https://www.wikidata.org/wiki/Q18650","display_name":"Semantic memory","level":3,"score":0.2948000133037567},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.289000004529953},{"id":"https://openalex.org/C108650721","wikidata":"https://www.wikidata.org/wiki/Q1783253","display_name":"Counterfactual thinking","level":2,"score":0.2782000005245209},{"id":"https://openalex.org/C153083717","wikidata":"https://www.wikidata.org/wiki/Q6535263","display_name":"Leverage (statistics)","level":2,"score":0.27720001339912415},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.25839999318122864},{"id":"https://openalex.org/C152124472","wikidata":"https://www.wikidata.org/wiki/Q1204361","display_name":"Redundancy (engineering)","level":2,"score":0.2551000118255615},{"id":"https://openalex.org/C2780922921","wikidata":"https://www.wikidata.org/wiki/Q255189","display_name":"Paraphrase","level":2,"score":0.251800000667572}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.14366","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.14366","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.14366","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.14366","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Extending":[0],"large":[1],"language":[2,17,185],"models":[3],"(LLMs)":[4],"to":[5],"low-resource":[6,118,184],"languages":[7],"often":[8],"incurs":[9],"an":[10],"\"alignment":[11],"tax\":":[12],"improvements":[13],"in":[14,25,146],"the":[15,20,35,70],"target":[16],"come":[18],"at":[19],"cost":[21],"of":[22,37],"catastrophic":[23],"forgetting":[24],"general":[26,126],"capabilities.":[27],"We":[28,100],"argue":[29],"that":[30,93,114,154,169],"this":[31,54],"trade-off":[32],"arises":[33],"from":[34],"rigidity":[36],"supervised":[38],"fine-tuning":[39],"(SFT),":[40],"which":[41],"enforces":[42],"token-level":[43],"surface":[44,136],"imitation":[45],"on":[46,104],"narrow":[47],"and":[48,108,144,149,159,178],"biased":[49],"data":[50],"distributions.":[51],"To":[52],"address":[53],"limitation,":[55],"we":[56],"propose":[57],"a":[58,176],"semantic-space":[59],"alignment":[60,123],"paradigm":[61],"powered":[62],"by":[63],"Group":[64],"Relative":[65],"Policy":[66],"Optimization":[67],"(GRPO),":[68],"where":[69],"model":[71],"is":[72],"optimized":[73],"using":[74],"embedding-level":[75],"semantic":[76,138,142,173],"rewards":[77,174],"rather":[78],"than":[79,130],"likelihood":[80],"maximization.":[81],"This":[82],"objective":[83],"encourages":[84],"meaning":[85],"preservation":[86],"through":[87],"flexible":[88],"realizations,":[89],"enabling":[90],"controlled":[91],"updates":[92],"reduce":[94],"destructive":[95],"interference":[96],"with":[97,172],"pretrained":[98],"knowledge.":[99],"evaluate":[101],"our":[102,115,166],"approach":[103],"Tibetan-Chinese":[105],"machine":[106],"translation":[107],"Tibetan":[109],"headline":[110],"generation.":[111],"Experiments":[112],"show":[113],"method":[116],"acquires":[117],"capabilities":[119],"while":[120],"markedly":[121],"mitigating":[122],"tax,":[124],"preserving":[125],"competence":[127],"more":[128,157,179],"effectively":[129],"SFT.":[131],"Despite":[132],"producing":[133],"less":[134],"rigid":[135],"overlap,":[137],"RL":[139],"yields":[140],"higher":[141],"quality":[143],"preference":[145],"open-ended":[147],"generation,":[148],"few-shot":[150],"transfer":[151],"results":[152],"indicate":[153],"it":[155],"learns":[156],"transferable":[158],"robust":[160],"representations":[161],"under":[162],"limited":[163],"supervision.":[164],"Overall,":[165],"study":[167],"demonstrates":[168],"reinforcement":[170],"learning":[171],"provides":[175],"safer":[177],"reliable":[180],"pathway":[181],"for":[182],"inclusive":[183],"expansion.":[186]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-05-16T00:00:00"}
