{"id":"https://openalex.org/W4410515544","doi":"https://doi.org/10.3390/bdcc9050137","title":"The Development of Small-Scale Language Models for Low-Resource Languages, with a Focus on Kazakh and Direct Preference Optimization","display_name":"The Development of Small-Scale Language Models for Low-Resource Languages, with a Focus on Kazakh and Direct Preference Optimization","publication_year":2025,"publication_date":"2025-05-20","ids":{"openalex":"https://openalex.org/W4410515544","doi":"https://doi.org/10.3390/bdcc9050137"},"language":"en","primary_location":{"id":"doi:10.3390/bdcc9050137","is_oa":true,"landing_page_url":"https://doi.org/10.3390/bdcc9050137","pdf_url":"https://www.mdpi.com/2504-2289/9/5/137/pdf?version=1747738200","source":{"id":"https://openalex.org/S4210238752","display_name":"Big Data and Cognitive Computing","issn_l":"2504-2289","issn":["2504-2289"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310310987","host_organization_name":"Multidisciplinary Digital Publishing Institute","host_organization_lineage":["https://openalex.org/P4310310987"],"host_organization_lineage_names":["Multidisciplinary Digital Publishing Institute"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Big Data and Cognitive Computing","raw_type":"journal-article"},"type":"article","indexed_in":["crossref","doaj"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://www.mdpi.com/2504-2289/9/5/137/pdf?version=1747738200","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5067474890","display_name":"Nurgali Kadyrbek","orcid":"https://orcid.org/0000-0002-5461-8899"},"institutions":[{"id":"https://openalex.org/I185571130","display_name":"Al-Farabi Kazakh National University","ror":"https://ror.org/03q0vrn42","country_code":"KZ","type":"education","lineage":["https://openalex.org/I185571130"]}],"countries":["KZ"],"is_corresponding":true,"raw_author_name":"Nurgali Kadyrbek","raw_affiliation_strings":["Department of AI & Big Data, Faculty of Information Technologies, Al-Farabi Kazakh National University, Almaty 050040, Kazakhstan"],"affiliations":[{"raw_affiliation_string":"Department of AI & Big Data, Faculty of Information Technologies, Al-Farabi Kazakh National University, Almaty 050040, Kazakhstan","institution_ids":["https://openalex.org/I185571130"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5072518144","display_name":"\u0416\u0430\u043d\u0441\u0435\u0438\u0442 \u041a\u0430\u043d\u0441\u0435\u0438\u0442\u043e\u0432\u0438\u0447 \u0422\u0443\u0439\u043c\u0435\u0431\u0430\u0435\u0432","orcid":"https://orcid.org/0000-0001-5495-1686"},"institutions":[{"id":"https://openalex.org/I185571130","display_name":"Al-Farabi Kazakh National University","ror":"https://ror.org/03q0vrn42","country_code":"KZ","type":"education","lineage":["https://openalex.org/I185571130"]}],"countries":["KZ"],"is_corresponding":false,"raw_author_name":"Zhanseit Tuimebayev","raw_affiliation_strings":["Al-Farabi Kazakh National University, Almaty 050040, Kazakhstan"],"affiliations":[{"raw_affiliation_string":"Al-Farabi Kazakh National University, Almaty 050040, Kazakhstan","institution_ids":["https://openalex.org/I185571130"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5049620410","display_name":"\u041c\u0430\u0434\u0438\u043d\u0430 \u041c\u0430\u043d\u0441\u0443\u0440\u043e\u0432\u0430","orcid":"https://orcid.org/0000-0002-9680-2758"},"institutions":[{"id":"https://openalex.org/I185571130","display_name":"Al-Farabi Kazakh National University","ror":"https://ror.org/03q0vrn42","country_code":"KZ","type":"education","lineage":["https://openalex.org/I185571130"]}],"countries":["KZ"],"is_corresponding":false,"raw_author_name":"Madina Mansurova","raw_affiliation_strings":["Department of AI & Big Data, Faculty of Information Technologies, Al-Farabi Kazakh National University, Almaty 050040, Kazakhstan"],"affiliations":[{"raw_affiliation_string":"Department of AI & Big Data, Faculty of Information Technologies, Al-Farabi Kazakh National University, Almaty 050040, Kazakhstan","institution_ids":["https://openalex.org/I185571130"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5017897629","display_name":"V\u00edtor V\u00edegas","orcid":"https://orcid.org/0000-0002-8929-1574"},"institutions":[{"id":"https://openalex.org/I4210120471","display_name":"Instituto de Telecomunica\u00e7\u00f5es","ror":"https://ror.org/02ht4fk33","country_code":"PT","type":"nonprofit","lineage":["https://openalex.org/I4210120471"]},{"id":"https://openalex.org/I60858718","display_name":"University of Aveiro","ror":"https://ror.org/00nt41z93","country_code":"PT","type":"education","lineage":["https://openalex.org/I60858718"]}],"countries":["PT"],"is_corresponding":false,"raw_author_name":"V\u00edtor Viegas","raw_affiliation_strings":["Instituto de Telecomunica\u00e7\u00f5es, Universidade de Aveiro, 1049-001 Lisbon, Portugal"],"affiliations":[{"raw_affiliation_string":"Instituto de Telecomunica\u00e7\u00f5es, Universidade de Aveiro, 1049-001 Lisbon, Portugal","institution_ids":["https://openalex.org/I4210120471","https://openalex.org/I60858718"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5067474890"],"corresponding_institution_ids":["https://openalex.org/I185571130"],"apc_list":{"value":1400,"currency":"CHF","value_usd":1515},"apc_paid":{"value":1400,"currency":"CHF","value_usd":1515},"fwci":13.5599,"has_fulltext":true,"cited_by_count":6,"citation_normalized_percentile":{"value":0.98475015,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":96,"max":98},"biblio":{"volume":"9","issue":"5","first_page":"137","last_page":"137"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.8184000253677368,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.8184000253677368,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/kazakh","display_name":"Kazakh","score":0.9032620787620544},{"id":"https://openalex.org/keywords/focus","display_name":"Focus (optics)","score":0.6969329118728638},{"id":"https://openalex.org/keywords/preference","display_name":"Preference","score":0.5944530963897705},{"id":"https://openalex.org/keywords/resource","display_name":"Resource (disambiguation)","score":0.5325941443443298},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.5214335322380066},{"id":"https://openalex.org/keywords/scale","display_name":"Scale (ratio)","score":0.4652380049228668},{"id":"https://openalex.org/keywords/linguistics","display_name":"Linguistics","score":0.34991031885147095},{"id":"https://openalex.org/keywords/geography","display_name":"Geography","score":0.17913952469825745},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.11578232049942017},{"id":"https://openalex.org/keywords/physics","display_name":"Physics","score":0.09412628412246704},{"id":"https://openalex.org/keywords/statistics","display_name":"Statistics","score":0.07018983364105225},{"id":"https://openalex.org/keywords/philosophy","display_name":"Philosophy","score":0.06345438957214355}],"concepts":[{"id":"https://openalex.org/C2781297163","wikidata":"https://www.wikidata.org/wiki/Q9252","display_name":"Kazakh","level":2,"score":0.9032620787620544},{"id":"https://openalex.org/C192209626","wikidata":"https://www.wikidata.org/wiki/Q190909","display_name":"Focus (optics)","level":2,"score":0.6969329118728638},{"id":"https://openalex.org/C2781249084","wikidata":"https://www.wikidata.org/wiki/Q908656","display_name":"Preference","level":2,"score":0.5944530963897705},{"id":"https://openalex.org/C206345919","wikidata":"https://www.wikidata.org/wiki/Q20380951","display_name":"Resource (disambiguation)","level":2,"score":0.5325941443443298},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5214335322380066},{"id":"https://openalex.org/C2778755073","wikidata":"https://www.wikidata.org/wiki/Q10858537","display_name":"Scale (ratio)","level":2,"score":0.4652380049228668},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.34991031885147095},{"id":"https://openalex.org/C205649164","wikidata":"https://www.wikidata.org/wiki/Q1071","display_name":"Geography","level":0,"score":0.17913952469825745},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.11578232049942017},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.09412628412246704},{"id":"https://openalex.org/C105795698","wikidata":"https://www.wikidata.org/wiki/Q12483","display_name":"Statistics","level":1,"score":0.07018983364105225},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.06345438957214355},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.0},{"id":"https://openalex.org/C120665830","wikidata":"https://www.wikidata.org/wiki/Q14620","display_name":"Optics","level":1,"score":0.0},{"id":"https://openalex.org/C58640448","wikidata":"https://www.wikidata.org/wiki/Q42515","display_name":"Cartography","level":1,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.3390/bdcc9050137","is_oa":true,"landing_page_url":"https://doi.org/10.3390/bdcc9050137","pdf_url":"https://www.mdpi.com/2504-2289/9/5/137/pdf?version=1747738200","source":{"id":"https://openalex.org/S4210238752","display_name":"Big Data and Cognitive Computing","issn_l":"2504-2289","issn":["2504-2289"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310310987","host_organization_name":"Multidisciplinary Digital Publishing Institute","host_organization_lineage":["https://openalex.org/P4310310987"],"host_organization_lineage_names":["Multidisciplinary Digital Publishing Institute"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Big Data and Cognitive Computing","raw_type":"journal-article"},{"id":"pmh:oai:doaj.org/article:2a87739d44e84aebae1322386ed69a60","is_oa":true,"landing_page_url":"https://doaj.org/article/2a87739d44e84aebae1322386ed69a60","pdf_url":null,"source":{"id":"https://openalex.org/S112646816","display_name":"SHILAP Revista de lepidopterolog\u00eda","issn_l":"0300-5267","issn":["0300-5267","2340-4078"],"is_oa":true,"is_in_doaj":true,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"Big Data and Cognitive Computing, Vol 9, Iss 5, p 137 (2025)","raw_type":"article"}],"best_oa_location":{"id":"doi:10.3390/bdcc9050137","is_oa":true,"landing_page_url":"https://doi.org/10.3390/bdcc9050137","pdf_url":"https://www.mdpi.com/2504-2289/9/5/137/pdf?version=1747738200","source":{"id":"https://openalex.org/S4210238752","display_name":"Big Data and Cognitive Computing","issn_l":"2504-2289","issn":["2504-2289"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310310987","host_organization_name":"Multidisciplinary Digital Publishing Institute","host_organization_lineage":["https://openalex.org/P4310310987"],"host_organization_lineage_names":["Multidisciplinary Digital Publishing Institute"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Big Data and Cognitive Computing","raw_type":"journal-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":true,"grobid_xml":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4410515544.pdf","grobid_xml":"https://content.openalex.org/works/W4410515544.grobid-xml"},"referenced_works_count":12,"referenced_works":["https://openalex.org/W3155806510","https://openalex.org/W4396689894","https://openalex.org/W4399547912","https://openalex.org/W4400680016","https://openalex.org/W4400765382","https://openalex.org/W4401358104","https://openalex.org/W4401636979","https://openalex.org/W4401974878","https://openalex.org/W4402670893","https://openalex.org/W4407811705","https://openalex.org/W4409361885","https://openalex.org/W4411630341"],"related_works":["https://openalex.org/W2363093443","https://openalex.org/W4206915444","https://openalex.org/W4388766676","https://openalex.org/W2353302181","https://openalex.org/W2362181825","https://openalex.org/W2320680668","https://openalex.org/W2901624366","https://openalex.org/W2519027122","https://openalex.org/W2900953188","https://openalex.org/W2371351259"],"abstract_inverted_index":{"Low-resource":[0],"languages":[1,199],"remain":[2],"underserved":[3],"by":[4,23,94,107],"contemporary":[5],"large":[6],"language":[7,89,194],"models":[8],"(LLMs)":[9],"because":[10],"they":[11],"lack":[12],"sizable":[13],"corpora,":[14],"bespoke":[15],"preprocessing":[16],"tools,":[17],"and":[18,56,65,77,109,134,150,152,160,169,175,180,200],"the":[19,71,161],"computing":[20],"budgets":[21],"assumed":[22],"mainstream":[24],"alignment":[25],"pipelines.":[26],"Focusing":[27],"on":[28],"Kazakh,":[29],"we":[30],"present":[31],"a":[32,59,67,127,188],"1.94B":[33],"parameter":[34],"LLaMA-based":[35],"model":[36,82,179],"that":[37,69,140,156],"demonstrates":[38],"how":[39],"strong,":[40],"culturally":[41],"aligned":[42],"performance":[43],"can":[44],"be":[45],"achieved":[46],"without":[47],"massive":[48],"infrastructure.":[49],"The":[50,178],"contribution":[51],"is":[52,83,98],"threefold.":[53],"(i)":[54],"Data":[55],"tokenization\u2014we":[57],"compile":[58],"rigorously":[60],"cleaned,":[61],"mixed-domain":[62],"Kazakh":[63],"corpus":[64],"design":[66],"tokenizer":[68],"respects":[70],"language\u2019s":[72],"agglutinative":[73],"morphology,":[74],"mixed-script":[75],"usage,":[76],"diacritics.":[78],"(ii)":[79],"Training":[80],"recipe\u2014the":[81],"built":[84],"in":[85],"two":[86],"stages:":[87],"causal":[88],"modeling":[90,195],"from":[91],"scratch":[92],"followed":[93],"instruction":[95,142],"tuning.":[96],"Alignment":[97],"further":[99],"refined":[100],"with":[101],"Direct":[102],"Preference":[103],"Optimization":[104],"(DPO),":[105],"extended":[106],"contrastive":[108],"entropy-based":[110],"regularization":[111],"to":[112,144,196],"stabilize":[113],"training":[114,159],"under":[115,184],"sparse,":[116],"noisy":[117],"preference":[118,132],"signals.":[119],"Two":[120],"complementary":[121],"resources":[122],"support":[123],"this":[124],"step:":[125],"ChatTune-DPO,":[126],"crowd-sourced":[128],"set":[129],"of":[130],"human":[131],"pairs,":[133],"Pseudo-DPO,":[135],"an":[136],"automatically":[137],"generated":[138],"alternative":[139],"repurposes":[141],"data":[143],"reduce":[145],"annotation":[146],"cost.":[147],"(iii)":[148],"Evaluation":[149],"impact\u2014qualitative":[151],"task-specific":[153],"assessments":[154],"show":[155],"targeted":[157],"monolingual":[158],"proposed":[162],"DPO":[163],"variant":[164],"markedly":[165],"improve":[166],"factuality,":[167],"coherence,":[168],"cultural":[170],"fidelity":[171],"over":[172],"baseline":[173],"instruction-only":[174],"multilingual":[176],"counterparts.":[177],"datasets":[181],"are":[182],"released":[183],"open":[185],"licenses,":[186],"offering":[187],"reproducible":[189],"blueprint":[190],"for":[191],"extending":[192],"state-of-the-art":[193],"other":[197],"under-represented":[198],"domains.":[201]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":5}],"updated_date":"2026-04-17T18:11:37.981687","created_date":"2025-10-10T00:00:00"}
