{"id":"https://openalex.org/W4410038795","doi":"https://doi.org/10.3390/computers14050172","title":"A Framework for Domain-Specific Dataset Creation and Adaptation of Large Language Models","display_name":"A Framework for Domain-Specific Dataset Creation and Adaptation of Large Language Models","publication_year":2025,"publication_date":"2025-05-02","ids":{"openalex":"https://openalex.org/W4410038795","doi":"https://doi.org/10.3390/computers14050172"},"language":"en","primary_location":{"id":"doi:10.3390/computers14050172","is_oa":true,"landing_page_url":"https://doi.org/10.3390/computers14050172","pdf_url":"https://www.mdpi.com/2073-431X/14/5/172/pdf?version=1746170490","source":{"id":"https://openalex.org/S4210228075","display_name":"Computers","issn_l":"2073-431X","issn":["2073-431X"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310310987","host_organization_name":"Multidisciplinary Digital Publishing Institute","host_organization_lineage":["https://openalex.org/P4310310987"],"host_organization_lineage_names":["Multidisciplinary Digital Publishing Institute"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Computers","raw_type":"journal-article"},"type":"article","indexed_in":["crossref","doaj"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://www.mdpi.com/2073-431X/14/5/172/pdf?version=1746170490","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5073623094","display_name":"George Balaskas","orcid":"https://orcid.org/0000-0002-6829-0441"},"institutions":[{"id":"https://openalex.org/I203474044","display_name":"National Centre of Scientific Research \"Demokritos\"","ror":"https://ror.org/038jp4m40","country_code":"GR","type":"facility","lineage":["https://openalex.org/I203474044"]},{"id":"https://openalex.org/I154757721","display_name":"University of Piraeus","ror":"https://ror.org/02qs84g94","country_code":"GR","type":"education","lineage":["https://openalex.org/I154757721"]},{"id":"https://openalex.org/I4387152169","display_name":"Institute of Informatics & Telecommunications","ror":"https://ror.org/0396t6k89","country_code":null,"type":"facility","lineage":["https://openalex.org/I203474044","https://openalex.org/I4387152169"]}],"countries":["GR"],"is_corresponding":true,"raw_author_name":"George Balaskas","raw_affiliation_strings":["Department of Digital Systems, University of Piraeus, Karaoli ke Dimitriou, 185 34 Pireas, Greece","Institute of Informatics and Telecommunications, NCSR Demokritos, Ag. Paraskevi, 153 41 Athens, Greece"],"affiliations":[{"raw_affiliation_string":"Department of Digital Systems, University of Piraeus, Karaoli ke Dimitriou, 185 34 Pireas, Greece","institution_ids":["https://openalex.org/I154757721"]},{"raw_affiliation_string":"Institute of Informatics and Telecommunications, NCSR Demokritos, Ag. Paraskevi, 153 41 Athens, Greece","institution_ids":["https://openalex.org/I203474044","https://openalex.org/I4387152169"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5070253818","display_name":"Homer Papadopoulos","orcid":"https://orcid.org/0000-0002-3669-3094"},"institutions":[{"id":"https://openalex.org/I4210143585","display_name":"Syndesis (Greece)","ror":"https://ror.org/04sh23172","country_code":"GR","type":"company","lineage":["https://openalex.org/I4210143585"]},{"id":"https://openalex.org/I4387152169","display_name":"Institute of Informatics & Telecommunications","ror":"https://ror.org/0396t6k89","country_code":null,"type":"facility","lineage":["https://openalex.org/I203474044","https://openalex.org/I4387152169"]},{"id":"https://openalex.org/I203474044","display_name":"National Centre of Scientific Research \"Demokritos\"","ror":"https://ror.org/038jp4m40","country_code":"GR","type":"facility","lineage":["https://openalex.org/I203474044"]}],"countries":["GR"],"is_corresponding":false,"raw_author_name":"Homer Papadopoulos","raw_affiliation_strings":["Institute of Informatics and Telecommunications, NCSR Demokritos, Ag. Paraskevi, 153 41 Athens, Greece","Syndesis Ltd., Ag. Paraskevi, 153 41 Athens, Greece"],"affiliations":[{"raw_affiliation_string":"Institute of Informatics and Telecommunications, NCSR Demokritos, Ag. Paraskevi, 153 41 Athens, Greece","institution_ids":["https://openalex.org/I203474044","https://openalex.org/I4387152169"]},{"raw_affiliation_string":"Syndesis Ltd., Ag. Paraskevi, 153 41 Athens, Greece","institution_ids":["https://openalex.org/I4210143585"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5002089816","display_name":"Dimitra Pappa","orcid":"https://orcid.org/0000-0002-4506-6690"},"institutions":[{"id":"https://openalex.org/I4387152169","display_name":"Institute of Informatics & Telecommunications","ror":"https://ror.org/0396t6k89","country_code":null,"type":"facility","lineage":["https://openalex.org/I203474044","https://openalex.org/I4387152169"]},{"id":"https://openalex.org/I203474044","display_name":"National Centre of Scientific Research \"Demokritos\"","ror":"https://ror.org/038jp4m40","country_code":"GR","type":"facility","lineage":["https://openalex.org/I203474044"]}],"countries":["GR"],"is_corresponding":false,"raw_author_name":"Dimitra Pappa","raw_affiliation_strings":["Institute of Informatics and Telecommunications, NCSR Demokritos, Ag. Paraskevi, 153 41 Athens, Greece"],"affiliations":[{"raw_affiliation_string":"Institute of Informatics and Telecommunications, NCSR Demokritos, Ag. Paraskevi, 153 41 Athens, Greece","institution_ids":["https://openalex.org/I203474044","https://openalex.org/I4387152169"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5081192827","display_name":"Quentin Loisel","orcid":"https://orcid.org/0000-0003-0287-3908"},"institutions":[{"id":"https://openalex.org/I195939026","display_name":"Glasgow Caledonian University","ror":"https://ror.org/03dvm1235","country_code":"GB","type":"education","lineage":["https://openalex.org/I195939026"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Quentin Loisel","raw_affiliation_strings":["School of Health and Life Sciences, Glasgow Caledonian University, Cowcaddens Rd., Glasgow G4 0BA, UK"],"affiliations":[{"raw_affiliation_string":"School of Health and Life Sciences, Glasgow Caledonian University, Cowcaddens Rd., Glasgow G4 0BA, UK","institution_ids":["https://openalex.org/I195939026"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5015840397","display_name":"S\u00e9bastien Chastin","orcid":"https://orcid.org/0000-0003-1421-9348"},"institutions":[{"id":"https://openalex.org/I195939026","display_name":"Glasgow Caledonian University","ror":"https://ror.org/03dvm1235","country_code":"GB","type":"education","lineage":["https://openalex.org/I195939026"]},{"id":"https://openalex.org/I32597200","display_name":"Ghent University","ror":"https://ror.org/00cv9y106","country_code":"BE","type":"education","lineage":["https://openalex.org/I32597200"]}],"countries":["BE","GB"],"is_corresponding":false,"raw_author_name":"Sebastien Chastin","raw_affiliation_strings":["Department of Movement and Sports Science, Ghent University, BE-9000 Ghent, Belgium","School of Health and Life Sciences, Glasgow Caledonian University, Cowcaddens Rd., Glasgow G4 0BA, UK"],"affiliations":[{"raw_affiliation_string":"Department of Movement and Sports Science, Ghent University, BE-9000 Ghent, Belgium","institution_ids":["https://openalex.org/I32597200"]},{"raw_affiliation_string":"School of Health and Life Sciences, Glasgow Caledonian University, Cowcaddens Rd., Glasgow G4 0BA, UK","institution_ids":["https://openalex.org/I195939026"]}]}],"institutions":[],"countries_distinct_count":3,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5073623094"],"corresponding_institution_ids":["https://openalex.org/I154757721","https://openalex.org/I203474044","https://openalex.org/I4387152169"],"apc_list":{"value":1600,"currency":"CHF","value_usd":1732},"apc_paid":{"value":1600,"currency":"CHF","value_usd":1732},"fwci":13.7265,"has_fulltext":true,"cited_by_count":5,"citation_normalized_percentile":{"value":0.98449229,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":96,"max":99},"biblio":{"volume":"14","issue":"5","first_page":"172","last_page":"172"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.8902000188827515,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.8902000188827515,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.8115000128746033,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/domain-adaptation","display_name":"Domain adaptation","score":0.7498410940170288},{"id":"https://openalex.org/keywords/adaptation","display_name":"Adaptation (eye)","score":0.7351723313331604},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6174610257148743},{"id":"https://openalex.org/keywords/domain","display_name":"Domain (mathematical analysis)","score":0.5882870554924011},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.4386020302772522},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.3569169342517853},{"id":"https://openalex.org/keywords/psychology","display_name":"Psychology","score":0.176355242729187},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.0613553524017334},{"id":"https://openalex.org/keywords/neuroscience","display_name":"Neuroscience","score":0.04728272557258606}],"concepts":[{"id":"https://openalex.org/C2776434776","wikidata":"https://www.wikidata.org/wiki/Q19246213","display_name":"Domain adaptation","level":3,"score":0.7498410940170288},{"id":"https://openalex.org/C139807058","wikidata":"https://www.wikidata.org/wiki/Q352374","display_name":"Adaptation (eye)","level":2,"score":0.7351723313331604},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6174610257148743},{"id":"https://openalex.org/C36503486","wikidata":"https://www.wikidata.org/wiki/Q11235244","display_name":"Domain (mathematical analysis)","level":2,"score":0.5882870554924011},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.4386020302772522},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3569169342517853},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.176355242729187},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.0613553524017334},{"id":"https://openalex.org/C169760540","wikidata":"https://www.wikidata.org/wiki/Q207011","display_name":"Neuroscience","level":1,"score":0.04728272557258606},{"id":"https://openalex.org/C95623464","wikidata":"https://www.wikidata.org/wiki/Q1096149","display_name":"Classifier (UML)","level":2,"score":0.0},{"id":"https://openalex.org/C134306372","wikidata":"https://www.wikidata.org/wiki/Q7754","display_name":"Mathematical analysis","level":1,"score":0.0}],"mesh":[],"locations_count":4,"locations":[{"id":"doi:10.3390/computers14050172","is_oa":true,"landing_page_url":"https://doi.org/10.3390/computers14050172","pdf_url":"https://www.mdpi.com/2073-431X/14/5/172/pdf?version=1746170490","source":{"id":"https://openalex.org/S4210228075","display_name":"Computers","issn_l":"2073-431X","issn":["2073-431X"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310310987","host_organization_name":"Multidisciplinary Digital Publishing Institute","host_organization_lineage":["https://openalex.org/P4310310987"],"host_organization_lineage_names":["Multidisciplinary Digital Publishing Institute"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Computers","raw_type":"journal-article"},{"id":"pmh:oai:archive.ugent.be:01KHV661BP3JNET4M5PKR58R5B","is_oa":true,"landing_page_url":"https://biblio.ugent.be/publication/01KHV661BP3JNET4M5PKR58R5B","pdf_url":null,"source":{"id":"https://openalex.org/S4306400478","display_name":"Ghent University Academic Bibliography (Ghent University)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I32597200","host_organization_name":"Ghent University","host_organization_lineage":["https://openalex.org/I32597200"],"host_organization_lineage_names":[],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ISSN: 2073-431X","raw_type":"info:eu-repo/semantics/publishedVersion"},{"id":"pmh:oai:researchonline.gcu.ac.uk:openaire/d4a670a4-f8b5-4dd8-8a58-b696efb033b5","is_oa":true,"landing_page_url":"https://researchonline.gcu.ac.uk/en/publications/d4a670a4-f8b5-4dd8-8a58-b696efb033b5","pdf_url":null,"source":{"id":"https://openalex.org/S4306402566","display_name":"ResearchOnline (Glasgow Caledonian University)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I195939026","host_organization_name":"Glasgow Caledonian University","host_organization_lineage":["https://openalex.org/I195939026"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"acceptedVersion","is_accepted":true,"is_published":false,"raw_source_name":"Balaskas, G, Papadopoulos, H, Pappa, D, Loisel, Q & Chastin, S 2025, 'A framework for domain-specific dataset creation and adaptation of large language models', Computers, vol. 14, no. 5, 172. https://doi.org/10.3390/computers14050172","raw_type":"info:eu-repo/semantics/publishedVersion"},{"id":"pmh:oai:doaj.org/article:59ccc0116c89423ab69607b977727dcf","is_oa":true,"landing_page_url":"https://doaj.org/article/59ccc0116c89423ab69607b977727dcf","pdf_url":null,"source":{"id":"https://openalex.org/S112646816","display_name":"SHILAP Revista de lepidopterolog\u00eda","issn_l":"0300-5267","issn":["0300-5267","2340-4078"],"is_oa":true,"is_in_doaj":true,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"Computers, Vol 14, Iss 5, p 172 (2025)","raw_type":"article"}],"best_oa_location":{"id":"doi:10.3390/computers14050172","is_oa":true,"landing_page_url":"https://doi.org/10.3390/computers14050172","pdf_url":"https://www.mdpi.com/2073-431X/14/5/172/pdf?version=1746170490","source":{"id":"https://openalex.org/S4210228075","display_name":"Computers","issn_l":"2073-431X","issn":["2073-431X"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310310987","host_organization_name":"Multidisciplinary Digital Publishing Institute","host_organization_lineage":["https://openalex.org/P4310310987"],"host_organization_lineage_names":["Multidisciplinary Digital Publishing Institute"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Computers","raw_type":"journal-article"},"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G1974542962","display_name":null,"funder_award_id":"Sk\u0142odowska","funder_id":"https://openalex.org/F4320320300","funder_display_name":"European Commission"},{"id":"https://openalex.org/G2689612763","display_name":null,"funder_award_id":"Marie","funder_id":"https://openalex.org/F4320320300","funder_display_name":"European Commission"},{"id":"https://openalex.org/G825337453","display_name":null,"funder_award_id":"956501","funder_id":"https://openalex.org/F4320320300","funder_display_name":"European Commission"},{"id":"https://openalex.org/G8318064016","display_name":null,"funder_award_id":"Horizon","funder_id":"https://openalex.org/F4320320300","funder_display_name":"European Commission"}],"funders":[{"id":"https://openalex.org/F4320320300","display_name":"European Commission","ror":"https://ror.org/00k4n6c32"},{"id":"https://openalex.org/F4320335551","display_name":"Erasmus+","ror":null}],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4410038795.pdf","grobid_xml":"https://content.openalex.org/works/W4410038795.grobid-xml"},"referenced_works_count":17,"referenced_works":["https://openalex.org/W2052312648","https://openalex.org/W2101105183","https://openalex.org/W2979826702","https://openalex.org/W3201174429","https://openalex.org/W4317898419","https://openalex.org/W4378906446","https://openalex.org/W4386566496","https://openalex.org/W4387321091","https://openalex.org/W4387642400","https://openalex.org/W4388196756","https://openalex.org/W4392748121","https://openalex.org/W4401042689","https://openalex.org/W4401042993","https://openalex.org/W6682631176","https://openalex.org/W6777615688","https://openalex.org/W6810738896","https://openalex.org/W6852418670"],"related_works":["https://openalex.org/W4394775207","https://openalex.org/W4389474468","https://openalex.org/W3204019825","https://openalex.org/W4300172004","https://openalex.org/W3203792196","https://openalex.org/W4321649381","https://openalex.org/W2997645659","https://openalex.org/W3180787869","https://openalex.org/W2955455867","https://openalex.org/W4295929828"],"abstract_inverted_index":{"This":[0,41],"paper":[1],"introduces":[2],"a":[3,28,132,166],"novel":[4],"framework":[5,26,112,141,164,179],"for":[6,46,137,175,187],"addressing":[7],"domain":[8,185],"adaptation":[9,158,186],"challenges":[10],"in":[11,62,170],"large":[12],"language":[13],"models":[14,72,86,145],"(LLMs),":[15],"emphasising":[16],"privacy-preserving":[17],"synthetic":[18],"data":[19,52],"generation":[20],"and":[21,37,67,104,119,134,146,152,189],"efficient":[22],"fine-tuning.":[23],"The":[24,140,178],"proposed":[25],"employs":[27],"multi-stage":[29],"approach":[30],"that":[31,71],"includes":[32],"document":[33],"ingestion,":[34],"relevance":[35],"assessment,":[36],"automated":[38],"dataset":[39],"creation.":[40],"process":[42],"reduces":[43],"the":[44,56,111,171,182,197],"need":[45],"extensive":[47,161,203],"technical":[48,206],"expertise":[49],"while":[50],"safeguarding":[51],"privacy.":[53],"We":[54],"evaluate":[55],"framework\u2019s":[57],"performance":[58],"on":[59],"domain-specific":[60,157],"tasks":[61],"fields":[63],"such":[64,123],"as":[65,98,124],"biobanking":[66],"public":[68],"health,":[69],"demonstrating":[70],"fine-tuned":[73],"using":[74],"our":[75],"method":[76],"achieve":[77],"results":[78],"comparable":[79],"to":[80,95,184,195],"larger":[81],"proprietary":[82],"models.":[83],"Crucially,":[84],"these":[85],"maintain":[87],"their":[88],"general":[89],"instruction-following":[90],"capabilities,":[91],"even":[92],"when":[93],"adapted":[94],"specialised":[96,176],"domains,":[97],"shown":[99],"through":[100],"experiments":[101],"with":[102],"7B":[103],"8B":[105],"parameter":[106],"LLMs.":[107,139],"Key":[108],"components":[109],"of":[110,173,199],"include":[113],"continuous":[114],"pre-training,":[115],"supervised":[116],"fine-tuning":[117],"(SFT),":[118],"reinforcement":[120],"learning":[121],"methods":[122],"direct":[125],"preference":[126],"optimisation":[127],"(DPO),":[128],"which":[129],"together":[130],"provide":[131],"flexible":[133],"configurable":[135],"solution":[136],"deploying":[138],"supports":[142],"both":[143],"local":[144],"API-based":[147],"solutions,":[148],"making":[149],"it":[150],"scalable":[151],"accessible.":[153],"By":[154],"enabling":[155,193],"privacy-preserving,":[156],"without":[159,201],"requiring":[160,202],"expertise,":[162],"this":[163],"represents":[165],"significant":[167],"step":[168],"forward":[169],"deployment":[172],"LLMs":[174,200],"applications.":[177],"significantly":[180],"lowers":[181],"barrier":[183],"small-":[188],"medium-sized":[190],"enterprises":[191],"(SMEs),":[192],"them":[194],"utilise":[196],"power":[198],"resources":[204],"or":[205],"expertise.":[207]},"counts_by_year":[{"year":2026,"cited_by_count":2},{"year":2025,"cited_by_count":3}],"updated_date":"2026-03-12T08:34:05.389933","created_date":"2025-10-10T00:00:00"}
