{"id":"https://openalex.org/W7140655213","doi":"https://doi.org/10.48550/arxiv.2603.23515","title":"Training a Large Language Model for Medical Coding Using Privacy-Preserving Synthetic Clinical Data","display_name":"Training a Large Language Model for Medical Coding Using Privacy-Preserving Synthetic Clinical Data","publication_year":2026,"publication_date":"2026-03-06","ids":{"openalex":"https://openalex.org/W7140655213","doi":"https://doi.org/10.48550/arxiv.2603.23515"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.23515","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.23515","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.23515","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5046386204","display_name":"John P. Cook","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cook, John","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130689504","display_name":"Michael Wyatt","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wyatt, Michael","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130679739","display_name":"Peng Wei","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wei, Peng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130641285","display_name":"Iris Chin","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chin, Iris","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130634830","display_name":"Santosh Gupta","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gupta, Santosh","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5023332009","display_name":"Van Zyl van Vuuren","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Van Vuuren, Van Zyl","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5066092150","display_name":"Richie Siburian","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Siburian, Richie","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5002756321","display_name":"Amanda Spicer","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Spicer, Amanda","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130663649","display_name":"Kristen Viviano","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Viviano, Kristen","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130638002","display_name":"Alda Cami","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cami, Alda","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5107809194","display_name":"Raunaq Malhotra","orcid":"https://orcid.org/0000-0002-7253-850X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Malhotra, Raunaq","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5125681225","display_name":"Zhewei Yao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yao, Zhewei","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5037534081","display_name":"Jeff Rasley","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Rasley, Jeff","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5121028134","display_name":"Gaurav Kaushik","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kaushik, Gaurav","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":14,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T13702","display_name":"Machine Learning in Healthcare","score":0.6597999930381775,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T13702","display_name":"Machine Learning in Healthcare","score":0.6597999930381775,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T14400","display_name":"Medical Coding and Health Information","score":0.23749999701976776,"subfield":{"id":"https://openalex.org/subfields/3605","display_name":"Health Information Management"},"field":{"id":"https://openalex.org/fields/36","display_name":"Health Professions"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}},{"id":"https://openalex.org/T11636","display_name":"Artificial Intelligence in Healthcare and Education","score":0.022600000724196434,"subfield":{"id":"https://openalex.org/subfields/2718","display_name":"Health Informatics"},"field":{"id":"https://openalex.org/fields/27","display_name":"Medicine"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/coding","display_name":"Coding (social sciences)","score":0.6614999771118164},{"id":"https://openalex.org/keywords/medical-classification","display_name":"Medical classification","score":0.6187000274658203},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.5069000124931335},{"id":"https://openalex.org/keywords/documentation","display_name":"Documentation","score":0.46389999985694885},{"id":"https://openalex.org/keywords/diagnosis-code","display_name":"Diagnosis code","score":0.4311000108718872},{"id":"https://openalex.org/keywords/comprehension","display_name":"Comprehension","score":0.39329999685287476},{"id":"https://openalex.org/keywords/training-set","display_name":"Training set","score":0.3580000102519989},{"id":"https://openalex.org/keywords/health-insurance-portability-and-accountability-act","display_name":"Health Insurance Portability and Accountability Act","score":0.3434999883174896}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6668000221252441},{"id":"https://openalex.org/C179518139","wikidata":"https://www.wikidata.org/wiki/Q5140297","display_name":"Coding (social sciences)","level":2,"score":0.6614999771118164},{"id":"https://openalex.org/C154874363","wikidata":"https://www.wikidata.org/wiki/Q3518464","display_name":"Medical classification","level":2,"score":0.6187000274658203},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.5069000124931335},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.491100013256073},{"id":"https://openalex.org/C56666940","wikidata":"https://www.wikidata.org/wiki/Q788790","display_name":"Documentation","level":2,"score":0.46389999985694885},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.45010000467300415},{"id":"https://openalex.org/C45827449","wikidata":"https://www.wikidata.org/wiki/Q5270338","display_name":"Diagnosis code","level":3,"score":0.4311000108718872},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.4235000014305115},{"id":"https://openalex.org/C511192102","wikidata":"https://www.wikidata.org/wiki/Q5156948","display_name":"Comprehension","level":2,"score":0.39329999685287476},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.3580000102519989},{"id":"https://openalex.org/C2778306010","wikidata":"https://www.wikidata.org/wiki/Q606563","display_name":"Health Insurance Portability and Accountability Act","level":3,"score":0.3434999883174896},{"id":"https://openalex.org/C160920958","wikidata":"https://www.wikidata.org/wiki/Q7662746","display_name":"Synthetic data","level":2,"score":0.33959999680519104},{"id":"https://openalex.org/C195910791","wikidata":"https://www.wikidata.org/wiki/Q1324077","display_name":"Medical record","level":2,"score":0.33410000801086426},{"id":"https://openalex.org/C67186912","wikidata":"https://www.wikidata.org/wiki/Q367664","display_name":"Data modeling","level":2,"score":0.33180001378059387},{"id":"https://openalex.org/C534262118","wikidata":"https://www.wikidata.org/wiki/Q177719","display_name":"Medical diagnosis","level":2,"score":0.30880001187324524},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.2994999885559082},{"id":"https://openalex.org/C148524875","wikidata":"https://www.wikidata.org/wiki/Q6975395","display_name":"F1 score","level":2,"score":0.27149999141693115},{"id":"https://openalex.org/C43126263","wikidata":"https://www.wikidata.org/wiki/Q128751","display_name":"Source code","level":2,"score":0.26660001277923584},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.26510000228881836}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.23515","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.23515","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.23515","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.23515","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Improving":[0],"the":[1,26,138,154,193],"accuracy":[2],"and":[3,11,30,46,73,116,123,132,183,189,192,238],"reliability":[4],"of":[5,28,113,145],"medical":[6,60,71,94,199,220],"coding":[7,44,61,72,75,95,124,221,235],"reduces":[8],"clinician":[9],"burnout":[10],"supports":[12],"revenue":[13],"cycle":[14],"processes,":[15],"freeing":[16],"providers":[17],"to":[18,40,55,217],"focus":[19],"more":[20],"on":[21,111,153,174,198,240],"patient":[22],"care.":[23],"However,":[24,63],"automating":[25],"assignment":[27],"ICD-10-CM":[29,131],"CPT":[31],"codes":[32,118],"from":[33,103,120],"clinical":[34,114,181],"documentation":[35],"remains":[36],"a":[37,83,162,212,230],"challenge":[38],"due":[39],"heterogeneous":[41],"records,":[42],"nuanced":[43],"guidelines,":[45],"long-tail":[47],"distributions.":[48],"Large":[49],"language":[50,215],"models":[51,65],"have":[52],"been":[53],"proposed":[54],"help":[56],"or":[57],"automate":[58],"specific":[59,241],"tasks.":[62,201],"foundation":[64,86],"are":[66],"not":[67],"explicitly":[68],"trained":[69],"for":[70,91,130,147,233],"zero-shot":[74,135],"has":[76],"yielded":[77],"poor":[78],"results.":[79],"We":[80,107],"investigate":[81],"whether":[82],"modern":[84],"open-weight":[85],"model":[87,140,194,216],"can":[88,209],"be":[89],"adapted":[90],"an":[92,142],"expert-level":[93],"task":[96],"using":[97],"privacy-preserving":[98],"synthetic":[99,155],"training":[100,234],"data":[101,208],"derived":[102],"electronic":[104],"health":[105,225],"records.":[106],"fine-tune":[108],"Llama":[109],"3-70B":[110],"pairs":[112],"notes":[115],"gold":[117],"generated":[119],"EHR-grounded":[121],"templates":[122],"policies,":[125],"then":[126],"evaluate":[127],"exact-code":[128],"prediction":[129],"CPT.":[133],"A":[134],"baseline":[136],"with":[137],"unadapted":[139],"achieved":[141],"F1":[143,158],"score":[144],"0.18":[146],"exact":[148],"code":[149,168,184],"match.":[150],"After":[151],"fine-tuning":[152],"corpus,":[156],"exact-match":[157],"exceeded":[159],"0.70,":[160],"representing":[161],"large":[163,214],"absolute":[164],"gain":[165],"across":[166],"both":[167],"systems.":[169],"Notably,":[170],"performance":[171,197],"remained":[172],"high":[173],"complex":[175],"categories":[176],"that":[177,205,243],"often":[178],"require":[179],"multi-step":[180],"reasoning":[182],"composition,":[185],"including":[186],"Advanced":[187],"Illness":[188],"Frailty":[190],"classes,":[191],"retained":[195],"its":[196],"comprehension":[200],"These":[202],"results":[203],"indicate":[204],"synthetic,":[206],"policy-aware":[207],"efficiently":[210],"teach":[211],"general-purpose":[213],"support":[218],"precise":[219],"without":[222],"exposing":[223],"protected":[224],"information.":[226],"The":[227],"approach":[228],"offers":[229],"practical":[231],"path":[232],"agents":[236],"safely":[237],"iteratively":[239],"tasks":[242],"represent":[244],"real-world":[245],"populations.":[246]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-03-27T00:00:00"}
