{"id":"https://openalex.org/W7133319876","doi":"https://doi.org/10.48550/arxiv.2603.00221","title":"A medical coding language model trained on clinical narratives from a population-wide cohort of 1.8 million patients","display_name":"A medical coding language model trained on clinical narratives from a population-wide cohort of 1.8 million patients","publication_year":2026,"publication_date":"2026-02-27","ids":{"openalex":"https://openalex.org/W7133319876","doi":"https://doi.org/10.48550/arxiv.2603.00221"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.00221","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.00221","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.00221","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5061228375","display_name":"Joakim Edin","orcid":"https://orcid.org/0000-0003-1005-8276"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Edin, Joakim","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5013665148","display_name":"Sedrah B. Balaganeshan","orcid":"https://orcid.org/0000-0002-0275-7440"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Balaganeshan, Sedrah Butt","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5127956952","display_name":"Annike Kj\u00f8lby Kristensen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kristensen, Annike Kj\u00f8lby","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5127923795","display_name":"Lars Maal\u00f8e","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Maal\u00f8e, Lars","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5092012744","display_name":"Ioannis Louloudis","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Louloudis, Ioannis","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5124047323","display_name":"S\u00f8ren Brunak","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Brunak, S\u00f8ren","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5061228375"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T14400","display_name":"Medical Coding and Health Information","score":0.6491000056266785,"subfield":{"id":"https://openalex.org/subfields/3605","display_name":"Health Information Management"},"field":{"id":"https://openalex.org/fields/36","display_name":"Health Professions"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}},"topics":[{"id":"https://openalex.org/T14400","display_name":"Medical Coding and Health Information","score":0.6491000056266785,"subfield":{"id":"https://openalex.org/subfields/3605","display_name":"Health Information Management"},"field":{"id":"https://openalex.org/fields/36","display_name":"Health Professions"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}},{"id":"https://openalex.org/T13702","display_name":"Machine Learning in Healthcare","score":0.21850000321865082,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12246","display_name":"Chronic Disease Management Strategies","score":0.06159999966621399,"subfield":{"id":"https://openalex.org/subfields/2713","display_name":"Epidemiology"},"field":{"id":"https://openalex.org/fields/27","display_name":"Medicine"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/medical-diagnosis","display_name":"Medical diagnosis","score":0.7397000193595886},{"id":"https://openalex.org/keywords/coding","display_name":"Coding (social sciences)","score":0.6480000019073486},{"id":"https://openalex.org/keywords/medical-classification","display_name":"Medical classification","score":0.6358000040054321},{"id":"https://openalex.org/keywords/diagnosis-code","display_name":"Diagnosis code","score":0.6319000124931335},{"id":"https://openalex.org/keywords/reimbursement","display_name":"Reimbursement","score":0.5814999938011169},{"id":"https://openalex.org/keywords/cohort","display_name":"Cohort","score":0.5771999955177307},{"id":"https://openalex.org/keywords/medical-record","display_name":"Medical record","score":0.5759999752044678},{"id":"https://openalex.org/keywords/specialty","display_name":"Specialty","score":0.5579000115394592},{"id":"https://openalex.org/keywords/documentation","display_name":"Documentation","score":0.5415999889373779},{"id":"https://openalex.org/keywords/public-health","display_name":"Public health","score":0.4781999886035919}],"concepts":[{"id":"https://openalex.org/C534262118","wikidata":"https://www.wikidata.org/wiki/Q177719","display_name":"Medical diagnosis","level":2,"score":0.7397000193595886},{"id":"https://openalex.org/C179518139","wikidata":"https://www.wikidata.org/wiki/Q5140297","display_name":"Coding (social sciences)","level":2,"score":0.6480000019073486},{"id":"https://openalex.org/C154874363","wikidata":"https://www.wikidata.org/wiki/Q3518464","display_name":"Medical classification","level":2,"score":0.6358000040054321},{"id":"https://openalex.org/C45827449","wikidata":"https://www.wikidata.org/wiki/Q5270338","display_name":"Diagnosis code","level":3,"score":0.6319000124931335},{"id":"https://openalex.org/C2779703844","wikidata":"https://www.wikidata.org/wiki/Q760125","display_name":"Reimbursement","level":3,"score":0.5814999938011169},{"id":"https://openalex.org/C72563966","wikidata":"https://www.wikidata.org/wiki/Q1303415","display_name":"Cohort","level":2,"score":0.5771999955177307},{"id":"https://openalex.org/C195910791","wikidata":"https://www.wikidata.org/wiki/Q1324077","display_name":"Medical record","level":2,"score":0.5759999752044678},{"id":"https://openalex.org/C20387591","wikidata":"https://www.wikidata.org/wiki/Q930752","display_name":"Specialty","level":2,"score":0.5579000115394592},{"id":"https://openalex.org/C56666940","wikidata":"https://www.wikidata.org/wiki/Q788790","display_name":"Documentation","level":2,"score":0.5415999889373779},{"id":"https://openalex.org/C138816342","wikidata":"https://www.wikidata.org/wiki/Q189603","display_name":"Public health","level":2,"score":0.4781999886035919},{"id":"https://openalex.org/C100660578","wikidata":"https://www.wikidata.org/wiki/Q18733","display_name":"Recall","level":2,"score":0.46549999713897705},{"id":"https://openalex.org/C71924100","wikidata":"https://www.wikidata.org/wiki/Q11190","display_name":"Medicine","level":0,"score":0.44589999318122864},{"id":"https://openalex.org/C160735492","wikidata":"https://www.wikidata.org/wiki/Q31207","display_name":"Health care","level":2,"score":0.384799987077713},{"id":"https://openalex.org/C509550671","wikidata":"https://www.wikidata.org/wiki/Q126945","display_name":"Medical education","level":1,"score":0.38359999656677246},{"id":"https://openalex.org/C199033989","wikidata":"https://www.wikidata.org/wiki/Q1318295","display_name":"Narrative","level":2,"score":0.3630000054836273},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.3549000024795532},{"id":"https://openalex.org/C107130276","wikidata":"https://www.wikidata.org/wiki/Q133805","display_name":"Epidemiology","level":2,"score":0.35350000858306885},{"id":"https://openalex.org/C512399662","wikidata":"https://www.wikidata.org/wiki/Q3505712","display_name":"Family medicine","level":1,"score":0.3531000018119812},{"id":"https://openalex.org/C2780932742","wikidata":"https://www.wikidata.org/wiki/Q4915239","display_name":"Biorepository","level":3,"score":0.34599998593330383},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3287999927997589},{"id":"https://openalex.org/C201903717","wikidata":"https://www.wikidata.org/wiki/Q1778788","display_name":"Cohort study","level":2,"score":0.3276999890804291},{"id":"https://openalex.org/C2778143727","wikidata":"https://www.wikidata.org/wiki/Q1820650","display_name":"Readability","level":2,"score":0.32580000162124634},{"id":"https://openalex.org/C206497026","wikidata":"https://www.wikidata.org/wiki/Q1753883","display_name":"SNOMED CT","level":3,"score":0.3237999975681305},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3224000036716461},{"id":"https://openalex.org/C3020144179","wikidata":"https://www.wikidata.org/wiki/Q10871684","display_name":"Electronic health record","level":3,"score":0.3203999996185303},{"id":"https://openalex.org/C3019952477","wikidata":"https://www.wikidata.org/wiki/Q1324077","display_name":"Health records","level":3,"score":0.3165000081062317},{"id":"https://openalex.org/C2779473830","wikidata":"https://www.wikidata.org/wiki/Q1540899","display_name":"MEDLINE","level":2,"score":0.3118000030517578},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.30660000443458557},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.2971999943256378},{"id":"https://openalex.org/C2778306010","wikidata":"https://www.wikidata.org/wiki/Q606563","display_name":"Health Insurance Portability and Accountability Act","level":3,"score":0.295199990272522},{"id":"https://openalex.org/C145642194","wikidata":"https://www.wikidata.org/wiki/Q870895","display_name":"Health informatics","level":3,"score":0.2937999963760376},{"id":"https://openalex.org/C115901376","wikidata":"https://www.wikidata.org/wiki/Q184199","display_name":"Automation","level":2,"score":0.28690001368522644},{"id":"https://openalex.org/C2776361769","wikidata":"https://www.wikidata.org/wiki/Q821775","display_name":"Public health surveillance","level":3,"score":0.27390000224113464},{"id":"https://openalex.org/C545542383","wikidata":"https://www.wikidata.org/wiki/Q2751242","display_name":"Medical emergency","level":1,"score":0.2703000009059906},{"id":"https://openalex.org/C3018060332","wikidata":"https://www.wikidata.org/wiki/Q10871684","display_name":"Electronic medical record","level":2,"score":0.259799987077713},{"id":"https://openalex.org/C148524875","wikidata":"https://www.wikidata.org/wiki/Q6975395","display_name":"F1 score","level":2,"score":0.25839999318122864},{"id":"https://openalex.org/C2987496018","wikidata":"https://www.wikidata.org/wiki/Q1860","display_name":"English language","level":2,"score":0.2578999996185303},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.25220000743865967}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.00221","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.00221","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.00221","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.00221","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Medical":[0],"coding":[1,16,195],"translates":[2],"clinical":[3,62],"documentation":[4],"into":[5],"standardized":[6],"codes":[7,60,116],"for":[8,162,196,205],"billing,":[9],"research,":[10,164],"and":[11,19,65,81,121,168,175,201,208],"public":[12,165],"health,":[13],"but":[14],"manual":[15,137],"is":[17],"time-consuming":[18],"error-prone.":[20],"Existing":[21],"automation":[22],"efforts":[23],"rely":[24],"on":[25,39,69],"small":[26],"datasets":[27],"that":[28],"poorly":[29],"represent":[30],"real-world":[31],"patient":[32],"heterogeneity.":[33],"We":[34],"trained":[35],"a":[36,76,82,211],"language":[37],"model":[38,74,124,144,192],"5.8":[40],"million":[41,47],"electronic":[42],"health":[43,166],"records":[44],"from":[45,61],"1.8":[46],"patients":[48],"across":[49],"nearly":[50],"all":[51],"specialties":[52,97],"in":[53,96,153,178],"Eastern":[54,154],"Denmark":[55,155],"(2006--2016)":[56],"to":[57,188,214],"predict":[58],"ICD-10":[59],"notes,":[63],"medications,":[64],"laboratory":[66],"results.":[67],"Evaluated":[68],"270,000":[70],"held-out":[71],"patients,":[72],"the":[73,123],"achieved":[75],"micro":[77],"F1":[78,111],"of":[79,85,127,130,150,170,199],"71.8%":[80],"top-10":[83],"recall":[84],"95.5%.":[86],"Performance":[87],"varied":[88],"by":[89],"specialty":[90],"(F1:":[91],"53--91%),":[92],"with":[93,98,159],"higher":[94],"scores":[95],"well-defined":[99],"diagnostic":[100],"criteria.":[101],"Codes":[102],"appearing":[103],"predominantly":[104],"as":[105],"secondary":[106,151,218],"diagnoses":[107,152],"had":[108],"markedly":[109],"lower":[110],"scores.":[112],"For":[113],"three":[114],"such":[115],"(suicide-related":[117],"behaviors,":[118],"weight":[119],"disorders,":[120],"hypertension),":[122],"identified":[125],"thousands":[126],"uncoded":[128],"cases,":[129],"which":[131],"76-86%":[132],"were":[133],"confirmed":[134],"valid":[135],"upon":[136],"review,":[138],"suggesting":[139],"systematic":[140],"under-coding":[141,149],"rather":[142],"than":[143],"error.":[145],"These":[146],"findings":[147],"suggest":[148,182],"during":[156],"this":[157,183,189],"period,":[158],"potential":[160],"implications":[161],"epidemiological":[163],"surveillance,":[167],"understanding":[169],"multimorbidity.":[171],"Similar":[172],"time":[173],"constraints":[174],"reimbursement":[176],"structures":[177],"other":[179],"healthcare":[180],"systems":[181],"may":[184,209],"not":[185],"be":[186],"isolated":[187],"dataset.":[190],"The":[191],"can":[193],"automate":[194],"approximately":[197],"50%":[198],"cases":[200],"provide":[202],"accurate":[203],"suggestions":[204],"most":[206],"others,":[207],"offer":[210],"practical":[212],"solution":[213],"help":[215],"capture":[216],"missed":[217],"conditions.":[219]},"counts_by_year":[],"updated_date":"2026-03-04T07:09:34.246503","created_date":"2026-03-04T00:00:00"}
