{"id":"https://openalex.org/W7138929010","doi":"https://doi.org/10.48550/arxiv.2603.15644","title":"Tokenization Tradeoffs in Structured EHR Foundation Models","display_name":"Tokenization Tradeoffs in Structured EHR Foundation Models","publication_year":2026,"publication_date":"2026-03-03","ids":{"openalex":"https://openalex.org/W7138929010","doi":"https://doi.org/10.48550/arxiv.2603.15644"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.15644","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.15644","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.15644","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5079049199","display_name":"Lin Lawrence Guo","orcid":"https://orcid.org/0000-0002-2965-8442"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Guo, Lin Lawrence","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129995427","display_name":"Santiago Eduardo Arciniegas","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Arciniegas, Santiago Eduardo","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129774290","display_name":"Joseph Jihyung Lee","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lee, Joseph Jihyung","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5009469008","display_name":"Adam P. Yan","orcid":"https://orcid.org/0000-0001-8300-3095"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yan, Adam Paul","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5050932952","display_name":"George Tomlinson","orcid":"https://orcid.org/0000-0002-9328-6399"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tomlinson, George","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130062033","display_name":"Jason Fries","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Fries, Jason","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5129852107","display_name":"Lillian Sung","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sung, Lillian","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5079049199"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T13702","display_name":"Machine Learning in Healthcare","score":0.9503999948501587,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T13702","display_name":"Machine Learning in Healthcare","score":0.9503999948501587,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10350","display_name":"Electronic Health Records Systems","score":0.01899999938905239,"subfield":{"id":"https://openalex.org/subfields/3605","display_name":"Health Information Management"},"field":{"id":"https://openalex.org/fields/36","display_name":"Health Professions"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}},{"id":"https://openalex.org/T11642","display_name":"Genomics and Rare Diseases","score":0.004399999976158142,"subfield":{"id":"https://openalex.org/subfields/1311","display_name":"Genetics"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/lexical-analysis","display_name":"Lexical analysis","score":0.7322999835014343},{"id":"https://openalex.org/keywords/workflow","display_name":"Workflow","score":0.6082000136375427},{"id":"https://openalex.org/keywords/encoding","display_name":"Encoding (memory)","score":0.558899998664856},{"id":"https://openalex.org/keywords/event","display_name":"Event (particle physics)","score":0.524399995803833},{"id":"https://openalex.org/keywords/timeline","display_name":"Timeline","score":0.5232999920845032},{"id":"https://openalex.org/keywords/vocabulary","display_name":"Vocabulary","score":0.4943000078201294},{"id":"https://openalex.org/keywords/encode","display_name":"ENCODE","score":0.4593000113964081}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7836999893188477},{"id":"https://openalex.org/C176982825","wikidata":"https://www.wikidata.org/wiki/Q835922","display_name":"Lexical analysis","level":2,"score":0.7322999835014343},{"id":"https://openalex.org/C177212765","wikidata":"https://www.wikidata.org/wiki/Q627335","display_name":"Workflow","level":2,"score":0.6082000136375427},{"id":"https://openalex.org/C125411270","wikidata":"https://www.wikidata.org/wiki/Q18653","display_name":"Encoding (memory)","level":2,"score":0.558899998664856},{"id":"https://openalex.org/C2779662365","wikidata":"https://www.wikidata.org/wiki/Q5416694","display_name":"Event (particle physics)","level":2,"score":0.524399995803833},{"id":"https://openalex.org/C4438859","wikidata":"https://www.wikidata.org/wiki/Q186117","display_name":"Timeline","level":2,"score":0.5232999920845032},{"id":"https://openalex.org/C2777601683","wikidata":"https://www.wikidata.org/wiki/Q6499736","display_name":"Vocabulary","level":2,"score":0.4943000078201294},{"id":"https://openalex.org/C66746571","wikidata":"https://www.wikidata.org/wiki/Q1134833","display_name":"ENCODE","level":3,"score":0.4593000113964081},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4584999978542328},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.4120999872684479},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.396699994802475},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3718000054359436},{"id":"https://openalex.org/C100279451","wikidata":"https://www.wikidata.org/wiki/Q372193","display_name":"Perplexity","level":3,"score":0.33489999175071716},{"id":"https://openalex.org/C2778355071","wikidata":"https://www.wikidata.org/wiki/Q1933849","display_name":"Microdata (statistics)","level":4,"score":0.30970001220703125},{"id":"https://openalex.org/C60048249","wikidata":"https://www.wikidata.org/wiki/Q37437","display_name":"Syntax","level":2,"score":0.30309998989105225},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.29820001125335693},{"id":"https://openalex.org/C147494362","wikidata":"https://www.wikidata.org/wiki/Q2078905","display_name":"Troubleshooting","level":2,"score":0.2971000075340271},{"id":"https://openalex.org/C147203929","wikidata":"https://www.wikidata.org/wiki/Q574814","display_name":"Discrete event simulation","level":2,"score":0.28139999508857727},{"id":"https://openalex.org/C5274069","wikidata":"https://www.wikidata.org/wiki/Q2285707","display_name":"Categorical variable","level":2,"score":0.2630000114440918}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.15644","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.15644","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.15644","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.15644","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Foundation":[0],"models":[1],"for":[2,191],"structured":[3],"electronic":[4],"health":[5],"records":[6],"(EHRs)":[7],"are":[8,27,139],"pretrained":[9,70],"on":[10,59,73,160],"longitudinal":[11],"sequences":[12],"of":[13,55,198],"timestamped":[14],"clinical":[15,97],"events":[16],"to":[17,131,154],"learn":[18,153],"adaptable":[19],"patient":[20],"representations.":[21],"Tokenization":[22],"--":[23,33],"how":[24,39],"these":[25],"timelines":[26],"converted":[28],"into":[29,141],"discrete":[30],"model":[31,151],"inputs":[32],"determines":[34],"what":[35],"information":[36],"is":[37,42],"preserved,":[38],"efficiently":[40],"it":[41],"encoded,":[43],"and":[44,62,88,103,111,117,178,196],"which":[45],"relationships":[46],"must":[47,152],"be":[48],"learned":[49],"versus":[50],"precomputed.":[51],"Yet":[52],"the":[53,127,150,194],"impact":[54],"tokenization":[56,82,186],"design":[57],"choices":[58],"downstream":[60],"performance":[61,195],"computational":[63],"efficiency":[64,197],"remains":[65],"largely":[66],"unexplored.":[67],"Here,":[68],"we":[69],"a":[71,78,188],"transformer":[72],"pediatric":[74],"EHR":[75,199],"data":[76],"under":[77],"factorial":[79],"design,":[80],"varying":[81],"along":[83],"event":[84,101],"encoding,":[85,87],"time":[86,105],"workflow":[89,179],"annotation.":[90],"We":[91],"evaluated":[92],"area-under-the-receiver-operating-characteristic":[93],"curve":[94],"across":[95,147],"74":[96],"prediction":[98],"tasks.":[99],"Joint":[100],"encoding":[102,106,129],"positional":[104],"outperformed":[107],"their":[108],"alternatives":[109],"(73/74":[110],"71/74":[112],"tasks)":[113],"while":[114,176],"requiring":[115],"39.5%":[116],"9.6%":[118],"fewer":[119],"pretraining":[120],"floating-point":[121],"operations,":[122],"respectively.":[123],"Targeted":[124],"ablations":[125],"traced":[126],"joint":[128],"advantage":[130,170],"local":[132],"binding":[133],"efficiency,":[134],"that":[135,149,168],"is,":[136],"code-attribute":[137],"pairs":[138],"combined":[140],"single":[142],"tokens,":[143],"rather":[144],"than":[145],"split":[146],"tokens":[148],"associate":[155],"during":[156],"pretraining.":[157],"External":[158],"evaluation":[159],"an":[161],"adult":[162],"intensive":[163],"care":[164],"unit":[165],"cohort":[166],"demonstrated":[167],"this":[169],"generalizes":[171],"despite":[172],"substantial":[173],"vocabulary":[174],"mismatch,":[175],"temporal":[177],"effects":[180],"remain":[181],"institution-specific.":[182],"These":[183],"results":[184],"establish":[185],"as":[187],"tractable":[189],"lever":[190],"improving":[192],"both":[193],"foundation":[200],"models.":[201]},"counts_by_year":[],"updated_date":"2026-05-01T08:36:08.643496","created_date":"2026-03-20T00:00:00"}
