{"id":"https://openalex.org/W7131446572","doi":"https://doi.org/10.48550/arxiv.2602.20324","title":"An artificial intelligence framework for end-to-end rare disease phenotyping from clinical notes using large language models","display_name":"An artificial intelligence framework for end-to-end rare disease phenotyping from clinical notes using large language models","publication_year":2026,"publication_date":"2026-02-23","ids":{"openalex":"https://openalex.org/W7131446572","doi":"https://doi.org/10.48550/arxiv.2602.20324"},"language":null,"primary_location":{"id":"pmh:doi:10.48550/arxiv.2602.20324","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":null,"any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5119538409","display_name":"Cathy Shyr","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Shyr, Cathy","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126825566","display_name":"Yan Hu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hu, Yan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5107290274","display_name":"Rory J Tinker","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tinker, Rory J.","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126787821","display_name":"Thomas A. Cassini","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cassini, Thomas A.","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5062044422","display_name":"Kevin Byram","orcid":"https://orcid.org/0000-0001-6441-1915"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Byram, Kevin W.","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102324607","display_name":"Rizwan Hamid","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hamid, Rizwan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5029954233","display_name":"Daniel Fabbri","orcid":"https://orcid.org/0000-0003-0530-2510"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Fabbri, Daniel V.","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5001976163","display_name":"Adam Wright","orcid":"https://orcid.org/0000-0003-1621-1622"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wright, Adam","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5090773076","display_name":"Josh F. Peterson","orcid":"https://orcid.org/0000-0002-7553-0749"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Peterson, Josh F.","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126815536","display_name":"Lisa Bastarache","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bastarache, Lisa","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5126852005","display_name":"Hua Xu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xu, Hua","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":11,"corresponding_author_ids":["https://openalex.org/A5119538409"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11642","display_name":"Genomics and Rare Diseases","score":0.8546000123023987,"subfield":{"id":"https://openalex.org/subfields/1311","display_name":"Genetics"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},"topics":[{"id":"https://openalex.org/T11642","display_name":"Genomics and Rare Diseases","score":0.8546000123023987,"subfield":{"id":"https://openalex.org/subfields/1311","display_name":"Genetics"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},{"id":"https://openalex.org/T11710","display_name":"Biomedical Text Mining and Ontologies","score":0.12020000070333481,"subfield":{"id":"https://openalex.org/subfields/1312","display_name":"Molecular Biology"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.006300000008195639,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/workflow","display_name":"Workflow","score":0.6568999886512756},{"id":"https://openalex.org/keywords/unified-medical-language-system","display_name":"Unified Medical Language System","score":0.4747999906539917},{"id":"https://openalex.org/keywords/standardization","display_name":"Standardization","score":0.420199990272522},{"id":"https://openalex.org/keywords/rare-disease","display_name":"Rare disease","score":0.41589999198913574},{"id":"https://openalex.org/keywords/ranking","display_name":"Ranking (information retrieval)","score":0.3856000006198883},{"id":"https://openalex.org/keywords/deep-learning","display_name":"Deep learning","score":0.37389999628067017},{"id":"https://openalex.org/keywords/medical-diagnosis","display_name":"Medical diagnosis","score":0.36570000648498535},{"id":"https://openalex.org/keywords/similarity","display_name":"Similarity (geometry)","score":0.35679998993873596}],"concepts":[{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.7121000289916992},{"id":"https://openalex.org/C177212765","wikidata":"https://www.wikidata.org/wiki/Q627335","display_name":"Workflow","level":2,"score":0.6568999886512756},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6187000274658203},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.5543000102043152},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.5526999831199646},{"id":"https://openalex.org/C69505689","wikidata":"https://www.wikidata.org/wiki/Q455338","display_name":"Unified Medical Language System","level":2,"score":0.4747999906539917},{"id":"https://openalex.org/C188087704","wikidata":"https://www.wikidata.org/wiki/Q369577","display_name":"Standardization","level":2,"score":0.420199990272522},{"id":"https://openalex.org/C2779701055","wikidata":"https://www.wikidata.org/wiki/Q929833","display_name":"Rare disease","level":3,"score":0.41589999198913574},{"id":"https://openalex.org/C189430467","wikidata":"https://www.wikidata.org/wiki/Q7293293","display_name":"Ranking (information retrieval)","level":2,"score":0.3856000006198883},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.37389999628067017},{"id":"https://openalex.org/C534262118","wikidata":"https://www.wikidata.org/wiki/Q177719","display_name":"Medical diagnosis","level":2,"score":0.36570000648498535},{"id":"https://openalex.org/C103278499","wikidata":"https://www.wikidata.org/wiki/Q254465","display_name":"Similarity (geometry)","level":3,"score":0.35679998993873596},{"id":"https://openalex.org/C2776436953","wikidata":"https://www.wikidata.org/wiki/Q5163215","display_name":"Consistency (knowledge bases)","level":2,"score":0.34150001406669617},{"id":"https://openalex.org/C2779134260","wikidata":"https://www.wikidata.org/wiki/Q12136","display_name":"Disease","level":2,"score":0.3206000030040741},{"id":"https://openalex.org/C9354725","wikidata":"https://www.wikidata.org/wiki/Q286017","display_name":"Operationalization","level":2,"score":0.305400013923645},{"id":"https://openalex.org/C2776321320","wikidata":"https://www.wikidata.org/wiki/Q857525","display_name":"Annotation","level":2,"score":0.30480000376701355},{"id":"https://openalex.org/C3020646490","wikidata":"https://www.wikidata.org/wiki/Q25203551","display_name":"Clinical phenotype","level":4,"score":0.3037000000476837},{"id":"https://openalex.org/C163763905","wikidata":"https://www.wikidata.org/wiki/Q17075943","display_name":"Precision medicine","level":2,"score":0.299699991941452},{"id":"https://openalex.org/C71924100","wikidata":"https://www.wikidata.org/wiki/Q11190","display_name":"Medicine","level":0,"score":0.29499998688697815},{"id":"https://openalex.org/C25810664","wikidata":"https://www.wikidata.org/wiki/Q44325","display_name":"Ontology","level":2,"score":0.29159998893737793},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.28839999437332153},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.28209999203681946},{"id":"https://openalex.org/C40993552","wikidata":"https://www.wikidata.org/wiki/Q514654","display_name":"Gold standard (test)","level":2,"score":0.2718999981880188},{"id":"https://openalex.org/C130318100","wikidata":"https://www.wikidata.org/wiki/Q2268914","display_name":"Semantic similarity","level":2,"score":0.2632000148296356},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.25360000133514404},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.25200000405311584}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:doi:10.48550/arxiv.2602.20324","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},{"id":"doi:10.48550/arxiv.2602.20324","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2602.20324","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:doi:10.48550/arxiv.2602.20324","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"sustainable_development_goals":[{"display_name":"Decent work and economic growth","score":0.5538263320922852,"id":"https://metadata.un.org/sdg/8"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Phenotyping":[0],"is":[1,16],"fundamental":[2],"to":[3,20,48,80,209],"rare":[4,68,212],"disease":[5,69,213],"diagnosis,":[6],"but":[7,32],"manual":[8],"curation":[9,204],"of":[10,30,40,86,148,160,172],"structured":[11],"phenotypes":[12,197],"from":[13,43,95,114],"clinical":[14,38,44,103,112,176],"notes":[15,113],"labor-intensive":[17],"and":[18,54,83,105,139,167,205],"difficult":[19],"scale.":[21],"Existing":[22],"artificial":[23],"intelligence":[24],"approaches":[25],"typically":[26],"optimize":[27],"individual":[28],"components":[29],"phenotyping":[31,70,177,181],"do":[33],"not":[34],"operationalize":[35],"the":[36,124,158,170,174,207],"full":[37,175],"workflow":[39,186],"extracting":[41],"features":[42],"text,":[45],"standardizing":[46],"them":[47],"Human":[49],"Phenotype":[50],"Ontology":[51],"(HPO)":[52],"terms,":[53,82],"prioritizing":[55],"diagnostically":[56,87],"informative":[57,88],"HPO":[58,81,121],"terms.":[59],"We":[60,90],"developed":[61],"RARE-PHENIX,":[62],"an":[63],"end-to-end":[64,143],"AI":[65],"framework":[66],"for":[67],"that":[71,198],"integrates":[72],"large":[73],"language":[74],"model-based":[75],"phenotype":[76],"extraction,":[77],"ontology-grounded":[78],"standardization":[79],"supervised":[84],"ranking":[85],"phenotypes.":[89],"trained":[91],"RARE-PHENIX":[92,127,164,193],"using":[93],"data":[94],"2,671":[96],"patients":[97],"across":[98,136],"11":[99],"Undiagnosed":[100],"Diseases":[101],"Network":[102],"sites,":[104],"externally":[106],"validated":[107],"it":[108],"on":[109],"16,357":[110],"real-world":[111,216],"Vanderbilt":[115],"University":[116],"Medical":[117],"Center.":[118],"Using":[119],"clinician-curated":[120],"terms":[122],"as":[123,182],"gold":[125],"standard,":[126],"consistently":[128],"outperformed":[129],"a":[130,183,189],"state-of-the-art":[131],"deep":[132],"learning":[133],"baseline":[134],"(PhenoBERT)":[135],"ontology-based":[137,146],"similarity":[138,147],"precision-recall-F1":[140],"metrics":[141],"in":[142,163,215],"evaluation":[144],"(i.e.,":[145],"0.70":[149],"vs.":[150],"0.58).":[151],"Ablation":[152],"analyses":[153],"demonstrated":[154],"performance":[155],"improvements":[156],"with":[157,202],"addition":[159],"each":[161],"module":[162],"(extraction,":[165],"standardization,":[166],"prioritization),":[168],"supporting":[169],"value":[171],"modeling":[173,180],"workflow.":[178],"By":[179],"clinically":[184],"aligned":[185],"rather":[187],"than":[188],"single":[190],"extraction":[191],"task,":[192],"provides":[194],"structured,":[195],"ranked":[196],"are":[199],"more":[200],"concordant":[201],"clinician":[203],"has":[206],"potential":[208],"support":[210],"human-in-the-loop":[211],"diagnosis":[214],"settings.":[217]},"counts_by_year":[],"updated_date":"2026-04-04T16:13:02.066488","created_date":"2026-02-26T00:00:00"}
