{"id":"https://openalex.org/W7131077809","doi":"https://doi.org/10.48550/arxiv.2602.17949","title":"CUICurate: A GraphRAG-based Framework for Automated Clinical Concept Curation for NLP applications","display_name":"CUICurate: A GraphRAG-based Framework for Automated Clinical Concept Curation for NLP applications","publication_year":2026,"publication_date":"2026-02-20","ids":{"openalex":"https://openalex.org/W7131077809","doi":"https://doi.org/10.48550/arxiv.2602.17949"},"language":null,"primary_location":{"id":"pmh:doi:10.48550/arxiv.2602.17949","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":null,"any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5069115751","display_name":"Victoria Blake","orcid":"https://orcid.org/0009-0006-7006-7130"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Blake, Victoria","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126658492","display_name":"Mathew Miller","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Miller, Mathew","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5049944000","display_name":"Jamie Novak","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Novak, Jamie","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5048329390","display_name":"Sze\u2010Yuan Ooi","orcid":"https://orcid.org/0000-0001-6916-7958"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ooi, Sze-yuan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5110237039","display_name":"Blanca Gallego","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gallego, Blanca","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5069115751"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11710","display_name":"Biomedical Text Mining and Ontologies","score":0.3596999943256378,"subfield":{"id":"https://openalex.org/subfields/1312","display_name":"Molecular Biology"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},"topics":[{"id":"https://openalex.org/T11710","display_name":"Biomedical Text Mining and Ontologies","score":0.3596999943256378,"subfield":{"id":"https://openalex.org/subfields/1312","display_name":"Molecular Biology"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.3538999855518341,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13702","display_name":"Machine Learning in Healthcare","score":0.1518000066280365,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/unified-medical-language-system","display_name":"Unified Medical Language System","score":0.9563000202178955},{"id":"https://openalex.org/keywords/set","display_name":"Set (abstract data type)","score":0.604200005531311},{"id":"https://openalex.org/keywords/identifier","display_name":"Identifier","score":0.5198000073432922},{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.515500009059906},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.4593999981880188},{"id":"https://openalex.org/keywords/named-entity-recognition","display_name":"Named-entity recognition","score":0.4429999887943268},{"id":"https://openalex.org/keywords/controlled-vocabulary","display_name":"Controlled vocabulary","score":0.42590001225471497},{"id":"https://openalex.org/keywords/precision-and-recall","display_name":"Precision and recall","score":0.41749998927116394},{"id":"https://openalex.org/keywords/snomed-ct","display_name":"SNOMED CT","score":0.3580000102519989}],"concepts":[{"id":"https://openalex.org/C69505689","wikidata":"https://www.wikidata.org/wiki/Q455338","display_name":"Unified Medical Language System","level":2,"score":0.9563000202178955},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7967000007629395},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.7163000106811523},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.604200005531311},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5907999873161316},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.5692999958992004},{"id":"https://openalex.org/C154504017","wikidata":"https://www.wikidata.org/wiki/Q853614","display_name":"Identifier","level":2,"score":0.5198000073432922},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.515500009059906},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.4593999981880188},{"id":"https://openalex.org/C2779135771","wikidata":"https://www.wikidata.org/wiki/Q403574","display_name":"Named-entity recognition","level":3,"score":0.4429999887943268},{"id":"https://openalex.org/C110615152","wikidata":"https://www.wikidata.org/wiki/Q1469824","display_name":"Controlled vocabulary","level":2,"score":0.42590001225471497},{"id":"https://openalex.org/C81669768","wikidata":"https://www.wikidata.org/wiki/Q2359161","display_name":"Precision and recall","level":2,"score":0.41749998927116394},{"id":"https://openalex.org/C206497026","wikidata":"https://www.wikidata.org/wiki/Q1753883","display_name":"SNOMED CT","level":3,"score":0.3580000102519989},{"id":"https://openalex.org/C119839945","wikidata":"https://www.wikidata.org/wiki/Q6545185","display_name":"Unique identifier","level":3,"score":0.34700000286102295},{"id":"https://openalex.org/C2778698081","wikidata":"https://www.wikidata.org/wiki/Q179797","display_name":"Thesaurus","level":2,"score":0.3440000116825104},{"id":"https://openalex.org/C132525143","wikidata":"https://www.wikidata.org/wiki/Q141488","display_name":"Graph","level":2,"score":0.3271999955177307},{"id":"https://openalex.org/C100660578","wikidata":"https://www.wikidata.org/wiki/Q18733","display_name":"Recall","level":2,"score":0.3269999921321869},{"id":"https://openalex.org/C148524875","wikidata":"https://www.wikidata.org/wiki/Q6975395","display_name":"F1 score","level":2,"score":0.3183000087738037},{"id":"https://openalex.org/C91632574","wikidata":"https://www.wikidata.org/wiki/Q15088675","display_name":"Data curation","level":2,"score":0.314300000667572},{"id":"https://openalex.org/C2778143727","wikidata":"https://www.wikidata.org/wiki/Q1820650","display_name":"Readability","level":2,"score":0.2973000109195709},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.2957000136375427},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.2939000129699707},{"id":"https://openalex.org/C4554734","wikidata":"https://www.wikidata.org/wiki/Q593744","display_name":"Knowledge base","level":2,"score":0.29170000553131104},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.27480000257492065},{"id":"https://openalex.org/C547195049","wikidata":"https://www.wikidata.org/wiki/Q1725664","display_name":"Terminology","level":2,"score":0.2639999985694885},{"id":"https://openalex.org/C71472368","wikidata":"https://www.wikidata.org/wiki/Q676880","display_name":"Text mining","level":2,"score":0.2630999982357025},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.26260000467300415},{"id":"https://openalex.org/C3018949938","wikidata":"https://www.wikidata.org/wiki/Q17166101","display_name":"Text messaging","level":2,"score":0.26249998807907104},{"id":"https://openalex.org/C85407183","wikidata":"https://www.wikidata.org/wiki/Q1045785","display_name":"Semantic network","level":2,"score":0.25130000710487366}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:doi:10.48550/arxiv.2602.17949","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},{"id":"doi:10.48550/arxiv.2602.17949","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2602.17949","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:doi:10.48550/arxiv.2602.17949","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"sustainable_development_goals":[{"display_name":"Decent work and economic growth","id":"https://metadata.un.org/sdg/8","score":0.47418904304504395}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Background:":[0],"Clinical":[1],"named":[2],"entity":[3],"recognition":[4],"tools":[5],"commonly":[6],"map":[7],"free":[8],"text":[9],"to":[10,204,233],"Unified":[11],"Medical":[12],"Language":[13],"System":[14],"(UMLS)":[15],"Concept":[16],"Unique":[17],"Identifiers":[18],"(CUIs).":[19],"For":[20,96],"many":[21],"downstream":[22],"tasks,":[23],"however,":[24],"the":[25,105,156,165,222],"clinically":[26],"meaningful":[27],"unit":[28],"is":[29,48],"not":[30],"a":[31,35,72,134,199],"single":[32],"CUI":[33],"but":[34],"concept":[36,46,81,140,153,207,227],"set":[37,82,208],"comprising":[38],"related":[39],"synonyms,":[40],"subtypes,":[41],"and":[42,52,91,114,121,138,150,193,201,240],"supertypes.":[43],"Constructing":[44],"such":[45],"sets":[47,154,228],"labour-intensive,":[49],"inconsistently":[50],"performed,":[51],"poorly":[53],"supported":[54],"by":[55,108],"existing":[56],"tools,":[57],"particularly":[58],"for":[59,78,93,237],"NLP":[60,235],"pipelines":[61,236],"that":[62,169,180,210,229],"operate":[63],"directly":[64],"on":[65,127],"UMLS":[66,80,85,206],"CUIs.":[67],"Methods":[68],"We":[69],"present":[70],"CUICurate,":[71],"Graph-based":[73],"retrieval-augmented":[74],"generation":[75],"(GraphRAG)":[76],"framework":[77,124,223],"automated":[79],"curation.":[83],"A":[84],"knowledge":[86],"graph":[87],"(KG)":[88],"was":[89,125],"constructed":[90],"embedded":[92],"semantic":[94],"retrieval.":[95],"each":[97],"target":[98],"concept,":[99],"candidate":[100,226],"CUIs":[101],"were":[102,188],"retrieved":[103],"from":[104],"KG,":[106],"followed":[107],"large":[109],"language":[110],"model":[111],"(LLM)":[112],"filtering":[113],"classification":[115],"steps":[116],"comparing":[117],"two":[118,166],"LLMs":[119,167],"(GPT-5":[120],"GPT-5-mini).":[122],"The":[123],"evaluated":[126],"five":[128],"lexically":[129],"heterogeneous":[130],"clinical":[131,234],"concepts":[132],"against":[133],"manually":[135],"curated":[136],"benchmark":[137],"gold-standard":[139],"sets.":[141],"Results":[142],"Across":[143],"all":[144],"concepts,":[145],"CUICurate":[146,197],"produced":[147,178],"substantially":[148,211],"larger":[149],"more":[151,181],"complete":[152],"than":[155],"manual":[157,213],"benchmarks":[158],"whilst":[159],"matching":[160],"human":[161],"precision.":[162],"Comparisons":[163],"between":[164],"found":[168],"GPT-5-mini":[170],"achieved":[171],"higher":[172],"recall":[173],"during":[174],"filtering,":[175],"while":[176],"GPT-5":[177],"classifications":[179],"closely":[182],"aligned":[183],"with":[184,219],"clinician":[185],"judgements.":[186],"Outputs":[187],"stable":[189],"across":[190],"repeated":[191],"runs":[192],"computationally":[194],"inexpensive.":[195],"Conclusions":[196],"offers":[198],"scalable":[200],"reproducible":[202],"approach":[203],"support":[205],"curation":[209],"reduces":[212],"effort.":[214],"By":[215],"integrating":[216],"graph-based":[217],"retrieval":[218],"LLM":[220],"reasoning,":[221],"produces":[224],"focused":[225],"can":[230],"be":[231],"adapted":[232],"different":[238],"phenotyping":[239],"analytic":[241],"requirements.":[242]},"counts_by_year":[],"updated_date":"2026-04-04T16:13:02.066488","created_date":"2026-02-24T00:00:00"}
