{"id":"https://openalex.org/W4416676944","doi":"https://doi.org/10.1109/dsaa65442.2025.11247965","title":"Towards LLM-Guided Healthcare Dataset Harmonization","display_name":"Towards LLM-Guided Healthcare Dataset Harmonization","publication_year":2025,"publication_date":"2025-10-09","ids":{"openalex":"https://openalex.org/W4416676944","doi":"https://doi.org/10.1109/dsaa65442.2025.11247965"},"language":null,"primary_location":{"id":"doi:10.1109/dsaa65442.2025.11247965","is_oa":false,"landing_page_url":"https://doi.org/10.1109/dsaa65442.2025.11247965","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE 12th International Conference on Data Science and Advanced Analytics (DSAA)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5023796920","display_name":"Christos Smailis","orcid":null},"institutions":[{"id":"https://openalex.org/I4210118299","display_name":"Institute for Biomedicine","ror":"https://ror.org/02hsggv49","country_code":"IT","type":"facility","lineage":["https://openalex.org/I1319360392","https://openalex.org/I4210118299"]}],"countries":["IT"],"is_corresponding":true,"raw_author_name":"Christos Smailis","raw_affiliation_strings":["University of Houston,Computational Biomedicine Lab,Department of Computer Science,Texas,USA"],"affiliations":[{"raw_affiliation_string":"University of Houston,Computational Biomedicine Lab,Department of Computer Science,Texas,USA","institution_ids":["https://openalex.org/I4210118299"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5048296533","display_name":"Ordonez Carlos","orcid":null},"institutions":[{"id":"https://openalex.org/I4210099593","display_name":"Computer Algorithms for Medicine","ror":"https://ror.org/00zky6d38","country_code":"AT","type":"facility","lineage":["https://openalex.org/I4210099593"]}],"countries":["AT"],"is_corresponding":false,"raw_author_name":"Carlos Ordonez","raw_affiliation_strings":["University of Houston,Data-Intensive Parallel Algorithms for AI,Department of Computer Science,Texas,USA"],"affiliations":[{"raw_affiliation_string":"University of Houston,Data-Intensive Parallel Algorithms for AI,Department of Computer Science,Texas,USA","institution_ids":["https://openalex.org/I4210099593"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5101875396","display_name":"Ioannis A. Kakadiaris","orcid":"https://orcid.org/0000-0001-5983-7268"},"institutions":[{"id":"https://openalex.org/I4210118299","display_name":"Institute for Biomedicine","ror":"https://ror.org/02hsggv49","country_code":"IT","type":"facility","lineage":["https://openalex.org/I1319360392","https://openalex.org/I4210118299"]}],"countries":["IT"],"is_corresponding":false,"raw_author_name":"Ioannis A. Kakadiaris","raw_affiliation_strings":["University of Houston,Computational Biomedicine Lab,Department of Computer Science,Texas,USA"],"affiliations":[{"raw_affiliation_string":"University of Houston,Computational Biomedicine Lab,Department of Computer Science,Texas,USA","institution_ids":["https://openalex.org/I4210118299"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5023796920"],"corresponding_institution_ids":["https://openalex.org/I4210118299"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.1909405,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"4"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T13702","display_name":"Machine Learning in Healthcare","score":0.41499999165534973,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T13702","display_name":"Machine Learning in Healthcare","score":0.41499999165534973,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10350","display_name":"Electronic Health Records Systems","score":0.29319998621940613,"subfield":{"id":"https://openalex.org/subfields/3605","display_name":"Health Information Management"},"field":{"id":"https://openalex.org/fields/36","display_name":"Health Professions"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}},{"id":"https://openalex.org/T11719","display_name":"Data Quality and Management","score":0.09960000216960907,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/harmonization","display_name":"Harmonization","score":0.8410000205039978},{"id":"https://openalex.org/keywords/metadata","display_name":"Metadata","score":0.6326000094413757},{"id":"https://openalex.org/keywords/schema","display_name":"Schema (genetic algorithms)","score":0.5723999738693237},{"id":"https://openalex.org/keywords/standardization","display_name":"Standardization","score":0.5286999940872192},{"id":"https://openalex.org/keywords/electronic-health-record","display_name":"Electronic health record","score":0.40380001068115234},{"id":"https://openalex.org/keywords/health-care","display_name":"Health care","score":0.3666999936103821},{"id":"https://openalex.org/keywords/process","display_name":"Process (computing)","score":0.35109999775886536},{"id":"https://openalex.org/keywords/snapshot","display_name":"Snapshot (computer storage)","score":0.32350000739097595}],"concepts":[{"id":"https://openalex.org/C2779962950","wikidata":"https://www.wikidata.org/wiki/Q5659376","display_name":"Harmonization","level":2,"score":0.8410000205039978},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6373999714851379},{"id":"https://openalex.org/C93518851","wikidata":"https://www.wikidata.org/wiki/Q180160","display_name":"Metadata","level":2,"score":0.6326000094413757},{"id":"https://openalex.org/C52146309","wikidata":"https://www.wikidata.org/wiki/Q7431116","display_name":"Schema (genetic algorithms)","level":2,"score":0.5723999738693237},{"id":"https://openalex.org/C188087704","wikidata":"https://www.wikidata.org/wiki/Q369577","display_name":"Standardization","level":2,"score":0.5286999940872192},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.41260001063346863},{"id":"https://openalex.org/C3020144179","wikidata":"https://www.wikidata.org/wiki/Q10871684","display_name":"Electronic health record","level":3,"score":0.40380001068115234},{"id":"https://openalex.org/C136764020","wikidata":"https://www.wikidata.org/wiki/Q466","display_name":"World Wide Web","level":1,"score":0.38850000500679016},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.3734000027179718},{"id":"https://openalex.org/C160735492","wikidata":"https://www.wikidata.org/wiki/Q31207","display_name":"Health care","level":2,"score":0.3666999936103821},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.35109999775886536},{"id":"https://openalex.org/C55282118","wikidata":"https://www.wikidata.org/wiki/Q252683","display_name":"Snapshot (computer storage)","level":2,"score":0.32350000739097595},{"id":"https://openalex.org/C3017977704","wikidata":"https://www.wikidata.org/wiki/Q18745135","display_name":"Health data","level":3,"score":0.319599986076355},{"id":"https://openalex.org/C72634772","wikidata":"https://www.wikidata.org/wiki/Q386824","display_name":"Data integration","level":2,"score":0.31869998574256897},{"id":"https://openalex.org/C77088390","wikidata":"https://www.wikidata.org/wiki/Q8513","display_name":"Database","level":1,"score":0.31790000200271606},{"id":"https://openalex.org/C3019952477","wikidata":"https://www.wikidata.org/wiki/Q1324077","display_name":"Health records","level":3,"score":0.3156000077724457},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.3125},{"id":"https://openalex.org/C41525213","wikidata":"https://www.wikidata.org/wiki/Q690189","display_name":"Electronic data interchange","level":2,"score":0.3116999864578247},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.30480000376701355},{"id":"https://openalex.org/C56739046","wikidata":"https://www.wikidata.org/wiki/Q192060","display_name":"Knowledge management","level":1,"score":0.30160000920295715},{"id":"https://openalex.org/C123201435","wikidata":"https://www.wikidata.org/wiki/Q456632","display_name":"Information privacy","level":2,"score":0.26969999074935913},{"id":"https://openalex.org/C180198813","wikidata":"https://www.wikidata.org/wiki/Q121182","display_name":"Information system","level":2,"score":0.2694000005722046},{"id":"https://openalex.org/C110615152","wikidata":"https://www.wikidata.org/wiki/Q1469824","display_name":"Controlled vocabulary","level":2,"score":0.26829999685287476},{"id":"https://openalex.org/C33762810","wikidata":"https://www.wikidata.org/wiki/Q461671","display_name":"Data integrity","level":2,"score":0.2680000066757202},{"id":"https://openalex.org/C80958533","wikidata":"https://www.wikidata.org/wiki/Q1047174","display_name":"Audit trail","level":3,"score":0.2678999900817871},{"id":"https://openalex.org/C24756922","wikidata":"https://www.wikidata.org/wiki/Q1757694","display_name":"Data quality","level":3,"score":0.26260000467300415},{"id":"https://openalex.org/C82578977","wikidata":"https://www.wikidata.org/wiki/Q16773055","display_name":"Data aggregator","level":3,"score":0.2556000053882599}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/dsaa65442.2025.11247965","is_oa":false,"landing_page_url":"https://doi.org/10.1109/dsaa65442.2025.11247965","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE 12th International Conference on Data Science and Advanced Analytics (DSAA)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G7070055613","display_name":null,"funder_award_id":"UM1TR004539","funder_id":"https://openalex.org/F4320332161","funder_display_name":"National Institutes of Health"}],"funders":[{"id":"https://openalex.org/F4320332161","display_name":"National Institutes of Health","ror":"https://ror.org/01cwqze88"},{"id":"https://openalex.org/F4320337472","display_name":"National Center for Advancing Translational Sciences","ror":"https://ror.org/04pw6fb54"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":2,"referenced_works":["https://openalex.org/W2396881363","https://openalex.org/W3202649559"],"related_works":[],"abstract_inverted_index":{"Electronic":[0],"health":[1,151],"record":[2],"(EHR)":[3],"datasets":[4,31,81],"come":[5],"in":[6],"various":[7],"schemas":[8],"and":[9,19,61,77,127,149,171],"can":[10,57],"contain":[11],"a":[12,33,70,83,99],"range":[13],"of":[14,28,79,91],"data":[15,112,118],"types,":[16],"measurement":[17],"units,":[18],"variables":[20],"that":[21,40,73,103,162],"share":[22],"duplicate":[23],"semantic":[24],"content.":[25],"The":[26],"process":[27,55],"bringing":[29],"such":[30],"into":[32],"common":[34],"schema":[35,105,131],"with":[36,88,116],"consistent":[37],"values,":[38],"so":[39,146],"it":[41],"is":[42,48,98],"possible":[43],"to":[44,63,114],"perform":[45],"queries":[46],"uniformly,":[47],"known":[49],"as":[50],"harmonization.":[51],"However,":[52],"performing":[53],"this":[54,66],"manually":[56],"be":[58],"both":[59,158],"time-consuming":[60],"prone":[62],"errors.":[64],"In":[65,121,133],"work,":[67],"we":[68,160],"present":[69],"web-based":[71],"platform":[72],"semi-automates":[74],"the":[75,89,122,134,143,169],"harmonization":[76,101,139,170],"linking":[78,172],"EHR":[80],"through":[82],"human-in-the-loop":[84],"framework,":[85],"guiding":[86],"users":[87,125],"use":[90],"large":[92],"language":[93],"models":[94],"(LLMs).":[95],"Our":[96],"solution":[97],"two-stage":[100],"pipeline":[102],"keeps":[104],"metadata":[106],"processing":[107],"online":[108],"while":[109],"handling":[110],"patient-level":[111],"locally,":[113],"align":[115],"HIPAA":[117],"privacy":[119],"principles.":[120],"first":[123],"stage,":[124,136],"harmonize":[126],"link":[128],"only":[129],"non-identifiable":[130],"information.":[132],"second":[135],"sensitive":[137],"value-level":[138],"occurs":[140],"entirely":[141],"on":[142],"user's":[144],"system,":[145],"no":[147],"private":[148],"protected":[150],"information":[152],"ever":[153],"leaves":[154],"their":[155],"environment.":[156],"Throughout":[157],"stages,":[159],"expect":[161],"LLM-powered":[163],"suggestions":[164],"could":[165],"potentially":[166],"speed":[167],"up":[168],"processes.":[173]},"counts_by_year":[],"updated_date":"2026-04-09T08:11:56.329763","created_date":"2025-11-25T00:00:00"}
