{"id":"https://openalex.org/W2146408445","doi":"https://doi.org/10.1186/1471-2105-14-10","title":"Redundancy in electronic health record corpora: analysis, impact on text mining performance and mitigation strategies","display_name":"Redundancy in electronic health record corpora: analysis, impact on text mining performance and mitigation strategies","publication_year":2013,"publication_date":"2013-01-16","ids":{"openalex":"https://openalex.org/W2146408445","doi":"https://doi.org/10.1186/1471-2105-14-10","mag":"2146408445"},"language":"en","primary_location":{"id":"doi:10.1186/1471-2105-14-10","is_oa":true,"landing_page_url":"https://doi.org/10.1186/1471-2105-14-10","pdf_url":"https://bmcbioinformatics.biomedcentral.com/counter/pdf/10.1186/1471-2105-14-10","source":{"id":"https://openalex.org/S19032547","display_name":"BMC Bioinformatics","issn_l":"1471-2105","issn":["1471-2105"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310320256","host_organization_name":"BioMed Central","host_organization_lineage":["https://openalex.org/P4310320256","https://openalex.org/P4310319965"],"host_organization_lineage_names":["BioMed Central","Springer Nature"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"BMC Bioinformatics","raw_type":"journal-article"},"type":"article","indexed_in":["crossref","doaj"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://bmcbioinformatics.biomedcentral.com/counter/pdf/10.1186/1471-2105-14-10","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5003287953","display_name":"Raphael Cohen","orcid":null},"institutions":[{"id":"https://openalex.org/I124227911","display_name":"Ben-Gurion University of the Negev","ror":"https://ror.org/05tkyf982","country_code":"IL","type":"education","lineage":["https://openalex.org/I124227911"]}],"countries":["IL"],"is_corresponding":true,"raw_author_name":"Raphael Cohen","raw_affiliation_strings":["Department of Computer Science, Ben-Gurion University in the Negev, Beer-Sheva, Israel"],"affiliations":[{"raw_affiliation_string":"Department of Computer Science, Ben-Gurion University in the Negev, Beer-Sheva, Israel","institution_ids":["https://openalex.org/I124227911"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5013658527","display_name":"Michael Elhadad","orcid":"https://orcid.org/0000-0002-5629-2351"},"institutions":[{"id":"https://openalex.org/I124227911","display_name":"Ben-Gurion University of the Negev","ror":"https://ror.org/05tkyf982","country_code":"IL","type":"education","lineage":["https://openalex.org/I124227911"]}],"countries":["IL"],"is_corresponding":false,"raw_author_name":"Michael Elhadad","raw_affiliation_strings":["Department of Computer Science, Ben-Gurion University in the Negev, Beer-Sheva, Israel"],"affiliations":[{"raw_affiliation_string":"Department of Computer Science, Ben-Gurion University in the Negev, Beer-Sheva, Israel","institution_ids":["https://openalex.org/I124227911"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5047270546","display_name":"No\u00e9mie Elhadad","orcid":"https://orcid.org/0000-0001-9721-5240"},"institutions":[{"id":"https://openalex.org/I78577930","display_name":"Columbia University","ror":"https://ror.org/00hj8s172","country_code":"US","type":"education","lineage":["https://openalex.org/I78577930"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"No\u00e9mie Elhadad","raw_affiliation_strings":["Department of Biomedical Informatics, Columbia University, New York, NY, USA","Department of Biomedical Informatics, Columbia University, New York, USA"],"affiliations":[{"raw_affiliation_string":"Department of Biomedical Informatics, Columbia University, New York, NY, USA","institution_ids":["https://openalex.org/I78577930"]},{"raw_affiliation_string":"Department of Biomedical Informatics, Columbia University, New York, USA","institution_ids":["https://openalex.org/I78577930"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5003287953"],"corresponding_institution_ids":["https://openalex.org/I124227911"],"apc_list":{"value":1690,"currency":"GBP","value_usd":2072},"apc_paid":{"value":1690,"currency":"GBP","value_usd":2072},"fwci":5.3815,"has_fulltext":true,"cited_by_count":120,"citation_normalized_percentile":{"value":0.96594897,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":89,"max":100},"biblio":{"volume":"14","issue":"1","first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11710","display_name":"Biomedical Text Mining and Ontologies","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1312","display_name":"Molecular Biology"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},"topics":[{"id":"https://openalex.org/T11710","display_name":"Biomedical Text Mining and Ontologies","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1312","display_name":"Molecular Biology"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},{"id":"https://openalex.org/T10215","display_name":"Semantic Web and Ontologies","score":0.9933000206947327,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9869999885559082,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/redundancy","display_name":"Redundancy (engineering)","score":0.8122179508209229},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7478066682815552},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.49845337867736816},{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.43362730741500854},{"id":"https://openalex.org/keywords/text-mining","display_name":"Text mining","score":0.42326363921165466},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.40204474329948425},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.36674565076828003},{"id":"https://openalex.org/keywords/data-mining","display_name":"Data mining","score":0.35038796067237854},{"id":"https://openalex.org/keywords/database","display_name":"Database","score":0.092399001121521}],"concepts":[{"id":"https://openalex.org/C152124472","wikidata":"https://www.wikidata.org/wiki/Q1204361","display_name":"Redundancy (engineering)","level":2,"score":0.8122179508209229},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7478066682815552},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.49845337867736816},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.43362730741500854},{"id":"https://openalex.org/C71472368","wikidata":"https://www.wikidata.org/wiki/Q676880","display_name":"Text mining","level":2,"score":0.42326363921165466},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.40204474329948425},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.36674565076828003},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.35038796067237854},{"id":"https://openalex.org/C77088390","wikidata":"https://www.wikidata.org/wiki/Q8513","display_name":"Database","level":1,"score":0.092399001121521},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1186/1471-2105-14-10","is_oa":true,"landing_page_url":"https://doi.org/10.1186/1471-2105-14-10","pdf_url":"https://bmcbioinformatics.biomedcentral.com/counter/pdf/10.1186/1471-2105-14-10","source":{"id":"https://openalex.org/S19032547","display_name":"BMC Bioinformatics","issn_l":"1471-2105","issn":["1471-2105"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310320256","host_organization_name":"BioMed Central","host_organization_lineage":["https://openalex.org/P4310320256","https://openalex.org/P4310319965"],"host_organization_lineage_names":["BioMed Central","Springer Nature"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"BMC Bioinformatics","raw_type":"journal-article"},{"id":"pmh:oai:europepmc.org:2597623","is_oa":true,"landing_page_url":"https://www.ncbi.nlm.nih.gov/pmc/articles/3599108","pdf_url":null,"source":{"id":"https://openalex.org/S4306400806","display_name":"Europe PMC (PubMed Central)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I1303153112","host_organization_name":"European Bioinformatics Institute","host_organization_lineage":["https://openalex.org/I1303153112"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Text"}],"best_oa_location":{"id":"doi:10.1186/1471-2105-14-10","is_oa":true,"landing_page_url":"https://doi.org/10.1186/1471-2105-14-10","pdf_url":"https://bmcbioinformatics.biomedcentral.com/counter/pdf/10.1186/1471-2105-14-10","source":{"id":"https://openalex.org/S19032547","display_name":"BMC Bioinformatics","issn_l":"1471-2105","issn":["1471-2105"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310320256","host_organization_name":"BioMed Central","host_organization_lineage":["https://openalex.org/P4310320256","https://openalex.org/P4310319965"],"host_organization_lineage_names":["BioMed Central","Springer Nature"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"BMC Bioinformatics","raw_type":"journal-article"},"sustainable_development_goals":[],"awards":[],"funders":[{"id":"https://openalex.org/F4320337372","display_name":"U.S. National Library of Medicine","ror":"https://ror.org/0060t0j89"}],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W2146408445.pdf","grobid_xml":"https://content.openalex.org/works/W2146408445.grobid-xml"},"referenced_works_count":62,"referenced_works":["https://openalex.org/W15359877","https://openalex.org/W27189800","https://openalex.org/W192665053","https://openalex.org/W199541590","https://openalex.org/W200246157","https://openalex.org/W256669079","https://openalex.org/W323540269","https://openalex.org/W1504212872","https://openalex.org/W1529075208","https://openalex.org/W1552325775","https://openalex.org/W1558365920","https://openalex.org/W1574901103","https://openalex.org/W1603889964","https://openalex.org/W1605173773","https://openalex.org/W1651266083","https://openalex.org/W1880262756","https://openalex.org/W1919152067","https://openalex.org/W1965745145","https://openalex.org/W1970891733","https://openalex.org/W1978928459","https://openalex.org/W1979794499","https://openalex.org/W1981276685","https://openalex.org/W2010762993","https://openalex.org/W2014516359","https://openalex.org/W2018989507","https://openalex.org/W2032503600","https://openalex.org/W2034852786","https://openalex.org/W2055043387","https://openalex.org/W2057913811","https://openalex.org/W2062296203","https://openalex.org/W2068017609","https://openalex.org/W2074774389","https://openalex.org/W2075137165","https://openalex.org/W2100639921","https://openalex.org/W2102150019","https://openalex.org/W2103018059","https://openalex.org/W2103587173","https://openalex.org/W2106618845","https://openalex.org/W2110518760","https://openalex.org/W2117278770","https://openalex.org/W2120831232","https://openalex.org/W2121920862","https://openalex.org/W2126866831","https://openalex.org/W2134632846","https://openalex.org/W2141885858","https://openalex.org/W2142071150","https://openalex.org/W2144100511","https://openalex.org/W2144406616","https://openalex.org/W2152224036","https://openalex.org/W2154652894","https://openalex.org/W2158598845","https://openalex.org/W2158714788","https://openalex.org/W2159583324","https://openalex.org/W2160194473","https://openalex.org/W2163302275","https://openalex.org/W2170521549","https://openalex.org/W2282054059","https://openalex.org/W2593530951","https://openalex.org/W2597289420","https://openalex.org/W4230872509","https://openalex.org/W4252166161","https://openalex.org/W6642952223"],"related_works":["https://openalex.org/W2389214306","https://openalex.org/W2965083567","https://openalex.org/W4235240664","https://openalex.org/W1838576100","https://openalex.org/W2757182831","https://openalex.org/W2095886385","https://openalex.org/W2089704382","https://openalex.org/W1983399550","https://openalex.org/W97075385","https://openalex.org/W3151146928"],"abstract_inverted_index":{"The":[0],"increasing":[1],"availability":[2],"of":[3,151,161,178,188,194,203,219,228,294,301],"Electronic":[4],"Health":[5],"Record":[6],"(EHR)":[7],"data":[8,224,302,337],"and":[9,33,44,64,148,172,180,191,197,212,230,313,316],"specifically":[10],"free-text":[11],"patient":[12,74,80,256],"notes":[13,69,263],"presents":[14],"opportunities":[15],"for":[16,254,307],"phenotype":[17],"extraction.":[18],"Text-mining":[19],"methods":[20,221],"in":[21,51,102,116,176,257,338],"particular":[22],"can":[23,98,156,325],"help":[24],"disease":[25],"modeling":[26],"by":[27,145,347],"mapping":[28],"named-entities":[29],"mentions":[30],"to":[31,84,241,291,334],"terminologies":[32],"clustering":[34],"semantically":[35],"related":[36],"terms.":[37],"EHR":[38,124,170,274,340],"corpora,":[39],"however,":[40],"exhibit":[41],"specific":[42],"statistical":[43],"linguistic":[45],"characteristics":[46,310],"when":[47,70],"compared":[48],"with":[49,225,264,276],"corpora":[50,112],"the":[52,122,141,152,159,201,217,251,258,273,292,295,299,339,344],"biomedical":[53],"literature":[54],"domain.":[55],"We":[56,166,184,199,215],"focus":[57],"on":[58,163,205,222],"copy-and-paste":[59],"redundancy:":[60],"clinicians":[61],"typically":[62],"copy":[63],"paste":[65],"information":[66],"from":[67],"previous":[68],"documenting":[71],"a":[72,78,133,168,246],"current":[73],"encounter.":[75],"Thus,":[76],"within":[77],"longitudinal":[79],"record,":[81],"one":[82,157,286],"expects":[83],"observe":[85,185,231],"heavy":[86],"redundancy.":[87,348],"In":[88],"this":[89],"paper,":[90],"we":[91,236],"ask":[92],"three":[93],"research":[94],"questions:":[95],"(i)":[96,245],"How":[97,155],"redundancy":[99,125,131,142,162,174,186,204,229],"be":[100],"quantified":[101],"large-scale":[103,169],"text":[104,117,127,164,270,309,328],"corpora?":[105],"(ii)":[106,260],"Conventional":[107],"wisdom":[108],"is":[109],"that":[110,135],"larger":[111],"yield":[113],"better":[114,280],"results":[115,218],"mining.":[118,329],"But":[119],"how":[120],"does":[121,140],"observed":[123],"affect":[126],"mining?":[128,165],"Does":[129],"such":[130,320],"introduce":[132,143],"bias":[134,345],"distorts":[136],"learned":[137],"models?":[138],"Or":[139],"benefits":[144],"highlighting":[146],"stable":[147],"important":[149],"subsets":[150],"corpus?":[153],"(iii)":[154],"mitigate":[158],"impact":[160,202],"analyze":[167],"corpus":[171,275,318],"quantify":[173],"both":[175,195],"terms":[177],"word":[179],"semantic":[181],"concept":[182],"repetition.":[183],"levels":[187,227],"about":[189],"30%":[190],"non-standard":[192],"distribution":[193],"words":[196],"concepts.":[198],"measure":[200],"two":[206,238],"standard":[207],"text-mining":[208,284,332],"applications:":[209],"collocation":[210],"identification":[211],"topic":[213],"modeling.":[214],"compare":[216,237],"these":[220],"synthetic":[223],"controlled":[226],"significant":[232],"performance":[233],"variation.":[234],"Finally,":[235],"mitigation":[239],"strategies":[240],"avoid":[242],"redundancy-induced":[243],"bias:":[244],"baseline":[247],"strategy,":[248],"keeping":[249],"only":[250],"last":[252],"note":[253],"each":[255],"corpus;":[259],"removing":[261],"redundant":[262],"an":[265],"efficient":[266],"fingerprinting-based":[267],"algorithm.":[268],"aFor":[269],"mining,":[271],"preprocessing":[272],"fingerprinting":[277],"yields":[278],"significantly":[279],"results.":[281],"Before":[282],"applying":[283],"techniques,":[285],"must":[287],"pay":[288],"careful":[289],"attention":[290],"structure":[293],"analyzed":[296],"corpora.":[297],"While":[298],"importance":[300],"cleaning":[303],"has":[304],"been":[305],"known":[306],"low-level":[308],"(e.g.,":[311],"encoding":[312],"spelling),":[314],"high-level":[315],"difficult-to-quantify":[317],"characteristics,":[319],"as":[321],"naturally":[322],"occurring":[323],"redundancy,":[324],"also":[326],"hurt":[327],"Fingerprinting":[330],"enables":[331],"techniques":[333],"leverage":[335],"available":[336],"corpus,":[341],"while":[342],"avoiding":[343],"introduced":[346]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":6},{"year":2024,"cited_by_count":12},{"year":2023,"cited_by_count":6},{"year":2022,"cited_by_count":14},{"year":2021,"cited_by_count":7},{"year":2020,"cited_by_count":10},{"year":2019,"cited_by_count":10},{"year":2018,"cited_by_count":8},{"year":2017,"cited_by_count":9},{"year":2016,"cited_by_count":8},{"year":2015,"cited_by_count":18},{"year":2014,"cited_by_count":8},{"year":2013,"cited_by_count":2},{"year":2012,"cited_by_count":1}],"updated_date":"2026-03-20T23:20:44.827607","created_date":"2025-10-10T00:00:00"}
