{"id":"https://openalex.org/W3008511741","doi":"https://doi.org/10.1109/bigdata47090.2019.9006492","title":"Discovering Sublanguages in a Large Clinical Corpus through Unsupervised Machine Learning and Information Gain","display_name":"Discovering Sublanguages in a Large Clinical Corpus through Unsupervised Machine Learning and Information Gain","publication_year":2019,"publication_date":"2019-12-01","ids":{"openalex":"https://openalex.org/W3008511741","doi":"https://doi.org/10.1109/bigdata47090.2019.9006492","mag":"3008511741"},"language":"en","primary_location":{"id":"doi:10.1109/bigdata47090.2019.9006492","is_oa":false,"landing_page_url":"https://doi.org/10.1109/bigdata47090.2019.9006492","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2019 IEEE International Conference on Big Data (Big Data)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5034770262","display_name":"T. Elizabeth Workman","orcid":"https://orcid.org/0000-0002-2273-078X"},"institutions":[{"id":"https://openalex.org/I193531525","display_name":"George Washington University","ror":"https://ror.org/00y4zzh67","country_code":"US","type":"education","lineage":["https://openalex.org/I193531525"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"T. Elizabeth Workman","raw_affiliation_strings":["Biomedical Informatics Center, The George Washington University, Washington, D.C., U.S.A"],"affiliations":[{"raw_affiliation_string":"Biomedical Informatics Center, The George Washington University, Washington, D.C., U.S.A","institution_ids":["https://openalex.org/I193531525"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5007564242","display_name":"Guy Divita","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Guy Divita","raw_affiliation_strings":["Division of Epidemiology, The University of Utah, Salt Lake City, UT, U.S.A"],"affiliations":[{"raw_affiliation_string":"Division of Epidemiology, The University of Utah, Salt Lake City, UT, U.S.A","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5058747294","display_name":"Qing Zeng\u2010Treitler","orcid":"https://orcid.org/0000-0002-8353-7473"},"institutions":[{"id":"https://openalex.org/I193531525","display_name":"George Washington University","ror":"https://ror.org/00y4zzh67","country_code":"US","type":"education","lineage":["https://openalex.org/I193531525"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Qing Zeng-Treitler","raw_affiliation_strings":["Biomedical Informatics Center, The George Washington University, Washington, D.C., U.S.A"],"affiliations":[{"raw_affiliation_string":"Biomedical Informatics Center, The George Washington University, Washington, D.C., U.S.A","institution_ids":["https://openalex.org/I193531525"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5034770262"],"corresponding_institution_ids":["https://openalex.org/I193531525"],"apc_list":null,"apc_paid":null,"fwci":0.5601,"has_fulltext":false,"cited_by_count":9,"citation_normalized_percentile":{"value":0.76371771,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":89,"max":97},"biblio":{"volume":null,"issue":null,"first_page":"4889","last_page":"4898"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9993000030517578,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9993000030517578,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12881","display_name":"linguistics and terminology studies","score":0.9915000200271606,"subfield":{"id":"https://openalex.org/subfields/1203","display_name":"Language and Linguistics"},"field":{"id":"https://openalex.org/fields/12","display_name":"Arts and Humanities"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T11710","display_name":"Biomedical Text Mining and Ontologies","score":0.9904000163078308,"subfield":{"id":"https://openalex.org/subfields/1312","display_name":"Molecular Biology"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/sublanguage","display_name":"Sublanguage","score":0.9361560344696045},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7736318111419678},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.7385232448577881},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.722958505153656},{"id":"https://openalex.org/keywords/cluster-analysis","display_name":"Cluster analysis","score":0.555779218673706},{"id":"https://openalex.org/keywords/entropy","display_name":"Entropy (arrow of time)","score":0.4233975112438202},{"id":"https://openalex.org/keywords/unsupervised-learning","display_name":"Unsupervised learning","score":0.41319942474365234},{"id":"https://openalex.org/keywords/identification","display_name":"Identification (biology)","score":0.4118972420692444}],"concepts":[{"id":"https://openalex.org/C2776411971","wikidata":"https://www.wikidata.org/wiki/Q17141398","display_name":"Sublanguage","level":2,"score":0.9361560344696045},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7736318111419678},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.7385232448577881},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.722958505153656},{"id":"https://openalex.org/C73555534","wikidata":"https://www.wikidata.org/wiki/Q622825","display_name":"Cluster analysis","level":2,"score":0.555779218673706},{"id":"https://openalex.org/C106301342","wikidata":"https://www.wikidata.org/wiki/Q4117933","display_name":"Entropy (arrow of time)","level":2,"score":0.4233975112438202},{"id":"https://openalex.org/C8038995","wikidata":"https://www.wikidata.org/wiki/Q1152135","display_name":"Unsupervised learning","level":2,"score":0.41319942474365234},{"id":"https://openalex.org/C116834253","wikidata":"https://www.wikidata.org/wiki/Q2039217","display_name":"Identification (biology)","level":2,"score":0.4118972420692444},{"id":"https://openalex.org/C59822182","wikidata":"https://www.wikidata.org/wiki/Q441","display_name":"Botany","level":1,"score":0.0},{"id":"https://openalex.org/C86803240","wikidata":"https://www.wikidata.org/wiki/Q420","display_name":"Biology","level":0,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/bigdata47090.2019.9006492","is_oa":false,"landing_page_url":"https://doi.org/10.1109/bigdata47090.2019.9006492","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2019 IEEE International Conference on Big Data (Big Data)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4","score":0.5299999713897705}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":35,"referenced_works":["https://openalex.org/W1367335","https://openalex.org/W28820520","https://openalex.org/W80463681","https://openalex.org/W144923133","https://openalex.org/W587150375","https://openalex.org/W1548184736","https://openalex.org/W1965555277","https://openalex.org/W1971954596","https://openalex.org/W1985297116","https://openalex.org/W1995875735","https://openalex.org/W2002514548","https://openalex.org/W2015509928","https://openalex.org/W2022817031","https://openalex.org/W2049810512","https://openalex.org/W2082258771","https://openalex.org/W2093157872","https://openalex.org/W2115871101","https://openalex.org/W2124077863","https://openalex.org/W2127314673","https://openalex.org/W2135479785","https://openalex.org/W2142562230","https://openalex.org/W2162311237","https://openalex.org/W2165308817","https://openalex.org/W2335231222","https://openalex.org/W2415814795","https://openalex.org/W2463713401","https://openalex.org/W4252713891","https://openalex.org/W6600063700","https://openalex.org/W6601159111","https://openalex.org/W6603158870","https://openalex.org/W6605982339","https://openalex.org/W6632812989","https://openalex.org/W6677115753","https://openalex.org/W6702984405","https://openalex.org/W6716223255"],"related_works":["https://openalex.org/W2086840550","https://openalex.org/W1995535451","https://openalex.org/W199984035","https://openalex.org/W3139245306","https://openalex.org/W2118701173","https://openalex.org/W29691679","https://openalex.org/W2008737885","https://openalex.org/W2114861230","https://openalex.org/W2131073441","https://openalex.org/W1972357705"],"abstract_inverted_index":{"Sublanguages":[0],"are":[1,76],"domain-centered":[2],"subsets":[3],"of":[4,32,39],"general":[5],"or":[6],"colloquial":[7],"language.":[8],"Their":[9],"identification":[10],"drives":[11],"several":[12],"language":[13],"analysis":[14],"tasks,":[15],"but":[16],"it":[17],"is":[18],"difficult":[19],"to":[20,47],"discern":[21],"separate":[22],"sublanguages":[23,49],"in":[24,61,78],"large":[25,52],"clinical":[26,53],"corpora.":[27],"We":[28],"applied":[29],"k-means":[30],"clustering":[31],"semantic":[33],"properties,":[34],"and":[35,68,81,87],"a":[36,51,62],"novel":[37],"implementation":[38],"relative":[40],"entropy":[41],"as":[42],"an":[43],"information":[44],"gain":[45],"indicator,":[46],"identify":[48],"within":[50,67],"corpus":[54],"(~1.6":[55],"million":[56],"documents),":[57],"visualizing":[58],"the":[59],"results":[60],"heat":[63],"map.":[64],"Patterns":[65],"both":[66,85],"across":[69],"clusters":[70],"reveal":[71],"sublanguage":[72,79],"trends.":[73],"These":[74],"findings":[75],"significant":[77],"analysis,":[80],"have":[82],"implications":[83],"on":[84],"regional":[86],"international":[88],"levels.":[89]},"counts_by_year":[{"year":2025,"cited_by_count":3},{"year":2024,"cited_by_count":1},{"year":2023,"cited_by_count":1},{"year":2022,"cited_by_count":2},{"year":2020,"cited_by_count":2}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
