{"id":"https://openalex.org/W4403278801","doi":"https://doi.org/10.1109/is61756.2024.10705199","title":"An Efficient Text Cleaning Pipeline for Clinical Text for Transformer Encoder Models","display_name":"An Efficient Text Cleaning Pipeline for Clinical Text for Transformer Encoder Models","publication_year":2024,"publication_date":"2024-08-29","ids":{"openalex":"https://openalex.org/W4403278801","doi":"https://doi.org/10.1109/is61756.2024.10705199"},"language":"en","primary_location":{"id":"doi:10.1109/is61756.2024.10705199","is_oa":false,"landing_page_url":"https://doi.org/10.1109/is61756.2024.10705199","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2024 IEEE 12th International Conference on Intelligent Systems (IS)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5108184241","display_name":"Shahriyar Zaman Ridoy","orcid":null},"institutions":[{"id":"https://openalex.org/I157386601","display_name":"North South University","ror":"https://ror.org/05wdbfp45","country_code":"BD","type":"education","lineage":["https://openalex.org/I157386601"]}],"countries":["BD"],"is_corresponding":true,"raw_author_name":"Shahriyar Zaman Ridoy","raw_affiliation_strings":["North South University,Department of Electrical and Computer Engineering,Dhaka,Bangladesh"],"affiliations":[{"raw_affiliation_string":"North South University,Department of Electrical and Computer Engineering,Dhaka,Bangladesh","institution_ids":["https://openalex.org/I157386601"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5114243513","display_name":"Jannat Sultana","orcid":null},"institutions":[{"id":"https://openalex.org/I157386601","display_name":"North South University","ror":"https://ror.org/05wdbfp45","country_code":"BD","type":"education","lineage":["https://openalex.org/I157386601"]}],"countries":["BD"],"is_corresponding":false,"raw_author_name":"Jannat Sultana","raw_affiliation_strings":["North South University,Department of Electrical and Computer Engineering,Dhaka,Bangladesh"],"affiliations":[{"raw_affiliation_string":"North South University,Department of Electrical and Computer Engineering,Dhaka,Bangladesh","institution_ids":["https://openalex.org/I157386601"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5108184242","display_name":"Zinnat Fowzia Ria","orcid":null},"institutions":[{"id":"https://openalex.org/I157386601","display_name":"North South University","ror":"https://ror.org/05wdbfp45","country_code":"BD","type":"education","lineage":["https://openalex.org/I157386601"]}],"countries":["BD"],"is_corresponding":false,"raw_author_name":"Zinnat Fowzia Ria","raw_affiliation_strings":["North South University,Department of Electrical and Computer Engineering,Dhaka,Bangladesh"],"affiliations":[{"raw_affiliation_string":"North South University,Department of Electrical and Computer Engineering,Dhaka,Bangladesh","institution_ids":["https://openalex.org/I157386601"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5105792932","display_name":"Mohammed Arif Uddin","orcid":null},"institutions":[{"id":"https://openalex.org/I157386601","display_name":"North South University","ror":"https://ror.org/05wdbfp45","country_code":"BD","type":"education","lineage":["https://openalex.org/I157386601"]}],"countries":["BD"],"is_corresponding":false,"raw_author_name":"Mohammed Arif Uddin","raw_affiliation_strings":["North South University,Department of Electrical and Computer Engineering,Dhaka,Bangladesh"],"affiliations":[{"raw_affiliation_string":"North South University,Department of Electrical and Computer Engineering,Dhaka,Bangladesh","institution_ids":["https://openalex.org/I157386601"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5070251212","display_name":"Md Hasibur Rahman","orcid":"https://orcid.org/0009-0001-6712-9461"},"institutions":[{"id":"https://openalex.org/I157386601","display_name":"North South University","ror":"https://ror.org/05wdbfp45","country_code":"BD","type":"education","lineage":["https://openalex.org/I157386601"]}],"countries":["BD"],"is_corresponding":false,"raw_author_name":"Md Hasibur Rahman","raw_affiliation_strings":["North South University,Department of Electrical and Computer Engineering,Dhaka,Bangladesh"],"affiliations":[{"raw_affiliation_string":"North South University,Department of Electrical and Computer Engineering,Dhaka,Bangladesh","institution_ids":["https://openalex.org/I157386601"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5071303493","display_name":"Rashedur M. Rahman","orcid":"https://orcid.org/0000-0002-4514-6279"},"institutions":[{"id":"https://openalex.org/I157386601","display_name":"North South University","ror":"https://ror.org/05wdbfp45","country_code":"BD","type":"education","lineage":["https://openalex.org/I157386601"]}],"countries":["BD"],"is_corresponding":false,"raw_author_name":"Rashedur M. Rahman","raw_affiliation_strings":["North South University,Department of Electrical and Computer Engineering,Dhaka,Bangladesh"],"affiliations":[{"raw_affiliation_string":"North South University,Department of Electrical and Computer Engineering,Dhaka,Bangladesh","institution_ids":["https://openalex.org/I157386601"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5108184241"],"corresponding_institution_ids":["https://openalex.org/I157386601"],"apc_list":null,"apc_paid":null,"fwci":2.9575,"has_fulltext":false,"cited_by_count":6,"citation_normalized_percentile":{"value":0.92855587,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":98,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"9"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11450","display_name":"Model-Driven Software Engineering Techniques","score":0.5062999725341797,"subfield":{"id":"https://openalex.org/subfields/1712","display_name":"Software"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11450","display_name":"Model-Driven Software Engineering Techniques","score":0.5062999725341797,"subfield":{"id":"https://openalex.org/subfields/1712","display_name":"Software"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/encoder","display_name":"Encoder","score":0.7392435669898987},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7178047895431519},{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.6697072982788086},{"id":"https://openalex.org/keywords/pipeline","display_name":"Pipeline (software)","score":0.6344214677810669},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.3291126489639282},{"id":"https://openalex.org/keywords/electrical-engineering","display_name":"Electrical engineering","score":0.17950144410133362},{"id":"https://openalex.org/keywords/programming-language","display_name":"Programming language","score":0.16657593846321106},{"id":"https://openalex.org/keywords/engineering","display_name":"Engineering","score":0.12731173634529114},{"id":"https://openalex.org/keywords/voltage","display_name":"Voltage","score":0.12353476881980896},{"id":"https://openalex.org/keywords/operating-system","display_name":"Operating system","score":0.09433600306510925}],"concepts":[{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.7392435669898987},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7178047895431519},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.6697072982788086},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.6344214677810669},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.3291126489639282},{"id":"https://openalex.org/C119599485","wikidata":"https://www.wikidata.org/wiki/Q43035","display_name":"Electrical engineering","level":1,"score":0.17950144410133362},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.16657593846321106},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.12731173634529114},{"id":"https://openalex.org/C165801399","wikidata":"https://www.wikidata.org/wiki/Q25428","display_name":"Voltage","level":2,"score":0.12353476881980896},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.09433600306510925}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/is61756.2024.10705199","is_oa":false,"landing_page_url":"https://doi.org/10.1109/is61756.2024.10705199","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2024 IEEE 12th International Conference on Intelligent Systems (IS)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Industry, innovation and infrastructure","score":0.5899999737739563,"id":"https://metadata.un.org/sdg/9"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":9,"referenced_works":["https://openalex.org/W2396881363","https://openalex.org/W2911489562","https://openalex.org/W3021743785","https://openalex.org/W3132259035","https://openalex.org/W3167343366","https://openalex.org/W4283785590","https://openalex.org/W4320913850","https://openalex.org/W4390141881","https://openalex.org/W6761672038"],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2748952813","https://openalex.org/W2390279801","https://openalex.org/W2358668433","https://openalex.org/W4396701345","https://openalex.org/W2376932109","https://openalex.org/W2001405890","https://openalex.org/W4396696052","https://openalex.org/W4402327032","https://openalex.org/W2382290278"],"abstract_inverted_index":{"It":[0],"might":[1],"be":[2],"challenging":[3],"to":[4,20,61,111,116,141,197,216],"choose":[5],"the":[6,12,21,27,45,127,156,160,163,172,207],"best":[7],"text":[8,53,128],"preprocessing":[9,35,80],"strategy":[10],"in":[11],"field":[13],"of":[14,23,29],"natural":[15],"language":[16],"processing":[17],"(NLP)":[18],"due":[19],"variety":[22],"techniques":[24,81,108],"available.":[25],"Given":[26],"popularity":[28],"transformer":[30],"models,":[31],"we":[32,201],"wondered":[33],"if":[34,39],"was":[36,60],"necessary":[37],"and,":[38],"so,":[40],"what":[41],"methods":[42],"would":[43],"improve":[44,117],"models'":[46],"performance.":[47,75],"Especially":[48],"when":[49],"working":[50],"with":[51,122,129],"clinical":[52,68],"data,":[54],"accuracy":[55,104],"is":[56],"crucial.":[57],"Our":[58,119,149],"goal":[59],"find":[62],"an":[63],"appropriate":[64],"pre-processing":[65],"pipeline":[66,115,120,154,210],"for":[67,171],"texts":[69],"that":[70,152,182],"maintains":[71],"or":[72,193],"improves":[73],"model":[74,166],"We":[76,92,179],"experienced":[77],"four":[78,94],"common":[79],"and":[82,90,100,133,146,170],"their":[83],"groupings":[84],"on":[85],"two":[86],"datasets":[87],"from":[88,106,206],"MIMIC-3":[89,161],"PubMed.":[91],"used":[93],"models:":[95],"BERT":[96,164],"base,":[97],"BioBERT,":[98],"BioClinicalBERT,":[99],"RoBERTa.":[101],"The":[102],"varied":[103],"results":[105,150],"existing":[107],"inspired":[109],"us":[110],"develop":[112],"a":[113,130],"new":[114],"accuracy.":[118,178],"starts":[121],"removing":[123,183],"repeated":[124],"punctuation,":[125],"normalizing":[126],"CleanText":[131],"function,":[132],"filtering":[134],"less":[135,203],"important":[136,204],"words":[137,185,205],"using":[138,189],"TF-IDF":[139,190],"scores":[140],"keep":[142],"clinically":[143],"applicable":[144],"terms":[145],"moderate":[147],"noise.":[148],"presented":[151],"our":[153,209],"outperformed":[155],"base":[157,165],"models.":[158],"For":[159],"dataset,":[162,174],"achieved":[167,176],"90.16%":[168],"accuracy,":[169,187],"PubMed":[173],"BioBERT":[175],"64.20%":[177],"also":[180],"found":[181],"stop":[184],"decreased":[186],"while":[188],"either":[191],"maintained":[192],"improved":[194],"it":[195],"up":[196,215],"3%.":[198],"Additionally,":[199],"as":[200],"removed":[202],"documents":[208],"considerably":[211],"reduced":[212],"training":[213],"time":[214],"17%.":[217]},"counts_by_year":[{"year":2025,"cited_by_count":6}],"updated_date":"2026-02-27T16:54:17.756197","created_date":"2025-10-10T00:00:00"}
