{"id":"https://openalex.org/W3119059031","doi":"https://doi.org/10.1145/3418036","title":"An Unsupervised Normalization Algorithm\u00a0for Noisy Text: A Case Study for Information Retrieval and Stance Detection","display_name":"An Unsupervised Normalization Algorithm\u00a0for Noisy Text: A Case Study for Information Retrieval and Stance Detection","publication_year":2021,"publication_date":"2021-04-27","ids":{"openalex":"https://openalex.org/W3119059031","doi":"https://doi.org/10.1145/3418036","mag":"3119059031"},"language":"en","primary_location":{"id":"doi:10.1145/3418036","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3418036","pdf_url":null,"source":{"id":"https://openalex.org/S110189822","display_name":"Journal of Data and Information Quality","issn_l":"1936-1955","issn":["1936-1955","1936-1963"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Journal of Data and Information Quality","raw_type":"journal-article"},"type":"preprint","indexed_in":["arxiv","crossref","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2101.03303","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5073133103","display_name":"Anurag Roy","orcid":"https://orcid.org/0000-0002-2097-9442"},"institutions":[{"id":"https://openalex.org/I145894827","display_name":"Indian Institute of Technology Kharagpur","ror":"https://ror.org/03w5sq511","country_code":"IN","type":"education","lineage":["https://openalex.org/I145894827"]}],"countries":["IN"],"is_corresponding":true,"raw_author_name":"Anurag Roy","raw_affiliation_strings":["Department of Computer Science and Engineering, Indian Institute of Technology Kharagpur, Kharagpur, India","Department of Computer Science and Engineering, Indian Institute of Technology, Kharagpur, Kharagpur, India#TAB#"],"affiliations":[{"raw_affiliation_string":"Department of Computer Science and Engineering, Indian Institute of Technology Kharagpur, Kharagpur, India","institution_ids":["https://openalex.org/I145894827"]},{"raw_affiliation_string":"Department of Computer Science and Engineering, Indian Institute of Technology, Kharagpur, Kharagpur, India#TAB#","institution_ids":["https://openalex.org/I145894827"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5004544456","display_name":"Shalmoli Ghosh","orcid":null},"institutions":[{"id":"https://openalex.org/I145894827","display_name":"Indian Institute of Technology Kharagpur","ror":"https://ror.org/03w5sq511","country_code":"IN","type":"education","lineage":["https://openalex.org/I145894827"]}],"countries":["IN"],"is_corresponding":false,"raw_author_name":"Shalmoli Ghosh","raw_affiliation_strings":["Department of Computer Science and Engineering, Indian Institute of Technology Kharagpur, Kharagpur, India","Department of Computer Science and Engineering, Indian Institute of Technology, Kharagpur, Kharagpur, India#TAB#"],"affiliations":[{"raw_affiliation_string":"Department of Computer Science and Engineering, Indian Institute of Technology Kharagpur, Kharagpur, India","institution_ids":["https://openalex.org/I145894827"]},{"raw_affiliation_string":"Department of Computer Science and Engineering, Indian Institute of Technology, Kharagpur, Kharagpur, India#TAB#","institution_ids":["https://openalex.org/I145894827"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5082286441","display_name":"Kripabandhu Ghosh","orcid":"https://orcid.org/0000-0002-8130-1221"},"institutions":[{"id":"https://openalex.org/I127439422","display_name":"Indian Institute of Science Education and Research Kolkata","ror":"https://ror.org/00djv2c17","country_code":"IN","type":"education","lineage":["https://openalex.org/I127439422"]}],"countries":["IN"],"is_corresponding":false,"raw_author_name":"Kripabandhu Ghosh","raw_affiliation_strings":["Department of Computer Science and Application, Indian Institute of Science Education and Research Kolkata, Mohanpur, India"],"affiliations":[{"raw_affiliation_string":"Department of Computer Science and Application, Indian Institute of Science Education and Research Kolkata, Mohanpur, India","institution_ids":["https://openalex.org/I127439422"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5073748464","display_name":"Saptarshi Ghosh","orcid":"https://orcid.org/0000-0002-2306-300X"},"institutions":[{"id":"https://openalex.org/I145894827","display_name":"Indian Institute of Technology Kharagpur","ror":"https://ror.org/03w5sq511","country_code":"IN","type":"education","lineage":["https://openalex.org/I145894827"]}],"countries":["IN"],"is_corresponding":false,"raw_author_name":"Saptarshi Ghosh","raw_affiliation_strings":["Department of Computer Science and Engineering, Indian Institute of Technology Kharagpur, Kharagpur, India","Department of Computer Science and Engineering, Indian Institute of Technology, Kharagpur, Kharagpur, India#TAB#"],"affiliations":[{"raw_affiliation_string":"Department of Computer Science and Engineering, Indian Institute of Technology Kharagpur, Kharagpur, India","institution_ids":["https://openalex.org/I145894827"]},{"raw_affiliation_string":"Department of Computer Science and Engineering, Indian Institute of Technology, Kharagpur, Kharagpur, India#TAB#","institution_ids":["https://openalex.org/I145894827"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5073133103"],"corresponding_institution_ids":["https://openalex.org/I145894827"],"apc_list":null,"apc_paid":null,"fwci":0.42,"has_fulltext":true,"cited_by_count":3,"citation_normalized_percentile":{"value":0.672327,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":89,"max":96},"biblio":{"volume":"13","issue":"3","first_page":"1","last_page":"25"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9993000030517578,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9993000030517578,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9991999864578247,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12016","display_name":"Web Data Mining and Analysis","score":0.9973999857902527,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/normalization","display_name":"Normalization (sociology)","score":0.8747828006744385},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7895376682281494},{"id":"https://openalex.org/keywords/microblogging","display_name":"Microblogging","score":0.5312775373458862},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.5219501852989197},{"id":"https://openalex.org/keywords/noise","display_name":"Noise (video)","score":0.46246808767318726},{"id":"https://openalex.org/keywords/noisy-data","display_name":"Noisy data","score":0.4399181604385376},{"id":"https://openalex.org/keywords/social-media","display_name":"Social media","score":0.41095733642578125},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.40555334091186523},{"id":"https://openalex.org/keywords/data-mining","display_name":"Data mining","score":0.3682437539100647},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.34998974204063416}],"concepts":[{"id":"https://openalex.org/C136886441","wikidata":"https://www.wikidata.org/wiki/Q926129","display_name":"Normalization (sociology)","level":2,"score":0.8747828006744385},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7895376682281494},{"id":"https://openalex.org/C143275388","wikidata":"https://www.wikidata.org/wiki/Q92438","display_name":"Microblogging","level":3,"score":0.5312775373458862},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5219501852989197},{"id":"https://openalex.org/C99498987","wikidata":"https://www.wikidata.org/wiki/Q2210247","display_name":"Noise (video)","level":3,"score":0.46246808767318726},{"id":"https://openalex.org/C2781170535","wikidata":"https://www.wikidata.org/wiki/Q30587856","display_name":"Noisy data","level":2,"score":0.4399181604385376},{"id":"https://openalex.org/C518677369","wikidata":"https://www.wikidata.org/wiki/Q202833","display_name":"Social media","level":2,"score":0.41095733642578125},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.40555334091186523},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.3682437539100647},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.34998974204063416},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.0},{"id":"https://openalex.org/C19165224","wikidata":"https://www.wikidata.org/wiki/Q23404","display_name":"Anthropology","level":1,"score":0.0},{"id":"https://openalex.org/C136764020","wikidata":"https://www.wikidata.org/wiki/Q466","display_name":"World Wide Web","level":1,"score":0.0},{"id":"https://openalex.org/C144024400","wikidata":"https://www.wikidata.org/wiki/Q21201","display_name":"Sociology","level":0,"score":0.0}],"mesh":[],"locations_count":4,"locations":[{"id":"doi:10.1145/3418036","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3418036","pdf_url":null,"source":{"id":"https://openalex.org/S110189822","display_name":"Journal of Data and Information Quality","issn_l":"1936-1955","issn":["1936-1955","1936-1963"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Journal of Data and Information Quality","raw_type":"journal-article"},{"id":"pmh:oai:arXiv.org:2101.03303","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2101.03303","pdf_url":"https://arxiv.org/pdf/2101.03303","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"mag:3119059031","is_oa":true,"landing_page_url":"https://arxiv.org/pdf/2101.03303.pdf","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"arXiv (Cornell University)","raw_type":null},{"id":"doi:10.48550/arxiv.2101.03303","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2101.03303","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2101.03303","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2101.03303","pdf_url":"https://arxiv.org/pdf/2101.03303","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[{"score":0.8500000238418579,"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education"}],"awards":[],"funders":[{"id":"https://openalex.org/F4320309480","display_name":"Nvidia","ror":"https://ror.org/03jdj4y14"},{"id":"https://openalex.org/F4320320712","display_name":"Indian Institute of Technology Kanpur","ror":"https://ror.org/05pjsgx75"}],"has_content":{"pdf":true,"grobid_xml":false},"content_urls":{"pdf":"https://content.openalex.org/works/W3119059031.pdf"},"referenced_works_count":36,"referenced_works":["https://openalex.org/W1482504747","https://openalex.org/W1520449809","https://openalex.org/W1868971014","https://openalex.org/W1995672491","https://openalex.org/W2000447173","https://openalex.org/W2027687511","https://openalex.org/W2033523543","https://openalex.org/W2057900969","https://openalex.org/W2064675550","https://openalex.org/W2092911974","https://openalex.org/W2122254380","https://openalex.org/W2127048411","https://openalex.org/W2131681506","https://openalex.org/W2141599568","https://openalex.org/W2144226312","https://openalex.org/W2146867136","https://openalex.org/W2151936673","https://openalex.org/W2153579005","https://openalex.org/W2250293945","https://openalex.org/W2250423567","https://openalex.org/W2251237590","https://openalex.org/W2252107166","https://openalex.org/W2404228453","https://openalex.org/W2460159515","https://openalex.org/W2604868928","https://openalex.org/W2767329425","https://openalex.org/W2768030591","https://openalex.org/W2774974668","https://openalex.org/W2783821110","https://openalex.org/W2911944143","https://openalex.org/W2946365616","https://openalex.org/W2963010813","https://openalex.org/W2963151059","https://openalex.org/W2999282340","https://openalex.org/W3004975108","https://openalex.org/W6631368335"],"related_works":["https://openalex.org/W2776429271","https://openalex.org/W3164120900","https://openalex.org/W3202787371","https://openalex.org/W3055227258","https://openalex.org/W2967052791","https://openalex.org/W3134061603","https://openalex.org/W3168378459","https://openalex.org/W3118428065","https://openalex.org/W3091088803","https://openalex.org/W3035271324","https://openalex.org/W3194859544","https://openalex.org/W2513638114","https://openalex.org/W3090840755","https://openalex.org/W3174485549","https://openalex.org/W2979707486","https://openalex.org/W2982156428","https://openalex.org/W3122930709","https://openalex.org/W3091983920","https://openalex.org/W2791153761","https://openalex.org/W3207527868"],"abstract_inverted_index":{"A":[0],"large":[1,90],"fraction":[2],"of":[3,11,26,59,77,92],"textual":[4],"data":[5,94],"available":[6,45],"today":[7],"contains":[8],"various":[9],"types":[10],"\u201cnoise,\u201d":[12],"such":[13,37],"as":[14,38,153],"OCR":[15],"noise":[16,20,60],"in":[17,61],"digitized":[18],"documents,":[19],"due":[21],"to":[22,98,121,155],"informal":[23],"writing":[24],"style":[25],"users":[27],"on":[28],"microblogging":[29],"sites,":[30],"and":[31,40,85,126,131,150],"so":[32],"on.":[33],"To":[34],"enable":[35],"tasks":[36],"search/retrieval":[39],"classification":[41],"over":[42,123,135],"all":[43],"the":[44,62,78,144],"data,":[46],"we":[47],"need":[48,111],"robust":[49],"algorithms":[50],"for":[51,55,105],"text":[52,80,106,122,141,160],"normalization,":[53],"i.e.,":[54],"cleaning":[56,70],"different":[57,124],"kinds":[58],"text.":[63],"There":[64],"have":[65],"been":[66],"several":[67,136,158],"efforts":[68],"towards":[69],"or":[71,89],"normalizing":[72],"noisy":[73],"text;":[74],"however,":[75],"many":[76],"existing":[79],"normalization":[81,107,142,161],"methods":[82],"are":[83],"supervised":[84],"require":[86],"language-dependent":[87],"resources":[88],"amounts":[91],"training":[93,113],"that":[95,108,140,156],"is":[96,119],"difficult":[97],"obtain.":[99],"We":[100],"propose":[101],"an":[102],"unsupervised":[103],"algorithm":[104,118,146],"does":[109],"not":[110],"any":[112],"data/human":[114],"intervention.":[115],"The":[116],"proposed":[117,145],"applicable":[120],"languages":[125],"can":[127],"handle":[128],"both":[129],"machine-generated":[130],"human-generated":[132],"noise.":[133],"Experiments":[134],"standard":[137],"datasets":[138],"show":[139],"through":[143],"enables":[147],"better":[148],"retrieval":[149],"stance":[151],"detection,":[152],"compared":[154],"using":[157],"baseline":[159],"methods.":[162]},"counts_by_year":[{"year":2024,"cited_by_count":2},{"year":2023,"cited_by_count":1}],"updated_date":"2026-03-20T23:20:44.827607","created_date":"2025-10-10T00:00:00"}
