{"id":"https://openalex.org/W2760736794","doi":"https://doi.org/10.18653/v1/w17-4416","title":"Improving Document Clustering by Removing Unnatural Language","display_name":"Improving Document Clustering by Removing Unnatural Language","publication_year":2017,"publication_date":"2017-01-01","ids":{"openalex":"https://openalex.org/W2760736794","doi":"https://doi.org/10.18653/v1/w17-4416","mag":"2760736794"},"language":"en","primary_location":{"id":"doi:10.18653/v1/w17-4416","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/w17-4416","pdf_url":"https://www.aclweb.org/anthology/W17-4416.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 3rd Workshop on Noisy User-generated Text","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://www.aclweb.org/anthology/W17-4416.pdf","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5023862494","display_name":"Myungha Jang","orcid":null},"institutions":[{"id":"https://openalex.org/I33434090","display_name":"University of Massachusetts Boston","ror":"https://ror.org/04ydmy275","country_code":"US","type":"education","lineage":["https://openalex.org/I33434090"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Myungha Jang","raw_affiliation_strings":["College of Information and Computer Sciences, University of Massachusetts"],"affiliations":[{"raw_affiliation_string":"College of Information and Computer Sciences, University of Massachusetts","institution_ids":["https://openalex.org/I33434090"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101829031","display_name":"Jinho D. Choi","orcid":"https://orcid.org/0000-0003-2693-6934"},"institutions":[{"id":"https://openalex.org/I150468666","display_name":"Emory University","ror":"https://ror.org/03czfpz43","country_code":"US","type":"education","lineage":["https://openalex.org/I150468666"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Jinho D. Choi","raw_affiliation_strings":["Department of Computer Science, Emory University"],"affiliations":[{"raw_affiliation_string":"Department of Computer Science, Emory University","institution_ids":["https://openalex.org/I150468666"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5034070218","display_name":"James Allan","orcid":"https://orcid.org/0000-0003-0132-5694"},"institutions":[{"id":"https://openalex.org/I33434090","display_name":"University of Massachusetts Boston","ror":"https://ror.org/04ydmy275","country_code":"US","type":"education","lineage":["https://openalex.org/I33434090"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"James Allan","raw_affiliation_strings":["College of Information and Computer Sciences, University of Massachusetts"],"affiliations":[{"raw_affiliation_string":"College of Information and Computer Sciences, University of Massachusetts","institution_ids":["https://openalex.org/I33434090"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5023862494"],"corresponding_institution_ids":["https://openalex.org/I33434090"],"apc_list":null,"apc_paid":null,"fwci":0.2079,"has_fulltext":true,"cited_by_count":3,"citation_normalized_percentile":{"value":0.65108469,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":89,"max":94},"biblio":{"volume":null,"issue":null,"first_page":"122","last_page":"130"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9991999864578247,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9991999864578247,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9976000189781189,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12016","display_name":"Web Data Mining and Analysis","score":0.996999979019165,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8571404218673706},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.7594459056854248},{"id":"https://openalex.org/keywords/cluster-analysis","display_name":"Cluster analysis","score":0.6795841455459595},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.6615790128707886},{"id":"https://openalex.org/keywords/natural-language","display_name":"Natural language","score":0.5359376668930054},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.520749568939209},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.503924548625946},{"id":"https://openalex.org/keywords/document-clustering","display_name":"Document clustering","score":0.49135151505470276},{"id":"https://openalex.org/keywords/plain-text","display_name":"Plain text","score":0.48002973198890686},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.4371874928474426},{"id":"https://openalex.org/keywords/information-extraction","display_name":"Information extraction","score":0.4328058362007141}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8571404218673706},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.7594459056854248},{"id":"https://openalex.org/C73555534","wikidata":"https://www.wikidata.org/wiki/Q622825","display_name":"Cluster analysis","level":2,"score":0.6795841455459595},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6615790128707886},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.5359376668930054},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.520749568939209},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.503924548625946},{"id":"https://openalex.org/C177937566","wikidata":"https://www.wikidata.org/wiki/Q4223102","display_name":"Document clustering","level":3,"score":0.49135151505470276},{"id":"https://openalex.org/C46503548","wikidata":"https://www.wikidata.org/wiki/Q1145976","display_name":"Plain text","level":3,"score":0.48002973198890686},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.4371874928474426},{"id":"https://openalex.org/C195807954","wikidata":"https://www.wikidata.org/wiki/Q1662562","display_name":"Information extraction","level":2,"score":0.4328058362007141},{"id":"https://openalex.org/C187736073","wikidata":"https://www.wikidata.org/wiki/Q2920921","display_name":"Management","level":1,"score":0.0},{"id":"https://openalex.org/C162324750","wikidata":"https://www.wikidata.org/wiki/Q8134","display_name":"Economics","level":0,"score":0.0},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0},{"id":"https://openalex.org/C148730421","wikidata":"https://www.wikidata.org/wiki/Q141090","display_name":"Encryption","level":2,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.18653/v1/w17-4416","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/w17-4416","pdf_url":"https://www.aclweb.org/anthology/W17-4416.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 3rd Workshop on Noisy User-generated Text","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.18653/v1/w17-4416","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/w17-4416","pdf_url":"https://www.aclweb.org/anthology/W17-4416.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 3rd Workshop on Noisy User-generated Text","raw_type":"proceedings-article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","score":0.6499999761581421,"display_name":"Quality Education"}],"awards":[{"id":"https://openalex.org/G2855679579","display_name":null,"funder_award_id":"1217281","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"},{"id":"https://openalex.org/G5525219755","display_name":null,"funder_award_id":"IIS-1217281","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"}],"funders":[{"id":"https://openalex.org/F4320306076","display_name":"National Science Foundation","ror":"https://ror.org/021nxhr62"}],"has_content":{"pdf":true,"grobid_xml":true},"content_urls":{"pdf":"https://content.openalex.org/works/W2760736794.pdf","grobid_xml":"https://content.openalex.org/works/W2760736794.grobid-xml"},"referenced_works_count":26,"referenced_works":["https://openalex.org/W1569657508","https://openalex.org/W1593320022","https://openalex.org/W1614298861","https://openalex.org/W1691409177","https://openalex.org/W2011296149","https://openalex.org/W2034103639","https://openalex.org/W2034797903","https://openalex.org/W2037121285","https://openalex.org/W2047334181","https://openalex.org/W2092772700","https://openalex.org/W2102524069","https://openalex.org/W2105549195","https://openalex.org/W2107092590","https://openalex.org/W2112692009","https://openalex.org/W2131257962","https://openalex.org/W2131744502","https://openalex.org/W2132083030","https://openalex.org/W2134164043","https://openalex.org/W2135164809","https://openalex.org/W2142635246","https://openalex.org/W2153635508","https://openalex.org/W2577020101","https://openalex.org/W2949547296","https://openalex.org/W3120421331","https://openalex.org/W4244046749","https://openalex.org/W4379510236"],"related_works":["https://openalex.org/W2329000834","https://openalex.org/W2165504147","https://openalex.org/W1990527953","https://openalex.org/W2030910246","https://openalex.org/W3093218477","https://openalex.org/W4287775364","https://openalex.org/W2529247374","https://openalex.org/W3029858749","https://openalex.org/W13099415","https://openalex.org/W2140970666"],"abstract_inverted_index":{"Technical":[0],"documents":[1],"contain":[2],"a":[3,66,80,115],"fair":[4],"amount":[5],"of":[6,22,33,44],"unnatural":[7,35,45,71,97,137],"language,":[8,39],"such":[9,51],"as":[10,52,59,123,125],"tables,":[11],"formulas,":[12],"and":[13,40,64,87,94,153],"pseudo-code.":[14],"Unnatural":[15],"language":[16,36,46,72,98,138],"can":[17,119],"be":[18],"an":[19,30,60,141],"important":[20],"factor":[21],"confusing":[23],"existing":[24],"NLP":[25,49],"tools.":[26],"This":[27],"paper":[28],"presents":[29],"effective":[31],"method":[32],"distinguishing":[34],"from":[37,110],"natural":[38],"evaluates":[41],"the":[42],"impact":[43],"detection":[47],"on":[48],"tasks":[50],"document":[53,145],"clustering.":[54],"We":[55,105],"view":[56],"this":[57],"problem":[58],"information":[61],"extraction":[62],"task":[63],"build":[65,114],"multiclass":[67],"classification":[68],"model":[69,117],"identifying":[70],"components":[73,99,139],"into":[74,102,129],"four":[75,103],"categories.":[76,104],"First,":[77],"we":[78],"create":[79],"new":[81],"annotated":[82,101],"corpus":[83,152],"by":[84,147],"collecting":[85],"slides":[86],"papers":[88],"in":[89,144],"various":[90],"formats,":[91],"PPT,":[92],"PDF,":[93],"HTML,":[95],"where":[96],"are":[100,155],"then":[106],"explore":[107],"features":[108],"available":[109],"plain":[111,130],"text":[112],"to":[113,149],"statistical":[116],"that":[118,135],"handle":[120],"any":[121],"format":[122],"long":[124],"it":[126],"is":[127],"converted":[128],"text.":[131],"Our":[132,151],"experiments":[133],"show":[134],"removing":[136],"gives":[140],"absolute":[142],"improvement":[143],"clustering":[146],"up":[148],"15%.":[150],"tool":[154],"publicly":[156],"available.":[157]},"counts_by_year":[{"year":2022,"cited_by_count":1},{"year":2021,"cited_by_count":1},{"year":2020,"cited_by_count":1}],"updated_date":"2026-04-10T15:06:20.359241","created_date":"2025-10-10T00:00:00"}
