{"id":"https://openalex.org/W2126176556","doi":"https://doi.org/10.1093/llc/fqm002","title":"Processing Internet-derived Text--Creating a Corpus of Usenet Messages","display_name":"Processing Internet-derived Text--Creating a Corpus of Usenet Messages","publication_year":2006,"publication_date":"2006-12-08","ids":{"openalex":"https://openalex.org/W2126176556","doi":"https://doi.org/10.1093/llc/fqm002","mag":"2126176556"},"language":"en","primary_location":{"id":"doi:10.1093/llc/fqm002","is_oa":false,"landing_page_url":"https://doi.org/10.1093/llc/fqm002","pdf_url":null,"source":{"id":"https://openalex.org/S84784070","display_name":"Literary and Linguistic Computing","issn_l":"0268-1145","issn":["0268-1145","1477-4615"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310311648","host_organization_name":"Oxford University Press","host_organization_lineage":["https://openalex.org/P4310311648","https://openalex.org/P4310311647"],"host_organization_lineage_names":["Oxford University Press","University of Oxford"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Literary and Linguistic Computing","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5082420701","display_name":"Sabine Hoffmann","orcid":"https://orcid.org/0000-0001-6781-7718"},"institutions":[{"id":"https://openalex.org/I67415387","display_name":"Lancaster University","ror":"https://ror.org/04f2nsd36","country_code":"GB","type":"education","lineage":["https://openalex.org/I67415387"]}],"countries":["GB"],"is_corresponding":true,"raw_author_name":"S. Hoffmann","raw_affiliation_strings":["Department of Linguistics and English Language, Bowland College, Lancaster University"],"affiliations":[{"raw_affiliation_string":"Department of Linguistics and English Language, Bowland College, Lancaster University","institution_ids":["https://openalex.org/I67415387"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":1,"corresponding_author_ids":["https://openalex.org/A5082420701"],"corresponding_institution_ids":["https://openalex.org/I67415387"],"apc_list":null,"apc_paid":null,"fwci":0.5739,"has_fulltext":false,"cited_by_count":19,"citation_normalized_percentile":{"value":0.69592413,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":89,"max":96},"biblio":{"volume":"22","issue":"2","first_page":"151","last_page":"165"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T13155","display_name":"Digital Communication and Language","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1709","display_name":"Human-Computer Interaction"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T13155","display_name":"Digital Communication and Language","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1709","display_name":"Human-Computer Interaction"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12262","display_name":"Hate Speech and Cyberbullying Detection","score":0.9918000102043152,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12380","display_name":"Authorship Attribution and Profiling","score":0.9908999800682068,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/the-internet","display_name":"The Internet","score":0.6792728304862976},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6524444818496704},{"id":"https://openalex.org/keywords/world-wide-web","display_name":"World Wide Web","score":0.5545145273208618},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.4061887562274933},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.38310158252716064}],"concepts":[{"id":"https://openalex.org/C110875604","wikidata":"https://www.wikidata.org/wiki/Q75","display_name":"The Internet","level":2,"score":0.6792728304862976},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6524444818496704},{"id":"https://openalex.org/C136764020","wikidata":"https://www.wikidata.org/wiki/Q466","display_name":"World Wide Web","level":1,"score":0.5545145273208618},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.4061887562274933},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.38310158252716064}],"mesh":[],"locations_count":3,"locations":[{"id":"doi:10.1093/llc/fqm002","is_oa":false,"landing_page_url":"https://doi.org/10.1093/llc/fqm002","pdf_url":null,"source":{"id":"https://openalex.org/S84784070","display_name":"Literary and Linguistic Computing","issn_l":"0268-1145","issn":["0268-1145","1477-4615"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310311648","host_organization_name":"Oxford University Press","host_organization_lineage":["https://openalex.org/P4310311648","https://openalex.org/P4310311647"],"host_organization_lineage_names":["Oxford University Press","University of Oxford"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Literary and Linguistic Computing","raw_type":"journal-article"},{"id":"pmh:oai:CiteSeerX.psu:10.1.1.944.2285","is_oa":false,"landing_page_url":"http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.944.2285","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"http://llc.oxfordjournals.org/content/22/2/151.full.pdf","raw_type":"text"},{"id":"pmh:oai:eprints.lancs.ac.uk:3942","is_oa":false,"landing_page_url":"https://eprints.lancs.ac.uk/id/eprint/3942/","pdf_url":null,"source":{"id":"https://openalex.org/S4306401916","display_name":"Lancaster EPrints (Lancaster University)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I67415387","host_organization_name":"Lancaster University","host_organization_lineage":["https://openalex.org/I67415387"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"Journal Article"}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4","score":0.7799999713897705}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":13,"referenced_works":["https://openalex.org/W47029222","https://openalex.org/W955055741","https://openalex.org/W960878893","https://openalex.org/W1982905731","https://openalex.org/W1998038442","https://openalex.org/W2040302415","https://openalex.org/W2045881275","https://openalex.org/W2074045704","https://openalex.org/W2075737140","https://openalex.org/W2171274829","https://openalex.org/W2477267530","https://openalex.org/W2482447406","https://openalex.org/W3165616014"],"related_works":["https://openalex.org/W2748952813","https://openalex.org/W2384888906","https://openalex.org/W2144190808","https://openalex.org/W2101955803","https://openalex.org/W2376314740","https://openalex.org/W2366644548","https://openalex.org/W2151447942","https://openalex.org/W2357241418","https://openalex.org/W2119214692","https://openalex.org/W2611614995"],"abstract_inverted_index":{"In":[0,70],"recent":[1],"years,":[2],"linguists":[3],"have":[4,140],"become":[5],"increasingly":[6],"interested":[7],"in":[8,47,81,99,135,170],"the":[9,12,60,64,75,121,131,152,166],"language":[10],"of":[11,17,24,59,66,86,95,144,174],"Internet\u2014both":[13],"as":[14,19,21,158],"an":[15,163],"object":[16],"investigation":[18,58,164],"well":[20,145],"a":[22,41,55,83,141,159],"source":[23],"authentic":[25],"data":[26,34,39],"to":[27,49,52,126,130],"complement":[28],"traditional":[29],"electronic":[30],"corpora.":[31],"However,":[32],"Internet-derived":[33],"is":[35,44],"typically":[36],"very":[37],"messy":[38],"and":[40,77,92,151],"conversion":[42],"process":[43],"often":[45],"required":[46,125],"order":[48],"enable":[50],"researchers":[51],"carry":[53],"out":[54],"reliable":[56],"quantitative":[57],"patterns":[61],"observed":[62],"with":[63],"help":[65],"standard":[67],"corpus":[68,85,154],"tools.":[69],"this":[71,171],"article,":[72],"I":[73,115],"discuss":[74],"technical":[76],"methodological":[78],"aspects":[79],"involved":[80],"creating":[82],"large":[84],"asynchronous":[87],"computer-mediated":[88],"communication":[89],"by":[90],"downloading":[91],"post-processing":[93],"hundreds":[94],"thousands":[96],"messages":[97,106],"posted":[98],"twelve":[100],"Usenet":[101],"newsgroups.":[102],"After":[103],"describing":[104],"how":[105],"can":[107,155],"be":[108],"arranged":[109],"into":[110,165],"hierarchically":[111],"structured":[112],"discussion":[113],"threads,":[114],"focus":[116],"at":[117],"some":[118],"length":[119],"on":[120],"strategies":[122,168],"that":[123],"are":[124],"correctly":[127],"assign":[128],"authorship":[129],"different":[132],"textual":[133],"elements":[134],"individual":[136],"messages.":[137],"My":[138],"algorithms":[139],"success":[142],"rate":[143],"over":[146],"90%":[147],"for":[148,162],"most":[149],"newsgroups":[150],"resulting":[153],"thus":[156],"serve":[157],"suitable":[160],"basis":[161],"interactive":[167],"employed":[169],"particular":[172],"type":[173],"written":[175],"communication.":[176]},"counts_by_year":[{"year":2024,"cited_by_count":1},{"year":2022,"cited_by_count":1},{"year":2021,"cited_by_count":1},{"year":2019,"cited_by_count":1},{"year":2018,"cited_by_count":1},{"year":2017,"cited_by_count":1},{"year":2015,"cited_by_count":2},{"year":2014,"cited_by_count":1},{"year":2013,"cited_by_count":2},{"year":2012,"cited_by_count":1}],"updated_date":"2026-04-05T17:49:38.594831","created_date":"2025-10-10T00:00:00"}
