{"id":"https://openalex.org/W2000577984","doi":"https://doi.org/10.1145/1075389.1075392","title":"Web-based models for natural language processing","display_name":"Web-based models for natural language processing","publication_year":2005,"publication_date":"2005-02-01","ids":{"openalex":"https://openalex.org/W2000577984","doi":"https://doi.org/10.1145/1075389.1075392","mag":"2000577984"},"language":"en","primary_location":{"id":"doi:10.1145/1075389.1075392","is_oa":false,"landing_page_url":"https://doi.org/10.1145/1075389.1075392","pdf_url":null,"source":{"id":"https://openalex.org/S200945739","display_name":"ACM Transactions on Speech and Language Processing","issn_l":"1550-4875","issn":["1550-4875","1550-4883"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ACM Transactions on Speech and Language Processing","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5041024491","display_name":"Mirella Lapata","orcid":"https://orcid.org/0000-0002-2107-1516"},"institutions":[{"id":"https://openalex.org/I98677209","display_name":"University of Edinburgh","ror":"https://ror.org/01nrxwf90","country_code":"GB","type":"education","lineage":["https://openalex.org/I98677209"]}],"countries":["GB"],"is_corresponding":true,"raw_author_name":"Mirella Lapata","raw_affiliation_strings":["University of Edinburgh, Edinburgh, UK","University of Edinburgh, Edinburgh , UK"],"affiliations":[{"raw_affiliation_string":"University of Edinburgh, Edinburgh, UK","institution_ids":["https://openalex.org/I98677209"]},{"raw_affiliation_string":"University of Edinburgh, Edinburgh , UK","institution_ids":["https://openalex.org/I98677209"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5054936589","display_name":"Frank Keller","orcid":"https://orcid.org/0000-0002-8242-4362"},"institutions":[{"id":"https://openalex.org/I98677209","display_name":"University of Edinburgh","ror":"https://ror.org/01nrxwf90","country_code":"GB","type":"education","lineage":["https://openalex.org/I98677209"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Frank Keller","raw_affiliation_strings":["University of Edinburgh, Edinburgh, UK","University of Edinburgh, Edinburgh , UK"],"affiliations":[{"raw_affiliation_string":"University of Edinburgh, Edinburgh, UK","institution_ids":["https://openalex.org/I98677209"]},{"raw_affiliation_string":"University of Edinburgh, Edinburgh , UK","institution_ids":["https://openalex.org/I98677209"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":2,"corresponding_author_ids":["https://openalex.org/A5041024491"],"corresponding_institution_ids":["https://openalex.org/I98677209"],"apc_list":null,"apc_paid":null,"fwci":19.1812,"has_fulltext":false,"cited_by_count":170,"citation_normalized_percentile":{"value":0.99291622,"is_in_top_1_percent":true,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":89,"max":100},"biblio":{"volume":"2","issue":"1","first_page":"3","last_page":"3"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13629","display_name":"Text Readability and Simplification","score":0.9975000023841858,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8770323991775513},{"id":"https://openalex.org/keywords/bigram","display_name":"Bigram","score":0.8467056751251221},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.6549339294433594},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.5979337096214294},{"id":"https://openalex.org/keywords/semantics","display_name":"Semantics (computer science)","score":0.5104769468307495},{"id":"https://openalex.org/keywords/syntax","display_name":"Syntax","score":0.49994730949401855},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.45364922285079956},{"id":"https://openalex.org/keywords/variety","display_name":"Variety (cybernetics)","score":0.4476197063922882},{"id":"https://openalex.org/keywords/web-application","display_name":"Web application","score":0.4332905411720276},{"id":"https://openalex.org/keywords/machine-learning","display_name":"Machine learning","score":0.3819189965724945},{"id":"https://openalex.org/keywords/world-wide-web","display_name":"World Wide Web","score":0.2111593782901764},{"id":"https://openalex.org/keywords/trigram","display_name":"Trigram","score":0.17532536387443542},{"id":"https://openalex.org/keywords/programming-language","display_name":"Programming language","score":0.09841659665107727}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8770323991775513},{"id":"https://openalex.org/C108757681","wikidata":"https://www.wikidata.org/wiki/Q2773912","display_name":"Bigram","level":3,"score":0.8467056751251221},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6549339294433594},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.5979337096214294},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.5104769468307495},{"id":"https://openalex.org/C60048249","wikidata":"https://www.wikidata.org/wiki/Q37437","display_name":"Syntax","level":2,"score":0.49994730949401855},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.45364922285079956},{"id":"https://openalex.org/C136197465","wikidata":"https://www.wikidata.org/wiki/Q1729295","display_name":"Variety (cybernetics)","level":2,"score":0.4476197063922882},{"id":"https://openalex.org/C118643609","wikidata":"https://www.wikidata.org/wiki/Q189210","display_name":"Web application","level":2,"score":0.4332905411720276},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3819189965724945},{"id":"https://openalex.org/C136764020","wikidata":"https://www.wikidata.org/wiki/Q466","display_name":"World Wide Web","level":1,"score":0.2111593782901764},{"id":"https://openalex.org/C137546455","wikidata":"https://www.wikidata.org/wiki/Q3213474","display_name":"Trigram","level":2,"score":0.17532536387443542},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.09841659665107727}],"mesh":[],"locations_count":3,"locations":[{"id":"doi:10.1145/1075389.1075392","is_oa":false,"landing_page_url":"https://doi.org/10.1145/1075389.1075392","pdf_url":null,"source":{"id":"https://openalex.org/S200945739","display_name":"ACM Transactions on Speech and Language Processing","issn_l":"1550-4875","issn":["1550-4875","1550-4883"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ACM Transactions on Speech and Language Processing","raw_type":"journal-article"},{"id":"pmh:oai:CiteSeerX.psu:10.1.1.106.5095","is_oa":false,"landing_page_url":"http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.106.5095","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"http://www.mti.ugm.ac.id/~adji/courses/resources/doctor/ACM/WebModelNLP05.pdf","raw_type":"text"},{"id":"pmh:oai:CiteSeerX.psu:10.1.1.107.4018","is_oa":false,"landing_page_url":"http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.107.4018","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"http://homepages.inf.ed.ac.uk/keller/papers/tslp05.pdf","raw_type":"text"}],"best_oa_location":null,"sustainable_development_goals":[{"score":0.8100000023841858,"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":67,"referenced_works":["https://openalex.org/W77146693","https://openalex.org/W97622340","https://openalex.org/W99075841","https://openalex.org/W198220271","https://openalex.org/W1516391399","https://openalex.org/W1517749756","https://openalex.org/W1519443010","https://openalex.org/W1540931780","https://openalex.org/W1550400585","https://openalex.org/W1578564752","https://openalex.org/W1581253957","https://openalex.org/W1591380337","https://openalex.org/W1601728146","https://openalex.org/W1632114991","https://openalex.org/W1648417313","https://openalex.org/W1733633583","https://openalex.org/W1859173823","https://openalex.org/W1949782377","https://openalex.org/W1976303135","https://openalex.org/W1988842251","https://openalex.org/W2008172256","https://openalex.org/W2014516359","https://openalex.org/W2015042937","https://openalex.org/W2021307530","https://openalex.org/W2029061393","https://openalex.org/W2047295649","https://openalex.org/W2061023833","https://openalex.org/W2064070185","https://openalex.org/W2068017609","https://openalex.org/W2072481766","https://openalex.org/W2076002267","https://openalex.org/W2083783479","https://openalex.org/W2094971681","https://openalex.org/W2099685860","https://openalex.org/W2102078257","https://openalex.org/W2102967855","https://openalex.org/W2104232304","https://openalex.org/W2106022904","https://openalex.org/W2108239140","https://openalex.org/W2112642950","https://openalex.org/W2115983295","https://openalex.org/W2116225596","https://openalex.org/W2118996379","https://openalex.org/W2119633152","https://openalex.org/W2119966617","https://openalex.org/W2121357917","https://openalex.org/W2124416056","https://openalex.org/W2128237409","https://openalex.org/W2134517267","https://openalex.org/W2135375068","https://openalex.org/W2137076533","https://openalex.org/W2157963512","https://openalex.org/W2159086733","https://openalex.org/W2295081780","https://openalex.org/W2343954916","https://openalex.org/W2620138912","https://openalex.org/W2620949368","https://openalex.org/W2740887992","https://openalex.org/W2912172494","https://openalex.org/W2913539360","https://openalex.org/W2913668833","https://openalex.org/W2915027006","https://openalex.org/W2952886287","https://openalex.org/W3020840689","https://openalex.org/W4285719527","https://openalex.org/W6638899456","https://openalex.org/W6644274597"],"related_works":["https://openalex.org/W1700330385","https://openalex.org/W2105076537","https://openalex.org/W2041167939","https://openalex.org/W2002221802","https://openalex.org/W2250909759","https://openalex.org/W2562995433","https://openalex.org/W2131111393","https://openalex.org/W1500873938","https://openalex.org/W2020757772","https://openalex.org/W1515897616"],"abstract_inverted_index":{"Previous":[0],"work":[1],"demonstrated":[2],"that":[3,14,97,132,156],"Web":[4,111,134],"counts":[5,106,135],"can":[6,122],"be":[7,18,123,161],"used":[8,162],"to":[9,145],"approximate":[10],"bigram":[11],"counts,":[12],"suggesting":[13],"Web-based":[15,58,141,157],"frequencies":[16],"should":[17,159],"useful":[19],"for":[20,60],"a":[21,32,74,115,164],"wide":[22],"variety":[23],"of":[24,35,57,77,82,92],"Natural":[25],"Language":[26],"Processing":[27],"(NLP)":[28],"tasks.":[29],"However,":[30,139],"only":[31],"limited":[33],"number":[34],"tasks":[36],"have":[37,85],"so":[38],"far":[39],"been":[40,86],"tested":[41],"using":[42,127],"Web-scale":[43],"data":[44],"sets.":[45],"The":[46],"present":[47],"article":[48],"overcomes":[49],"this":[50],"limitation":[51],"by":[52,126],"systematically":[53],"investigating":[54],"the":[55,90,110],"performance":[56,121],"models":[59,100,142,149,158],"several":[61],"NLP":[62],"tasks,":[63,94],"covering":[64],"both":[65,69],"syntax":[66],"and":[67,71,73,80,136],"semantics,":[68],"generation":[70],"analysis,":[72],"wider":[75],"range":[76],"n":[78,104],"-grams":[79],"parts":[81],"speech":[83],"than":[84,113,168],"previously":[87],"explored.":[88],"For":[89],"majority":[91],"our":[93],"we":[95],"find":[96],"simple,":[98],"unsupervised":[99,140],"perform":[101],"better":[102],"when":[103],"-gram":[105],"are":[107],"obtained":[108],"from":[109,114],"rather":[112,167],"large":[116],"corpus.":[117],"In":[118],"some":[119],"cases,":[120],"improved":[124],"further":[125],"backoff":[128],"or":[129],"interpolation":[130],"techniques":[131],"combine":[133],"corpus":[137],"counts.":[138],"generally":[143],"fail":[144],"outperform":[146],"supervised":[147,173],"state-of-the-art":[148],"trained":[150],"on":[151],"smaller":[152],"corpora.":[153],"We":[154],"argue":[155],"therefore":[160],"as":[163],"baseline":[165],"for,":[166],"an":[169],"alternative":[170],"to,":[171],"standard":[172],"models.":[174]},"counts_by_year":[{"year":2025,"cited_by_count":1},{"year":2024,"cited_by_count":3},{"year":2023,"cited_by_count":2},{"year":2022,"cited_by_count":4},{"year":2021,"cited_by_count":1},{"year":2020,"cited_by_count":4},{"year":2019,"cited_by_count":2},{"year":2018,"cited_by_count":3},{"year":2017,"cited_by_count":5},{"year":2016,"cited_by_count":2},{"year":2015,"cited_by_count":8},{"year":2014,"cited_by_count":3},{"year":2013,"cited_by_count":11},{"year":2012,"cited_by_count":18}],"updated_date":"2026-04-04T16:13:02.066488","created_date":"2025-10-10T00:00:00"}
