{"id":"https://openalex.org/W2008434289","doi":"https://doi.org/10.1162/089120101300346787","title":"Using Suffix Arrays to Compute Term Frequency and Document Frequency for All Substrings in a Corpus","display_name":"Using Suffix Arrays to Compute Term Frequency and Document Frequency for All Substrings in a Corpus","publication_year":2001,"publication_date":"2001-03-01","ids":{"openalex":"https://openalex.org/W2008434289","doi":"https://doi.org/10.1162/089120101300346787","mag":"2008434289"},"language":"en","primary_location":{"id":"doi:10.1162/089120101300346787","is_oa":true,"landing_page_url":"https://doi.org/10.1162/089120101300346787","pdf_url":"http://www.mitpressjournals.org/doi/pdf/10.1162/089120101300346787","source":{"id":"https://openalex.org/S155526855","display_name":"Computational Linguistics","issn_l":"0891-2017","issn":["0891-2017","1530-9312"],"is_oa":false,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310320244","host_organization_name":"Association for Computational Linguistics","host_organization_lineage":["https://openalex.org/P4310320244"],"host_organization_lineage_names":["Association for Computational Linguistics"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Computational Linguistics","raw_type":"journal-article"},"type":"article","indexed_in":["crossref","doaj"],"open_access":{"is_oa":true,"oa_status":"bronze","oa_url":"http://www.mitpressjournals.org/doi/pdf/10.1162/089120101300346787","any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5112451327","display_name":"Mikio Yamamoto","orcid":null},"institutions":[{"id":"https://openalex.org/I146399215","display_name":"University of Tsukuba","ror":"https://ror.org/02956yf07","country_code":"JP","type":"education","lineage":["https://openalex.org/I146399215"]}],"countries":["JP"],"is_corresponding":true,"raw_author_name":"Mikio Yamamoto","raw_affiliation_strings":["University of Tsukuba, Institute of Information Sciences and Electronics, 1-1-1 Tennodai, Tsukuba 305-8573, Japan"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"University of Tsukuba, Institute of Information Sciences and Electronics, 1-1-1 Tennodai, Tsukuba 305-8573, Japan","institution_ids":["https://openalex.org/I146399215"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5016543371","display_name":"Kenneth Church","orcid":"https://orcid.org/0000-0001-8378-6069"},"institutions":[{"id":"https://openalex.org/I1283103587","display_name":"AT&T (United States)","ror":"https://ror.org/02bbd5539","country_code":"US","type":"company","lineage":["https://openalex.org/I1283103587"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Kenneth W. Church","raw_affiliation_strings":["AT&T Labs\u2014Research, 180 Park Avenue, Florham Park, NJ 07932"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"AT&T Labs\u2014Research, 180 Park Avenue, Florham Park, NJ 07932","institution_ids":["https://openalex.org/I1283103587"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":2,"corresponding_author_ids":["https://openalex.org/A5112451327"],"corresponding_institution_ids":["https://openalex.org/I146399215"],"apc_list":null,"apc_paid":null,"fwci":14.7583,"has_fulltext":true,"cited_by_count":173,"citation_normalized_percentile":{"value":0.9883726,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":90,"max":99},"biblio":{"volume":"27","issue":"1","first_page":"1","last_page":"30"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11269","display_name":"Algorithms and Data Compression","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11269","display_name":"Algorithms and Data Compression","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9972000122070312,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9843000173568726,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/substring","display_name":"Substring","score":0.9306526184082031},{"id":"https://openalex.org/keywords/trigram","display_name":"Trigram","score":0.8141581416130066},{"id":"https://openalex.org/keywords/bigram","display_name":"Bigram","score":0.7291061282157898},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7066342234611511},{"id":"https://openalex.org/keywords/term","display_name":"Term (time)","score":0.6110744476318359},{"id":"https://openalex.org/keywords/suffix","display_name":"Suffix","score":0.5908170342445374},{"id":"https://openalex.org/keywords/tf\u2013idf","display_name":"tf\u2013idf","score":0.5798121690750122},{"id":"https://openalex.org/keywords/n-gram","display_name":"n-gram","score":0.5429664850234985},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.5163965821266174},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.44275155663490295},{"id":"https://openalex.org/keywords/algorithm","display_name":"Algorithm","score":0.3409759998321533},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.3385491371154785},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.2552943229675293},{"id":"https://openalex.org/keywords/linguistics","display_name":"Linguistics","score":0.16035911440849304},{"id":"https://openalex.org/keywords/data-structure","display_name":"Data structure","score":0.15383297204971313}],"concepts":[{"id":"https://openalex.org/C182407805","wikidata":"https://www.wikidata.org/wiki/Q2626534","display_name":"Substring","level":3,"score":0.9306526184082031},{"id":"https://openalex.org/C137546455","wikidata":"https://www.wikidata.org/wiki/Q3213474","display_name":"Trigram","level":2,"score":0.8141581416130066},{"id":"https://openalex.org/C108757681","wikidata":"https://www.wikidata.org/wiki/Q2773912","display_name":"Bigram","level":3,"score":0.7291061282157898},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7066342234611511},{"id":"https://openalex.org/C61797465","wikidata":"https://www.wikidata.org/wiki/Q1188986","display_name":"Term (time)","level":2,"score":0.6110744476318359},{"id":"https://openalex.org/C2779804580","wikidata":"https://www.wikidata.org/wiki/Q102047","display_name":"Suffix","level":2,"score":0.5908170342445374},{"id":"https://openalex.org/C81758059","wikidata":"https://www.wikidata.org/wiki/Q796584","display_name":"tf\u2013idf","level":3,"score":0.5798121690750122},{"id":"https://openalex.org/C117884012","wikidata":"https://www.wikidata.org/wiki/Q94489","display_name":"n-gram","level":3,"score":0.5429664850234985},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.5163965821266174},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.44275155663490295},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.3409759998321533},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.3385491371154785},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.2552943229675293},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.16035911440849304},{"id":"https://openalex.org/C162319229","wikidata":"https://www.wikidata.org/wiki/Q175263","display_name":"Data structure","level":2,"score":0.15383297204971313},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.0}],"mesh":[],"locations_count":3,"locations":[{"id":"doi:10.1162/089120101300346787","is_oa":true,"landing_page_url":"https://doi.org/10.1162/089120101300346787","pdf_url":"http://www.mitpressjournals.org/doi/pdf/10.1162/089120101300346787","source":{"id":"https://openalex.org/S155526855","display_name":"Computational Linguistics","issn_l":"0891-2017","issn":["0891-2017","1530-9312"],"is_oa":false,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310320244","host_organization_name":"Association for Computational Linguistics","host_organization_lineage":["https://openalex.org/P4310320244"],"host_organization_lineage_names":["Association for Computational Linguistics"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Computational Linguistics","raw_type":"journal-article"},{"id":"pmh:oai:CiteSeerX.psu:10.1.1.14.6778","is_oa":false,"landing_page_url":"http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.14.6778","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"http://acl.ldc.upenn.edu/W/W98/W98-1104.pdf","raw_type":"text"},{"id":"pmh:oai:doaj.org/article:5b363e0d624c4fe88c1b2fe5306fe6b2","is_oa":false,"landing_page_url":"https://doaj.org/article/5b363e0d624c4fe88c1b2fe5306fe6b2","pdf_url":null,"source":{"id":"https://openalex.org/S4306401280","display_name":"DOAJ (DOAJ: Directory of Open Access Journals)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"Computational Linguistics, Vol 27, Iss 1 (2021)","raw_type":"article"}],"best_oa_location":{"id":"doi:10.1162/089120101300346787","is_oa":true,"landing_page_url":"https://doi.org/10.1162/089120101300346787","pdf_url":"http://www.mitpressjournals.org/doi/pdf/10.1162/089120101300346787","source":{"id":"https://openalex.org/S155526855","display_name":"Computational Linguistics","issn_l":"0891-2017","issn":["0891-2017","1530-9312"],"is_oa":false,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310320244","host_organization_name":"Association for Computational Linguistics","host_organization_lineage":["https://openalex.org/P4310320244"],"host_organization_lineage_names":["Association for Computational Linguistics"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Computational Linguistics","raw_type":"journal-article"},"sustainable_development_goals":[{"score":0.7200000286102295,"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4"}],"awards":[],"funders":[],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W2008434289.pdf","grobid_xml":"https://content.openalex.org/works/W2008434289.grobid-xml"},"referenced_works_count":19,"referenced_works":["https://openalex.org/W186138571","https://openalex.org/W1508165687","https://openalex.org/W1573714593","https://openalex.org/W1593045043","https://openalex.org/W1986535287","https://openalex.org/W1990061958","https://openalex.org/W1994851566","https://openalex.org/W2015201047","https://openalex.org/W2024490156","https://openalex.org/W2031015851","https://openalex.org/W2059513841","https://openalex.org/W2121252285","https://openalex.org/W2134237567","https://openalex.org/W2144211451","https://openalex.org/W2158874082","https://openalex.org/W2162945593","https://openalex.org/W2996160789","https://openalex.org/W4238346259","https://openalex.org/W4285719527"],"related_works":["https://openalex.org/W2940857995","https://openalex.org/W2250909759","https://openalex.org/W4327499987","https://openalex.org/W2917105722","https://openalex.org/W2164394510","https://openalex.org/W2921680427","https://openalex.org/W4288374102","https://openalex.org/W2463816369","https://openalex.org/W2950765678","https://openalex.org/W7593531"],"abstract_inverted_index":{"Bigrams":[0],"and":[1,25,35,91,102,124,231,250],"trigrams":[2],"are":[3,60,202],"commonly":[4],"used":[5,96],"in":[6,41,55,109,152,258],"statistical":[7],"natural":[8],"language":[9],"processing;":[10],"this":[11,70],"paper":[12,86,140],"will":[13],"describe":[14],"techniques":[15],"for":[16,106,234],"working":[17],"with":[18,154,197,214],"much":[19],"longer":[20],"n-grams.":[21],"Suffix":[22],"arrays":[23],"(Manber":[24],"Myers":[26],"1990)":[27],"were":[28,95],"first":[29],"introduced":[30],"to":[31,79,97,144,190,210,226,239],"compute":[32,49,98],"the":[33,58,89,92,139,160,175,178,206,220],"frequency":[34,100,104,163,185,189,201],"location":[36],"of":[37,45,66,116,120,128,132,138,177,193,247],"a":[38,42,56,63,72,80,125,198,259],"substring":[39],"(n-gram)":[40],"sequence":[43],"(corpus)":[44],"length":[46],"N.":[47],"To":[48],"frequencies":[50,143],"over":[51,75,83,243],"all":[52,107],"N(N+1)/2":[53],"substrings":[54,59,76],"corpus,":[57],"grouped":[61],"into":[62],"manageable":[64,81],"number":[65],"equivalence":[67],"classes.":[68,84],"In":[69],"way,":[71],"prohibitive":[73],"computation":[74,82],"is":[77,164,252],"reduced":[78],"This":[85],"presents":[87],"both":[88,248],"algorithms":[90],"code":[93],"that":[94,174],"term":[99,162,200],"(tf)":[101],"document":[103,184,188],"(df)":[105],"n-grams":[108,153],"two":[110],"large":[111],"corpora,":[112],"an":[113],"English":[114],"corpus":[115,127],"50":[117],"million":[118,130],"words":[119],"Wall":[121],"Street":[122],"Journal":[123],"Japanese":[126,260],"216":[129],"characters":[131],"Mainichi":[133],"Shimbun.":[134],"The":[135,245],"second":[136],"half":[137],"uses":[141],"these":[142],"find":[145],"\u201cinteresting\u201d":[146],"substrings.":[147],"Lexicographers":[148],"have":[149],"been":[150],"interested":[151],"high":[155],"mutual":[156],"information":[157,235],"(MI)":[158],"where":[159,195],"joint":[161],"higher":[165],"than":[166,254],"what":[167],"would":[168],"be":[169],"expected":[170],"by":[171,256],"chance,":[172],"assuming":[173],"parts":[176],"n-gram":[179],"combine":[180],"independently.":[181],"Residual":[182],"inverse":[183],"(RIDF)":[186],"compares":[187],"another":[191],"model":[192],"chance":[194],"terms":[196],"particular":[199],"distributed":[203],"randomly":[204],"throughout":[205],"collection.":[207],"MI":[208,249],"tends":[209,225],"pick":[211],"out":[212],"phrases":[213],"noncompositional":[215],"semantics":[216],"(which":[217,237],"often":[218],"violate":[219],"independence":[221],"assumption)":[222],"whereas":[223],"RIDF":[224,251],"highlight":[227],"technical":[228],"terminology,":[229],"names,":[230],"good":[232],"keywords":[233],"retrieval":[236],"tend":[238],"exhibit":[240],"nonrandom":[241],"distributions":[242],"documents).":[244],"combination":[246],"better":[253],"either":[255],"itself":[257],"word":[261],"extraction":[262],"task.":[263]},"counts_by_year":[{"year":2025,"cited_by_count":4},{"year":2024,"cited_by_count":1},{"year":2022,"cited_by_count":5},{"year":2021,"cited_by_count":4},{"year":2020,"cited_by_count":5},{"year":2019,"cited_by_count":4},{"year":2018,"cited_by_count":2},{"year":2017,"cited_by_count":6},{"year":2016,"cited_by_count":6},{"year":2015,"cited_by_count":7},{"year":2014,"cited_by_count":7},{"year":2013,"cited_by_count":10},{"year":2012,"cited_by_count":5}],"updated_date":"2026-05-21T06:26:12.895304","created_date":"2025-10-10T00:00:00"}
