{"id":"https://openalex.org/W1980160594","doi":"https://doi.org/10.1145/1451940.1451972","title":"Improved count suffix trees for natural language data","display_name":"Improved count suffix trees for natural language data","publication_year":2008,"publication_date":"2008-01-01","ids":{"openalex":"https://openalex.org/W1980160594","doi":"https://doi.org/10.1145/1451940.1451972","mag":"1980160594"},"language":"en","primary_location":{"id":"doi:10.1145/1451940.1451972","is_oa":false,"landing_page_url":"https://doi.org/10.1145/1451940.1451972","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2008 international symposium on Database engineering &amp; applications - IDEAS '08","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5040076607","display_name":"Guido Sautter","orcid":"https://orcid.org/0000-0002-6073-3658"},"institutions":[{"id":"https://openalex.org/I4210119349","display_name":"Karlsruhe University of Education","ror":"https://ror.org/01t1kq612","country_code":"DE","type":"education","lineage":["https://openalex.org/I4210119349"]}],"countries":["DE"],"is_corresponding":true,"raw_author_name":"Guido Sautter","raw_affiliation_strings":["Universit\u00e4t Karlsruhe (TH), Karlsruhe"],"affiliations":[{"raw_affiliation_string":"Universit\u00e4t Karlsruhe (TH), Karlsruhe","institution_ids":["https://openalex.org/I4210119349"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5059655596","display_name":"Cristina Abba","orcid":null},"institutions":[{"id":"https://openalex.org/I4210128445","display_name":"Torino e-district","ror":"https://ror.org/03cxxc369","country_code":"IT","type":"facility","lineage":["https://openalex.org/I4210128445"]}],"countries":["IT"],"is_corresponding":false,"raw_author_name":"Cristina Abba","raw_affiliation_strings":["Corso Francia, Torino"],"affiliations":[{"raw_affiliation_string":"Corso Francia, Torino","institution_ids":["https://openalex.org/I4210128445"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5049768806","display_name":"Klemens B\u00f6hm","orcid":"https://orcid.org/0000-0002-1706-1913"},"institutions":[{"id":"https://openalex.org/I4210119349","display_name":"Karlsruhe University of Education","ror":"https://ror.org/01t1kq612","country_code":"DE","type":"education","lineage":["https://openalex.org/I4210119349"]}],"countries":["DE"],"is_corresponding":false,"raw_author_name":"Klemens B\u00f6hm","raw_affiliation_strings":["Universit\u00e4t Karlsruhe (TH), Karlsruhe"],"affiliations":[{"raw_affiliation_string":"Universit\u00e4t Karlsruhe (TH), Karlsruhe","institution_ids":["https://openalex.org/I4210119349"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5040076607"],"corresponding_institution_ids":["https://openalex.org/I4210119349"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.06371324,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":89,"max":94},"biblio":{"volume":null,"issue":null,"first_page":"231","last_page":"231"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11269","display_name":"Algorithms and Data Compression","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11269","display_name":"Algorithms and Data Compression","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9979000091552734,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12016","display_name":"Web Data Mining and Analysis","score":0.9865000247955322,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7735663652420044},{"id":"https://openalex.org/keywords/suffix","display_name":"Suffix","score":0.7084702253341675},{"id":"https://openalex.org/keywords/natural-language","display_name":"Natural language","score":0.5271227955818176},{"id":"https://openalex.org/keywords/programming-language","display_name":"Programming language","score":0.4553946852684021},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.4097062349319458},{"id":"https://openalex.org/keywords/linguistics","display_name":"Linguistics","score":0.1274460256099701}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7735663652420044},{"id":"https://openalex.org/C2779804580","wikidata":"https://www.wikidata.org/wiki/Q102047","display_name":"Suffix","level":2,"score":0.7084702253341675},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.5271227955818176},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.4553946852684021},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.4097062349319458},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.1274460256099701},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0}],"mesh":[],"locations_count":3,"locations":[{"id":"doi:10.1145/1451940.1451972","is_oa":false,"landing_page_url":"https://doi.org/10.1145/1451940.1451972","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2008 international symposium on Database engineering &amp; applications - IDEAS '08","raw_type":"proceedings-article"},{"id":"pmh:oai:CiteSeerX.psu:10.1.1.569.3789","is_oa":false,"landing_page_url":"http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.569.3789","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"http://dbis.ipd.kit.edu/download/sautter_IDEAS08.pdf","raw_type":"text"},{"id":"pmh:oai:CiteSeerX.psu:10.1.1.741.483","is_oa":false,"landing_page_url":"http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.741.483","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"http://www.cscjournals.org/manuscript/Journals/IJDE/Volume3/Issue1/IJDE-76.pdf","raw_type":"text"}],"best_oa_location":null,"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","score":0.7599999904632568,"display_name":"Quality Education"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":23,"referenced_works":["https://openalex.org/W26591655","https://openalex.org/W73596949","https://openalex.org/W1598265240","https://openalex.org/W1660390307","https://openalex.org/W1974025666","https://openalex.org/W1982889956","https://openalex.org/W1984629602","https://openalex.org/W1986324942","https://openalex.org/W2010595692","https://openalex.org/W2016768890","https://openalex.org/W2058200372","https://openalex.org/W2059513841","https://openalex.org/W2075833184","https://openalex.org/W2094069950","https://openalex.org/W2098162425","https://openalex.org/W2121252285","https://openalex.org/W2123888919","https://openalex.org/W2133363953","https://openalex.org/W2147440220","https://openalex.org/W2167439683","https://openalex.org/W2460754155","https://openalex.org/W2533248932","https://openalex.org/W4285719527"],"related_works":["https://openalex.org/W2748952813","https://openalex.org/W2371678724","https://openalex.org/W4311055779","https://openalex.org/W2761558751","https://openalex.org/W3128574596","https://openalex.org/W2391300236","https://openalex.org/W2981728181","https://openalex.org/W4226226396","https://openalex.org/W3153750606","https://openalex.org/W4308854837"],"abstract_inverted_index":{"With":[0],"more":[1,3,171],"and":[2,59,102,115,165,184,195,237],"natural":[4,130],"language":[5,131,197],"text":[6,218],"stored":[7],"in":[8,71,141,225],"databases,":[9],"handling":[10],"respective":[11],"query":[12,29,37],"predicates":[13,20],"becomes":[14],"very":[15],"important.":[16],"Optimizing":[17],"queries":[18],"with":[19,215],"includes":[21],"(sub)string":[22],"estimation,":[23],"i.e.,":[24,133],"estimating":[25],"the":[26,72,108,116,129,136,178,209,228,239,252,259],"selectivity":[27,255],"of":[28,64,75,110,118,128,180,208,230,254],"terms":[30,172],"based":[31,88],"on":[32,89],"small":[33],"summary":[34],"statistics":[35],"before":[36],"execution.":[38],"Count":[39],"Suffix":[40],"Trees":[41],"(CST)":[42],"are":[43,55,87,139,149],"commonly":[44],"used":[45],"to":[46,57,66,80,125,157,189,235,264],"this":[47,96],"end.":[48],"While":[49],"CST":[50,111,210,232,260],"yield":[51],"good":[52],"estimates,":[53],"they":[54,78],"expensive":[56],"build":[58,240],"require":[60],"a":[61,142,151,162,185,231,244],"large":[62,216],"amount":[63],"memory":[65],"be":[67,81],"stored.":[68],"To":[69],"fit":[70],"data":[73],"dictionary":[74],"database":[76],"systems,":[77],"have":[79],"severely":[82],"pruned.":[83],"Existing":[84],"pruning":[85,103],"techniques":[86,104,183],"suffix":[90],"frequency":[91],"or":[92],"tree":[93],"depth.":[94],"In":[95],"paper,":[97],"we":[98],"propose":[99],"new":[100,152,163,186,223],"filtering":[101],"that":[105,138,169,221],"reduce":[106],"both":[107],"size":[109,229],"over":[112],"natural-language":[113],"texts":[114],"cost":[117],"building":[119],"them.":[120],"The":[121,145],"core":[122],"idea":[123],"is":[124],"exploit":[126],"features":[127],"data,":[132],"regarding":[134],"only":[135],"suffixes":[137],"useful":[140],"linguistic":[143],"sense.":[144],"most":[146],"important":[147],"innovations":[148],"(a)":[150],"aggressive":[153],"approximate":[154],"syllabification":[155],"technique":[156],"filter":[158,190],"out":[159,191],"suffixes,":[160],"(b)":[161],"affix":[164],"prefix":[166],"stripping":[167],"procedure":[168],"conflates":[170],"than":[173],"conventional":[174],"stemming":[175],"techniques,":[176],"(c)":[177],"deployment":[179],"state-of-the-art":[181],"trigram":[182],"syllable-based":[187],"mechanism":[188],"non-words":[192],"(i.e.,":[193],"misspellings":[194],"other":[196],"anomalies":[198],"like":[199],"foreign":[200],"words),":[201],"which":[202],"would":[203],"cause":[204],"an":[205],"over-proportional":[206],"growth":[207],"otherwise.":[211],"--":[212],"Our":[213],"evaluation":[214],"English":[217],"corpora":[219],"shows":[220],"our":[222],"mechanisms":[224],"combination":[226],"decrease":[227],"by":[233,262],"up":[234,263],"80%":[236],"shorten":[238],"phase":[241],"significantly.":[242],"From":[243],"different":[245],"perspective,":[246],"if":[247],"storage":[248],"space":[249],"remains":[250],"unchanged,":[251],"accuracy":[253],"estimates":[256],"computed":[257],"from":[258],"increases":[261],"70%.":[265]},"counts_by_year":[{"year":2021,"cited_by_count":1}],"updated_date":"2026-04-05T17:49:38.594831","created_date":"2025-10-10T00:00:00"}
