{"id":"https://openalex.org/W7131875087","doi":"https://doi.org/10.48550/arxiv.2602.22958","title":"Frequency-Ordered Tokenization for Better Text Compression","display_name":"Frequency-Ordered Tokenization for Better Text Compression","publication_year":2026,"publication_date":"2026-02-26","ids":{"openalex":"https://openalex.org/W7131875087","doi":"https://doi.org/10.48550/arxiv.2602.22958"},"language":null,"primary_location":{"id":"pmh:doi:10.48550/arxiv.2602.22958","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":null,"any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5093713855","display_name":"Maximilian Kalcher","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Kalcher, Maximilian","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":1,"corresponding_author_ids":["https://openalex.org/A5093713855"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11269","display_name":"Algorithms and Data Compression","score":0.9916999936103821,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11269","display_name":"Algorithms and Data Compression","score":0.9916999936103821,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11567","display_name":"semigroups and automata theory","score":0.0010000000474974513,"subfield":{"id":"https://openalex.org/subfields/1703","display_name":"Computational Theory and Mathematics"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.0008999999845400453,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/lossless-compression","display_name":"Lossless compression","score":0.7555999755859375},{"id":"https://openalex.org/keywords/preprocessor","display_name":"Preprocessor","score":0.6486999988555908},{"id":"https://openalex.org/keywords/data-compression","display_name":"Data compression","score":0.5756000280380249},{"id":"https://openalex.org/keywords/lexical-analysis","display_name":"Lexical analysis","score":0.5374000072479248},{"id":"https://openalex.org/keywords/vocabulary","display_name":"Vocabulary","score":0.5055000185966492},{"id":"https://openalex.org/keywords/word","display_name":"Word (group theory)","score":0.4961000084877014},{"id":"https://openalex.org/keywords/byte","display_name":"Byte","score":0.4876999855041504},{"id":"https://openalex.org/keywords/encoding","display_name":"Encoding (memory)","score":0.4869000017642975},{"id":"https://openalex.org/keywords/compression","display_name":"Compression (physics)","score":0.4706999957561493},{"id":"https://openalex.org/keywords/natural-language","display_name":"Natural language","score":0.4440000057220459}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7634000182151794},{"id":"https://openalex.org/C81081738","wikidata":"https://www.wikidata.org/wiki/Q55542","display_name":"Lossless compression","level":3,"score":0.7555999755859375},{"id":"https://openalex.org/C34736171","wikidata":"https://www.wikidata.org/wiki/Q918333","display_name":"Preprocessor","level":2,"score":0.6486999988555908},{"id":"https://openalex.org/C78548338","wikidata":"https://www.wikidata.org/wiki/Q2493","display_name":"Data compression","level":2,"score":0.5756000280380249},{"id":"https://openalex.org/C176982825","wikidata":"https://www.wikidata.org/wiki/Q835922","display_name":"Lexical analysis","level":2,"score":0.5374000072479248},{"id":"https://openalex.org/C2777601683","wikidata":"https://www.wikidata.org/wiki/Q6499736","display_name":"Vocabulary","level":2,"score":0.5055000185966492},{"id":"https://openalex.org/C90805587","wikidata":"https://www.wikidata.org/wiki/Q10944557","display_name":"Word (group theory)","level":2,"score":0.4961000084877014},{"id":"https://openalex.org/C43364308","wikidata":"https://www.wikidata.org/wiki/Q8799","display_name":"Byte","level":2,"score":0.4876999855041504},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4871000051498413},{"id":"https://openalex.org/C125411270","wikidata":"https://www.wikidata.org/wiki/Q18653","display_name":"Encoding (memory)","level":2,"score":0.4869000017642975},{"id":"https://openalex.org/C180016635","wikidata":"https://www.wikidata.org/wiki/Q2712821","display_name":"Compression (physics)","level":2,"score":0.4706999957561493},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.4440000057220459},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.42809998989105225},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.41440001130104065},{"id":"https://openalex.org/C141603448","wikidata":"https://www.wikidata.org/wiki/Q134830","display_name":"Prefix","level":2,"score":0.4000000059604645},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.3930000066757202},{"id":"https://openalex.org/C182407805","wikidata":"https://www.wikidata.org/wiki/Q2626534","display_name":"Substring","level":3,"score":0.359499990940094},{"id":"https://openalex.org/C2780586882","wikidata":"https://www.wikidata.org/wiki/Q7520643","display_name":"Simple (philosophy)","level":2,"score":0.33329999446868896},{"id":"https://openalex.org/C97137487","wikidata":"https://www.wikidata.org/wiki/Q729138","display_name":"Integer (computer science)","level":2,"score":0.32510000467300415},{"id":"https://openalex.org/C46900642","wikidata":"https://www.wikidata.org/wiki/Q2647","display_name":"Huffman coding","level":3,"score":0.3237000107765198},{"id":"https://openalex.org/C198352243","wikidata":"https://www.wikidata.org/wiki/Q37105","display_name":"Line (geometry)","level":2,"score":0.314300000667572},{"id":"https://openalex.org/C2778755073","wikidata":"https://www.wikidata.org/wiki/Q10858537","display_name":"Scale (ratio)","level":2,"score":0.3122999966144562},{"id":"https://openalex.org/C25797200","wikidata":"https://www.wikidata.org/wiki/Q828137","display_name":"Compression ratio","level":3,"score":0.29280000925064087},{"id":"https://openalex.org/C149441793","wikidata":"https://www.wikidata.org/wiki/Q200726","display_name":"Probability distribution","level":2,"score":0.29120001196861267},{"id":"https://openalex.org/C94375191","wikidata":"https://www.wikidata.org/wiki/Q11205","display_name":"Arithmetic","level":1,"score":0.28690001368522644},{"id":"https://openalex.org/C94835093","wikidata":"https://www.wikidata.org/wiki/Q3113333","display_name":"Data compression ratio","level":5,"score":0.2858999967575073},{"id":"https://openalex.org/C98501671","wikidata":"https://www.wikidata.org/wiki/Q1948408","display_name":"Text segmentation","level":3,"score":0.2847000062465668},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.2759999930858612},{"id":"https://openalex.org/C175293574","wikidata":"https://www.wikidata.org/wiki/Q697133","display_name":"Word lists by frequency","level":3,"score":0.26809999346733093},{"id":"https://openalex.org/C112313634","wikidata":"https://www.wikidata.org/wiki/Q7886648","display_name":"Complement (music)","level":5,"score":0.2614000141620636},{"id":"https://openalex.org/C140779682","wikidata":"https://www.wikidata.org/wiki/Q210868","display_name":"Sampling (signal processing)","level":3,"score":0.26089999079704285},{"id":"https://openalex.org/C2780861071","wikidata":"https://www.wikidata.org/wiki/Q1062934","display_name":"Character (mathematics)","level":2,"score":0.26089999079704285},{"id":"https://openalex.org/C45235069","wikidata":"https://www.wikidata.org/wiki/Q278425","display_name":"Table (database)","level":2,"score":0.25540000200271606}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:doi:10.48550/arxiv.2602.22958","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},{"id":"doi:10.48550/arxiv.2602.22958","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2602.22958","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:doi:10.48550/arxiv.2602.22958","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education","score":0.5113201141357422}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"We":[0,107],"present":[1],"frequency-ordered":[2],"tokenization,":[3],"a":[4],"simple":[5],"preprocessing":[6,111,123],"technique":[7],"that":[8,38,110],"improves":[9],"lossless":[10],"text":[11,28],"compression":[12,113],"by":[13],"exploiting":[14],"the":[15,35,47,88,118,137],"power-law":[16],"frequency":[17],"distribution":[18],"of":[19,67,152],"natural":[20],"language":[21],"tokens":[22,40],"(Zipf's":[23],"law).":[24],"The":[25,143],"method":[26,144],"tokenizes":[27],"with":[29,49],"Byte":[30],"Pair":[31],"Encoding":[32],"(BPE),":[33],"reorders":[34],"vocabulary":[36,85],"so":[37],"frequent":[39],"receive":[41],"small":[42],"integer":[43],"identifiers,":[44],"and":[45,78,101,104,130],"encodes":[46],"result":[48],"variable-length":[50],"integers":[51],"before":[52],"passing":[53],"it":[54],"to":[55],"any":[56],"standard":[57],"compressor.":[58],"On":[59],"enwik8":[60],"(100":[61],"MB":[62],"Wikipedia),":[63],"this":[64],"yields":[65],"improvements":[66],"7.08":[68],"percentage":[69],"points":[70],"(pp)":[71],"for":[72,76,81,114],"zlib,":[73],"1.69":[74],"pp":[75,80],"LZMA,":[77,135],"0.76":[79],"zstd":[82],"(all":[83],"including":[84,122],"overhead),":[86],"outperforming":[87],"classical":[89],"Word":[90],"Replacing":[91],"Transform.":[92],"Gains":[93],"are":[94],"consistent":[95],"at":[96],"1":[97],"GB":[98],"scale":[99],"(enwik9)":[100],"across":[102],"Chinese":[103],"Arabic":[105],"text.":[106],"further":[108],"show":[109],"accelerates":[112],"computationally":[115],"expensive":[116],"algorithms:":[117],"total":[119],"wall-clock":[120],"time":[121],"is":[124,140],"3.1x":[125],"faster":[126,132],"than":[127,133],"raw":[128,134],"zstd-22":[129],"2.4x":[131],"because":[136],"preprocessed":[138],"input":[139],"substantially":[141],"smaller.":[142],"can":[145],"be":[146],"implemented":[147],"in":[148],"under":[149],"50":[150],"lines":[151],"code.":[153]},"counts_by_year":[],"updated_date":"2026-04-04T16:13:02.066488","created_date":"2026-02-28T00:00:00"}
