{"id":"https://openalex.org/W7129025022","doi":"https://doi.org/10.48550/arxiv.2602.13194","title":"Semantic Chunking and the Entropy of Natural Language","display_name":"Semantic Chunking and the Entropy of Natural Language","publication_year":2026,"publication_date":"2026-02-13","ids":{"openalex":"https://openalex.org/W7129025022","doi":"https://doi.org/10.48550/arxiv.2602.13194"},"language":null,"primary_location":{"id":"pmh:doi:10.48550/arxiv.2602.13194","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":null,"any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5126151057","display_name":"Weishun Zhong","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Zhong, Weishun","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5026920231","display_name":"Doron Sivan","orcid":"https://orcid.org/0000-0002-1422-5505"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sivan, Doron","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5027344782","display_name":"Tankut Can","orcid":"https://orcid.org/0000-0002-0999-2355"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Can, Tankut","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Katkov, Mikhail","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Katkov, Mikhail","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5126073192","display_name":"Misha Tsodyks","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tsodyks, Misha","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5126151057"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12380","display_name":"Authorship Attribution and Profiling","score":0.40700000524520874,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12380","display_name":"Authorship Attribution and Profiling","score":0.40700000524520874,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12090","display_name":"Language and cultural evolution","score":0.20550000667572021,"subfield":{"id":"https://openalex.org/subfields/3316","display_name":"Cultural Studies"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.1404999941587448,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/natural-language","display_name":"Natural language","score":0.5828999876976013},{"id":"https://openalex.org/keywords/entropy","display_name":"Entropy (arrow of time)","score":0.5784000158309937},{"id":"https://openalex.org/keywords/entropy-rate","display_name":"Entropy rate","score":0.5508000254631042},{"id":"https://openalex.org/keywords/redundancy","display_name":"Redundancy (engineering)","score":0.47119998931884766},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.45829999446868896},{"id":"https://openalex.org/keywords/principle-of-maximum-entropy","display_name":"Principle of maximum entropy","score":0.38260000944137573},{"id":"https://openalex.org/keywords/chunking","display_name":"Chunking (psychology)","score":0.3594000041484833}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.656499981880188},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.5828999876976013},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.5795999765396118},{"id":"https://openalex.org/C106301342","wikidata":"https://www.wikidata.org/wiki/Q4117933","display_name":"Entropy (arrow of time)","level":2,"score":0.5784000158309937},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5641999840736389},{"id":"https://openalex.org/C125252325","wikidata":"https://www.wikidata.org/wiki/Q1345213","display_name":"Entropy rate","level":4,"score":0.5508000254631042},{"id":"https://openalex.org/C152124472","wikidata":"https://www.wikidata.org/wiki/Q1204361","display_name":"Redundancy (engineering)","level":2,"score":0.47119998931884766},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.45829999446868896},{"id":"https://openalex.org/C9679016","wikidata":"https://www.wikidata.org/wiki/Q1417473","display_name":"Principle of maximum entropy","level":2,"score":0.38260000944137573},{"id":"https://openalex.org/C203357204","wikidata":"https://www.wikidata.org/wiki/Q1089605","display_name":"Chunking (psychology)","level":2,"score":0.3594000041484833},{"id":"https://openalex.org/C94922259","wikidata":"https://www.wikidata.org/wiki/Q33215","display_name":"Constructed language","level":2,"score":0.34380000829696655},{"id":"https://openalex.org/C114289077","wikidata":"https://www.wikidata.org/wiki/Q3284399","display_name":"Statistical model","level":2,"score":0.33239999413490295},{"id":"https://openalex.org/C52622258","wikidata":"https://www.wikidata.org/wiki/Q131222","display_name":"Information theory","level":2,"score":0.3206999897956848},{"id":"https://openalex.org/C2779439875","wikidata":"https://www.wikidata.org/wiki/Q1078276","display_name":"Natural language understanding","level":3,"score":0.3125999867916107},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.30239999294281006},{"id":"https://openalex.org/C171752962","wikidata":"https://www.wikidata.org/wiki/Q255166","display_name":"Kullback\u2013Leibler divergence","level":2,"score":0.28369998931884766},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.2639000117778778},{"id":"https://openalex.org/C202708506","wikidata":"https://www.wikidata.org/wiki/Q7449050","display_name":"Semantic compression","level":5,"score":0.2599000036716461}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:doi:10.48550/arxiv.2602.13194","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},{"id":"doi:10.48550/arxiv.2602.13194","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2602.13194","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:doi:10.48550/arxiv.2602.13194","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"sustainable_development_goals":[{"score":0.7487589716911316,"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"The":[0,92,133],"entropy":[1,29,134,144,156],"rate":[2,30,135,145,157],"of":[3,63,70,79,95,123,129,146,158,172],"printed":[4,147],"English":[5,33],"is":[6,161],"famously":[7],"estimated":[8,143],"to":[9,40,57,88],"be":[10,100],"about":[11],"one":[12],"bit":[13],"per":[14,44],"character,":[15],"a":[16,52,67,77],"benchmark":[17],"that":[18,32,55,116,154],"modern":[19,110],"large":[20],"language":[21,160],"models":[22],"(LLMs)":[23],"have":[24],"only":[25,179],"recently":[26],"approached.":[27],"This":[28],"implies":[31],"contains":[34],"nearly":[35],"80":[36],"percent":[37],"redundancy":[38,72],"relative":[39],"the":[41,59,89,96,121,130,142,155,169,178],"five":[42],"bits":[43],"character":[45],"expected":[46],"for":[47,104],"random":[48],"text.":[49],"We":[50],"introduce":[51],"statistical":[53],"model":[54,75,118,139],"attempts":[56],"capture":[58],"intricate":[60],"multi-scale":[61],"structure":[62,94,122],"natural":[64,159],"language,":[65],"providing":[66],"first-principles":[68],"account":[69],"this":[71],"level.":[73,91],"Our":[74],"describes":[76],"procedure":[78],"self-similarly":[80],"segmenting":[81],"text":[82,97],"into":[83],"semantically":[84],"coherent":[85],"chunks":[86],"down":[87],"single-word":[90],"semantic":[93,131,170],"can":[98],"then":[99],"hierarchically":[101],"decomposed,":[102],"allowing":[103],"analytical":[105],"treatment.":[106],"Numerical":[107],"experiments":[108],"with":[109,141,168],"LLMs":[111],"and":[112],"open":[113],"datasets":[114],"suggest":[115],"our":[117,138,150,183],"quantitatively":[119],"captures":[120],"real":[124],"texts":[125],"at":[126],"different":[127],"levels":[128],"hierarchy.":[132],"predicted":[136],"by":[137,177],"agrees":[140],"English.":[148],"Moreover,":[149],"theory":[151],"further":[152],"reveals":[153],"not":[162],"fixed":[163],"but":[164],"should":[165],"increase":[166],"systematically":[167],"complexity":[171],"corpora,":[173],"which":[174],"are":[175],"captured":[176],"free":[180],"parameter":[181],"in":[182],"model.":[184]},"counts_by_year":[],"updated_date":"2026-04-04T16:13:02.066488","created_date":"2026-02-17T00:00:00"}
