{"id":"https://openalex.org/W4409872456","doi":"https://doi.org/10.1162/tacl_a_00747","title":"How Much Semantic Information is Available in Large Language Model Tokens?","display_name":"How Much Semantic Information is Available in Large Language Model Tokens?","publication_year":2025,"publication_date":"2025-01-01","ids":{"openalex":"https://openalex.org/W4409872456","doi":"https://doi.org/10.1162/tacl_a_00747"},"language":"en","primary_location":{"id":"doi:10.1162/tacl_a_00747","is_oa":true,"landing_page_url":"https://doi.org/10.1162/tacl_a_00747","pdf_url":"https://direct.mit.edu/tacl/article-pdf/doi/10.1162/tacl_a_00747/2514611/tacl_a_00747.pdf","source":{"id":"https://openalex.org/S2729999759","display_name":"Transactions of the Association for Computational Linguistics","issn_l":"2307-387X","issn":["2307-387X"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310320244","host_organization_name":"Association for Computational Linguistics","host_organization_lineage":["https://openalex.org/P4310320244"],"host_organization_lineage_names":["Association for Computational Linguistics"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Transactions of the Association for Computational Linguistics","raw_type":"journal-article"},"type":"article","indexed_in":["crossref","doaj"],"open_access":{"is_oa":true,"oa_status":"diamond","oa_url":"https://direct.mit.edu/tacl/article-pdf/doi/10.1162/tacl_a_00747/2514611/tacl_a_00747.pdf","any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5012640016","display_name":"David A. Haslett","orcid":null},"institutions":[{"id":"https://openalex.org/I200769079","display_name":"Hong Kong University of Science and Technology","ror":"https://ror.org/00q4vv597","country_code":"HK","type":"education","lineage":["https://openalex.org/I200769079"]},{"id":"https://openalex.org/I889458895","display_name":"University of Hong Kong","ror":"https://ror.org/02zhqgq86","country_code":"HK","type":"education","lineage":["https://openalex.org/I889458895"]}],"countries":["HK"],"is_corresponding":true,"raw_author_name":"David A. Haslett","raw_affiliation_strings":["The Hong Kong University of Science and Technology, Hong Kong, China. haslett@ust.hk"],"affiliations":[{"raw_affiliation_string":"The Hong Kong University of Science and Technology, Hong Kong, China. haslett@ust.hk","institution_ids":["https://openalex.org/I200769079","https://openalex.org/I889458895"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5077498783","display_name":"Zhenguang G. Cai","orcid":"https://orcid.org/0000-0002-4097-6038"},"institutions":[{"id":"https://openalex.org/I177725633","display_name":"Chinese University of Hong Kong","ror":"https://ror.org/00t33hh48","country_code":"CN","type":"education","lineage":["https://openalex.org/I177725633"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhenguang G. Cai","raw_affiliation_strings":["The Chinese University of Hong Kong, Hong Kong, China. zhenguangcai@cuhk.edu.hk"],"affiliations":[{"raw_affiliation_string":"The Chinese University of Hong Kong, Hong Kong, China. zhenguangcai@cuhk.edu.hk","institution_ids":["https://openalex.org/I177725633"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":2,"corresponding_author_ids":["https://openalex.org/A5012640016"],"corresponding_institution_ids":["https://openalex.org/I200769079","https://openalex.org/I889458895"],"apc_list":null,"apc_paid":null,"fwci":2.4253,"has_fulltext":true,"cited_by_count":1,"citation_normalized_percentile":{"value":0.89044732,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":96,"max":99},"biblio":{"volume":"13","issue":null,"first_page":"408","last_page":"423"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10215","display_name":"Semantic Web and Ontologies","score":0.996999979019165,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8852521181106567},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.620754063129425},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.5967260003089905},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.5387502908706665},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.3444878160953522}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8852521181106567},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.620754063129425},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.5967260003089905},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5387502908706665},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.3444878160953522}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1162/tacl_a_00747","is_oa":true,"landing_page_url":"https://doi.org/10.1162/tacl_a_00747","pdf_url":"https://direct.mit.edu/tacl/article-pdf/doi/10.1162/tacl_a_00747/2514611/tacl_a_00747.pdf","source":{"id":"https://openalex.org/S2729999759","display_name":"Transactions of the Association for Computational Linguistics","issn_l":"2307-387X","issn":["2307-387X"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310320244","host_organization_name":"Association for Computational Linguistics","host_organization_lineage":["https://openalex.org/P4310320244"],"host_organization_lineage_names":["Association for Computational Linguistics"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Transactions of the Association for Computational Linguistics","raw_type":"journal-article"},{"id":"pmh:oai:repository.hkust.edu.hk:1783.1-151524","is_oa":false,"landing_page_url":"http://repository.hkust.edu.hk/ir/Record/1783.1-151524","pdf_url":null,"source":{"id":"https://openalex.org/S4306401796","display_name":"Rare & Special e-Zone (The Hong Kong University of Science and Technology)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I200769079","host_organization_name":"Hong Kong University of Science and Technology","host_organization_lineage":["https://openalex.org/I200769079"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"}],"best_oa_location":{"id":"doi:10.1162/tacl_a_00747","is_oa":true,"landing_page_url":"https://doi.org/10.1162/tacl_a_00747","pdf_url":"https://direct.mit.edu/tacl/article-pdf/doi/10.1162/tacl_a_00747/2514611/tacl_a_00747.pdf","source":{"id":"https://openalex.org/S2729999759","display_name":"Transactions of the Association for Computational Linguistics","issn_l":"2307-387X","issn":["2307-387X"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310320244","host_organization_name":"Association for Computational Linguistics","host_organization_lineage":["https://openalex.org/P4310320244"],"host_organization_lineage_names":["Association for Computational Linguistics"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Transactions of the Association for Computational Linguistics","raw_type":"journal-article"},"sustainable_development_goals":[{"score":0.7400000095367432,"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education"}],"awards":[],"funders":[],"has_content":{"pdf":true,"grobid_xml":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4409872456.pdf","grobid_xml":"https://content.openalex.org/works/W4409872456.grobid-xml"},"referenced_works_count":77,"referenced_works":["https://openalex.org/W46679369","https://openalex.org/W309335912","https://openalex.org/W1523296404","https://openalex.org/W1854884267","https://openalex.org/W1966787580","https://openalex.org/W1997161938","https://openalex.org/W2005665919","https://openalex.org/W2015209878","https://openalex.org/W2070296328","https://openalex.org/W2077460999","https://openalex.org/W2101234009","https://openalex.org/W2115054880","https://openalex.org/W2117621558","https://openalex.org/W2156261631","https://openalex.org/W2159230047","https://openalex.org/W2243751648","https://openalex.org/W2343872880","https://openalex.org/W2493916176","https://openalex.org/W2525539774","https://openalex.org/W2525778437","https://openalex.org/W2550038160","https://openalex.org/W2781528640","https://openalex.org/W2882319491","https://openalex.org/W2896457183","https://openalex.org/W2897879535","https://openalex.org/W2907162900","https://openalex.org/W2952638691","https://openalex.org/W2962784628","https://openalex.org/W2979401726","https://openalex.org/W2998554035","https://openalex.org/W3016107616","https://openalex.org/W3094540663","https://openalex.org/W3101140821","https://openalex.org/W3115690768","https://openalex.org/W3121904249","https://openalex.org/W3176893837","https://openalex.org/W4226271314","https://openalex.org/W4226278401","https://openalex.org/W4280653060","https://openalex.org/W4285169952","https://openalex.org/W4285267582","https://openalex.org/W4287854950","https://openalex.org/W4291991132","https://openalex.org/W4294170691","https://openalex.org/W4297801177","https://openalex.org/W4360836968","https://openalex.org/W4360951492","https://openalex.org/W4378498597","https://openalex.org/W4380360441","https://openalex.org/W4384615785","https://openalex.org/W4385570317","https://openalex.org/W4387395714","https://openalex.org/W4387575740","https://openalex.org/W4389524534","https://openalex.org/W4395064864","https://openalex.org/W4401042633","https://openalex.org/W4403363055","https://openalex.org/W6601894380","https://openalex.org/W6610777176","https://openalex.org/W6631433651","https://openalex.org/W6639872587","https://openalex.org/W6675354045","https://openalex.org/W6682691769","https://openalex.org/W6697305490","https://openalex.org/W6748304040","https://openalex.org/W6755207826","https://openalex.org/W6769430610","https://openalex.org/W6782879696","https://openalex.org/W6810728803","https://openalex.org/W6810738896","https://openalex.org/W6842433381","https://openalex.org/W6850936240","https://openalex.org/W6853051854","https://openalex.org/W6854318285","https://openalex.org/W6864618547","https://openalex.org/W6873087791","https://openalex.org/W6893895083"],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2899084033","https://openalex.org/W2748952813","https://openalex.org/W2390279801","https://openalex.org/W4391913857","https://openalex.org/W2358668433","https://openalex.org/W4396701345","https://openalex.org/W2376932109","https://openalex.org/W2001405890","https://openalex.org/W3204019825"],"abstract_inverted_index":{"Abstract":[0],"Large":[1],"language":[2],"models":[3,15],"segment":[4],"many":[5],"words":[6,36,52],"into":[7],"multiple":[8],"tokens,":[9],"and":[10,80,131],"companies":[11],"that":[12,17,51,68,81,93,115,132,140],"make":[13],"those":[14],"claim":[16],"meaningful":[18],"subword":[19,26],"tokens":[20,27,54,69,82,103,117],"are":[21,55,104],"essential.":[22],"To":[23],"investigate":[24],"whether":[25],"bear":[28],"meaning,":[29],"we":[30],"segmented":[31],"tens":[32],"of":[33,35,39,46,124,139],"thousands":[34],"from":[37,65],"each":[38],"41":[40],"languages":[41,92],"according":[42],"to":[43,118],"three":[44],"generations":[45],"GPT":[47],"tokenizers.":[48],"We":[49],"found":[50],"sharing":[53],"more":[56,84],"semantically":[57],"similar":[58],"than":[59,86,98],"expected":[60,64],"by":[61,89],"chance":[62],"or":[63],"length":[66],"alone,":[67],"capture":[70,83,137],"morphological":[71],"information":[72,85,126],"even":[73],"when":[74],"they":[75],"don\u2019t":[76],"look":[77],"like":[78],"morphemes,":[79],"is":[87],"explained":[88],"morphology.":[90],"In":[91],"use":[94],"a":[95],"script":[96],"other":[97],"the":[99,121],"Latin":[100],"alphabet,":[101],"GPT-4":[102],"uninformative,":[105],"but":[106],"GPT-4o":[107],"has":[108],"improved":[109],"this":[110],"situation.":[111],"These":[112],"results":[113],"suggest":[114],"comparing":[116],"morphemes":[119],"overlooks":[120],"wider":[122],"variety":[123],"semantic":[125],"available":[127],"in":[128],"word":[129],"form":[130],"standard":[133],"tokenization":[134],"methods":[135],"successfully":[136],"much":[138],"information.":[141]},"counts_by_year":[{"year":2026,"cited_by_count":1}],"updated_date":"2026-04-02T15:55:50.835912","created_date":"2025-10-10T00:00:00"}
