{"id":"https://openalex.org/W3016107616","doi":"https://doi.org/10.1017/s1351324920000145","title":"Emerging trends: Subwords, seriously?","display_name":"Emerging trends: Subwords, seriously?","publication_year":2020,"publication_date":"2020-04-07","ids":{"openalex":"https://openalex.org/W3016107616","doi":"https://doi.org/10.1017/s1351324920000145","mag":"3016107616"},"language":"en","primary_location":{"id":"doi:10.1017/s1351324920000145","is_oa":true,"landing_page_url":"https://doi.org/10.1017/s1351324920000145","pdf_url":"https://www.cambridge.org/core/services/aop-cambridge-core/content/view/619F28526E833A6B623E6D2009F37B82/S1351324920000145a.pdf/div-class-title-emerging-trends-subwords-seriously-div.pdf","source":{"id":"https://openalex.org/S18088403","display_name":"Natural Language Engineering","issn_l":"1351-3249","issn":["1351-3249","1469-8110"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310311721","host_organization_name":"Cambridge University Press","host_organization_lineage":["https://openalex.org/P4310311721","https://openalex.org/P4310311702"],"host_organization_lineage_names":["Cambridge University Press","University of Cambridge"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Natural Language Engineering","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"hybrid","oa_url":"https://www.cambridge.org/core/services/aop-cambridge-core/content/view/619F28526E833A6B623E6D2009F37B82/S1351324920000145a.pdf/div-class-title-emerging-trends-subwords-seriously-div.pdf","any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5016543371","display_name":"Kenneth Church","orcid":"https://orcid.org/0000-0001-8378-6069"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Kenneth Ward Church","raw_affiliation_strings":["Baidu, USA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Baidu, USA","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":1,"corresponding_author_ids":["https://openalex.org/A5016543371"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.544,"has_fulltext":true,"cited_by_count":14,"citation_normalized_percentile":{"value":0.72836,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":89,"max":99},"biblio":{"volume":"26","issue":"3","first_page":"375","last_page":"382"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9986000061035156,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13629","display_name":"Text Readability and Simplification","score":0.9933000206947327,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.874038577079773},{"id":"https://openalex.org/keywords/parsing","display_name":"Parsing","score":0.8221761584281921},{"id":"https://openalex.org/keywords/prefix","display_name":"Prefix","score":0.6758253574371338},{"id":"https://openalex.org/keywords/word","display_name":"Word (group theory)","score":0.6295337677001953},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.5833565592765808},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.5660201907157898},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.518005907535553},{"id":"https://openalex.org/keywords/meaning","display_name":"Meaning (existential)","score":0.4994525909423828},{"id":"https://openalex.org/keywords/byte","display_name":"Byte","score":0.4135345220565796},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.39980754256248474},{"id":"https://openalex.org/keywords/linguistics","display_name":"Linguistics","score":0.27475428581237793},{"id":"https://openalex.org/keywords/programming-language","display_name":"Programming language","score":0.12213945388793945},{"id":"https://openalex.org/keywords/philosophy","display_name":"Philosophy","score":0.06900331377983093}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.874038577079773},{"id":"https://openalex.org/C186644900","wikidata":"https://www.wikidata.org/wiki/Q194152","display_name":"Parsing","level":2,"score":0.8221761584281921},{"id":"https://openalex.org/C141603448","wikidata":"https://www.wikidata.org/wiki/Q134830","display_name":"Prefix","level":2,"score":0.6758253574371338},{"id":"https://openalex.org/C90805587","wikidata":"https://www.wikidata.org/wiki/Q10944557","display_name":"Word (group theory)","level":2,"score":0.6295337677001953},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.5833565592765808},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.5660201907157898},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.518005907535553},{"id":"https://openalex.org/C2780876879","wikidata":"https://www.wikidata.org/wiki/Q3054749","display_name":"Meaning (existential)","level":2,"score":0.4994525909423828},{"id":"https://openalex.org/C43364308","wikidata":"https://www.wikidata.org/wiki/Q8799","display_name":"Byte","level":2,"score":0.4135345220565796},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.39980754256248474},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.27475428581237793},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.12213945388793945},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.06900331377983093},{"id":"https://openalex.org/C111472728","wikidata":"https://www.wikidata.org/wiki/Q9471","display_name":"Epistemology","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1017/s1351324920000145","is_oa":true,"landing_page_url":"https://doi.org/10.1017/s1351324920000145","pdf_url":"https://www.cambridge.org/core/services/aop-cambridge-core/content/view/619F28526E833A6B623E6D2009F37B82/S1351324920000145a.pdf/div-class-title-emerging-trends-subwords-seriously-div.pdf","source":{"id":"https://openalex.org/S18088403","display_name":"Natural Language Engineering","issn_l":"1351-3249","issn":["1351-3249","1469-8110"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310311721","host_organization_name":"Cambridge University Press","host_organization_lineage":["https://openalex.org/P4310311721","https://openalex.org/P4310311702"],"host_organization_lineage_names":["Cambridge University Press","University of Cambridge"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Natural Language Engineering","raw_type":"journal-article"}],"best_oa_location":{"id":"doi:10.1017/s1351324920000145","is_oa":true,"landing_page_url":"https://doi.org/10.1017/s1351324920000145","pdf_url":"https://www.cambridge.org/core/services/aop-cambridge-core/content/view/619F28526E833A6B623E6D2009F37B82/S1351324920000145a.pdf/div-class-title-emerging-trends-subwords-seriously-div.pdf","source":{"id":"https://openalex.org/S18088403","display_name":"Natural Language Engineering","issn_l":"1351-3249","issn":["1351-3249","1469-8110"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310311721","host_organization_name":"Cambridge University Press","host_organization_lineage":["https://openalex.org/P4310311721","https://openalex.org/P4310311702"],"host_organization_lineage_names":["Cambridge University Press","University of Cambridge"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Natural Language Engineering","raw_type":"journal-article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","score":0.6299999952316284,"display_name":"Quality Education"}],"awards":[],"funders":[],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W3016107616.pdf","grobid_xml":"https://content.openalex.org/works/W3016107616.grobid-xml"},"referenced_works_count":6,"referenced_works":["https://openalex.org/W28707110","https://openalex.org/W2121879602","https://openalex.org/W2525778437","https://openalex.org/W2896457183","https://openalex.org/W2962784628","https://openalex.org/W2963310665"],"related_works":["https://openalex.org/W3196321793","https://openalex.org/W3080705045","https://openalex.org/W2385527937","https://openalex.org/W2005880840","https://openalex.org/W4385305499","https://openalex.org/W3173108628","https://openalex.org/W2507465767","https://openalex.org/W2373597927","https://openalex.org/W2035645574","https://openalex.org/W1005360155"],"abstract_inverted_index":{"Abstract":[0],"Subwords":[1],"have":[2],"become":[3],"very":[4],"popular,":[5],"but":[6],"the":[7,33,99,108,131,137],"BERT":[8,93],"a":[9,23,26,47,117],"and":[10,84,91,94,136],"ERNIE":[11,95],"b":[12],"tokenizers":[13],"often":[14],"produce":[15],"surprising":[16],"results.":[17],"Byte":[18],"pair":[19],"encoding":[20],"(BPE)":[21],"trains":[22],"dictionary":[24,48],"with":[25,101],"simple":[27],"information":[28],"theoretic":[29],"criterion":[30],"that":[31],"sidesteps":[32],"need":[34],"for":[35,143,150],"special":[36],"treatment":[37],"of":[38,49,110,119],"unknown":[39,56],"words.":[40],"BPE":[41],"is":[42,141,148],"more":[43,102],"about":[44],"training":[45],"(populating":[46],"word":[50,57,59,103,111],"pieces)":[51],"than":[52],"inference":[53,64],"(parsing":[54],"an":[55],"into":[58],"pieces).":[60],"The":[61,127],"parse":[62,70,100],"at":[63],"time":[65],"can":[66,77,86],"be":[67,78,87,122],"ambiguous.":[68],"Which":[69],"should":[71],"we":[72],"use?":[73],"For":[74],"example,":[75],"\u201celectroneutral\u201d":[76],"parsed":[79,88],"as":[80,89],"electron-eu-tral":[81],"or":[82],"electro-neutral,":[83],"\u201cbidirectional\u201d":[85],"bid-ire-ction-al":[90],"bi-directional.":[92],"tend":[96],"to":[97],"favor":[98],"pieces.":[104,112],"We":[105],"propose":[106],"minimizing":[107],"number":[109,118],"To":[113],"justify":[114],"our":[115],"proposal,":[116],"criteria":[120],"will":[121],"considered:":[123],"sound,":[124],"meaning,":[125],"etc.":[126],"prefix,":[128],"bi-,":[129],"has":[130],"desired":[132,138],"vowel":[133],"(unlike":[134],"bid)":[135],"meaning":[139],"(bi":[140],"Latin":[142],"two,":[144],"unlike":[145],"bid,":[146],"which":[147],"Germanic":[149],"offer).":[151]},"counts_by_year":[{"year":2025,"cited_by_count":6},{"year":2024,"cited_by_count":4},{"year":2023,"cited_by_count":1},{"year":2022,"cited_by_count":2},{"year":2021,"cited_by_count":1}],"updated_date":"2026-05-21T06:26:12.895304","created_date":"2025-10-10T00:00:00"}
