{"id":"https://openalex.org/W2885185669","doi":"https://doi.org/10.18653/v1/d18-2012","title":"SentencePiece: A simple and language independent subword tokenizer and detokenizer for Neural Text Processing","display_name":"SentencePiece: A simple and language independent subword tokenizer and detokenizer for Neural Text Processing","publication_year":2018,"publication_date":"2018-01-01","ids":{"openalex":"https://openalex.org/W2885185669","doi":"https://doi.org/10.18653/v1/d18-2012","mag":"2885185669"},"language":"en","primary_location":{"id":"doi:10.18653/v1/d18-2012","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/d18-2012","pdf_url":"https://www.aclweb.org/anthology/D18-2012.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing: System Demonstrations","raw_type":"proceedings-article"},"type":"preprint","indexed_in":["arxiv","crossref","datacite"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://www.aclweb.org/anthology/D18-2012.pdf","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5103575803","display_name":"Taku Kudo","orcid":null},"institutions":[{"id":"https://openalex.org/I1291425158","display_name":"Google (United States)","ror":"https://ror.org/00njsd438","country_code":"US","type":"company","lineage":["https://openalex.org/I1291425158","https://openalex.org/I4210128969"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Taku Kudo","raw_affiliation_strings":["Google, Inc","Google (United States), Mountain View, United States"],"affiliations":[{"raw_affiliation_string":"Google, Inc","institution_ids":["https://openalex.org/I1291425158"]},{"raw_affiliation_string":"Google (United States), Mountain View, United States","institution_ids":["https://openalex.org/I1291425158"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5024427669","display_name":"John T. E. Richardson","orcid":"https://orcid.org/0000-0002-6267-0603"},"institutions":[{"id":"https://openalex.org/I22299242","display_name":"Kyoto University","ror":"https://ror.org/02kpeqv85","country_code":"JP","type":"education","lineage":["https://openalex.org/I22299242"]},{"id":"https://openalex.org/I1291425158","display_name":"Google (United States)","ror":"https://ror.org/00njsd438","country_code":"US","type":"company","lineage":["https://openalex.org/I1291425158","https://openalex.org/I4210128969"]}],"countries":["JP","US"],"is_corresponding":false,"raw_author_name":"John Richardson","raw_affiliation_strings":["Google, Inc","Kyoto University, Kyoto, Japan"],"affiliations":[{"raw_affiliation_string":"Google, Inc","institution_ids":["https://openalex.org/I1291425158"]},{"raw_affiliation_string":"Kyoto University, Kyoto, Japan","institution_ids":["https://openalex.org/I22299242"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":2,"corresponding_author_ids":["https://openalex.org/A5103575803"],"corresponding_institution_ids":["https://openalex.org/I1291425158"],"apc_list":null,"apc_paid":null,"fwci":33.9518,"has_fulltext":true,"cited_by_count":286,"citation_normalized_percentile":{"value":0.99713902,"is_in_top_1_percent":true,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":97,"max":100},"biblio":{"volume":null,"issue":null,"first_page":"66","last_page":"71"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9902999997138977,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.853082001209259},{"id":"https://openalex.org/keywords/machine-translation","display_name":"Machine translation","score":0.7314148545265198},{"id":"https://openalex.org/keywords/python","display_name":"Python (programming language)","score":0.6840686798095703},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.6598066091537476},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.5877358317375183},{"id":"https://openalex.org/keywords/license","display_name":"License","score":0.49837827682495117},{"id":"https://openalex.org/keywords/implementation","display_name":"Implementation","score":0.45044979453086853},{"id":"https://openalex.org/keywords/segmentation","display_name":"Segmentation","score":0.4416269063949585},{"id":"https://openalex.org/keywords/simple","display_name":"Simple (philosophy)","score":0.42415809631347656},{"id":"https://openalex.org/keywords/translation","display_name":"Translation (biology)","score":0.41787081956863403},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.34613025188446045},{"id":"https://openalex.org/keywords/programming-language","display_name":"Programming language","score":0.3391602039337158}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.853082001209259},{"id":"https://openalex.org/C203005215","wikidata":"https://www.wikidata.org/wiki/Q79798","display_name":"Machine translation","level":2,"score":0.7314148545265198},{"id":"https://openalex.org/C519991488","wikidata":"https://www.wikidata.org/wiki/Q28865","display_name":"Python (programming language)","level":2,"score":0.6840686798095703},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.6598066091537476},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5877358317375183},{"id":"https://openalex.org/C2780560020","wikidata":"https://www.wikidata.org/wiki/Q79719","display_name":"License","level":2,"score":0.49837827682495117},{"id":"https://openalex.org/C26713055","wikidata":"https://www.wikidata.org/wiki/Q245962","display_name":"Implementation","level":2,"score":0.45044979453086853},{"id":"https://openalex.org/C89600930","wikidata":"https://www.wikidata.org/wiki/Q1423946","display_name":"Segmentation","level":2,"score":0.4416269063949585},{"id":"https://openalex.org/C2780586882","wikidata":"https://www.wikidata.org/wiki/Q7520643","display_name":"Simple (philosophy)","level":2,"score":0.42415809631347656},{"id":"https://openalex.org/C149364088","wikidata":"https://www.wikidata.org/wiki/Q185917","display_name":"Translation (biology)","level":4,"score":0.41787081956863403},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.34613025188446045},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.3391602039337158},{"id":"https://openalex.org/C55493867","wikidata":"https://www.wikidata.org/wiki/Q7094","display_name":"Biochemistry","level":1,"score":0.0},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.0},{"id":"https://openalex.org/C111472728","wikidata":"https://www.wikidata.org/wiki/Q9471","display_name":"Epistemology","level":1,"score":0.0},{"id":"https://openalex.org/C105580179","wikidata":"https://www.wikidata.org/wiki/Q188928","display_name":"Messenger RNA","level":3,"score":0.0},{"id":"https://openalex.org/C104317684","wikidata":"https://www.wikidata.org/wiki/Q7187","display_name":"Gene","level":2,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0}],"mesh":[],"locations_count":4,"locations":[{"id":"doi:10.18653/v1/d18-2012","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/d18-2012","pdf_url":"https://www.aclweb.org/anthology/D18-2012.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing: System Demonstrations","raw_type":"proceedings-article"},{"id":"pmh:oai:arXiv.org:1808.06226","is_oa":true,"landing_page_url":"http://arxiv.org/abs/1808.06226","pdf_url":"https://arxiv.org/pdf/1808.06226","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"mag:2885185669","is_oa":true,"landing_page_url":"https://arxiv.org/pdf/1808.06226","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"arXiv (Cornell University)","raw_type":null},{"id":"doi:10.48550/arxiv.1808.06226","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.1808.06226","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.18653/v1/d18-2012","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/d18-2012","pdf_url":"https://www.aclweb.org/anthology/D18-2012.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing: System Demonstrations","raw_type":"proceedings-article"},"sustainable_development_goals":[{"score":0.7300000190734863,"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education"}],"awards":[],"funders":[],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W2885185669.pdf","grobid_xml":"https://content.openalex.org/works/W2885185669.grobid-xml"},"referenced_works_count":15,"referenced_works":["https://openalex.org/W1591706642","https://openalex.org/W1843891098","https://openalex.org/W1902237438","https://openalex.org/W2101105183","https://openalex.org/W2525778437","https://openalex.org/W2550821151","https://openalex.org/W2626778328","https://openalex.org/W2725082186","https://openalex.org/W2759088880","https://openalex.org/W2765961751","https://openalex.org/W2766182427","https://openalex.org/W2798362442","https://openalex.org/W2962784628","https://openalex.org/W2963979492","https://openalex.org/W2964308564"],"related_works":["https://openalex.org/W2963403868","https://openalex.org/W2963341956","https://openalex.org/W2962784628","https://openalex.org/W2964308564","https://openalex.org/W2965373594","https://openalex.org/W1522301498","https://openalex.org/W2101105183","https://openalex.org/W2936774411","https://openalex.org/W2064675550","https://openalex.org/W2327501763","https://openalex.org/W1494198834","https://openalex.org/W2787560479","https://openalex.org/W2525778437","https://openalex.org/W2130942839","https://openalex.org/W2963979492","https://openalex.org/W2127141656","https://openalex.org/W2970597249","https://openalex.org/W2250539671","https://openalex.org/W1828163288","https://openalex.org/W2963216553"],"abstract_inverted_index":{"This":[0],"paper":[1],"describes":[2],"SentencePiece,":[3],"a":[4,57,66],"language-independent":[5],"subword":[6,27,31,46,87,98],"tokenizer":[7],"and":[8,23,60,75,100],"detokenizer":[9],"designed":[10],"for":[11,26],"Neural-based":[12],"text":[13],"processing,":[14],"including":[15],"Neural":[16],"Machine":[17],"Translation.":[18],"It":[19],"provides":[20],"open-source":[21],"C++":[22],"Python":[24],"implementations":[25],"units.":[28],"While":[29],"existing":[30],"segmentation":[32,101],"tools":[33],"assume":[34],"that":[35,77],"the":[36,95],"input":[37],"is":[38,79],"pre-tokenized":[39],"into":[40],"word":[41],"sequences,":[42],"SentencePiece":[43],"can":[44],"train":[45],"models":[47],"directly":[48],"from":[49,89],"raw":[50,90],"sentences,":[51],"which":[52],"allows":[53],"us":[54],"to":[55,81,85],"make":[56],"purely":[58],"end-to-end":[59],"language":[61],"independent":[62],"system.":[63],"We":[64,92],"perform":[65],"validation":[67],"experiment":[68],"of":[69,97],"NMT":[70],"on":[71],"English-Japanese":[72],"machine":[73],"translation,":[74],"find":[76],"it":[78],"possible":[80],"achieve":[82],"comparable":[83],"accuracy":[84],"direct":[86],"training":[88,99],"sentences.":[91],"also":[93],"compare":[94],"performance":[96],"with":[102],"various":[103],"configurations.":[104]},"counts_by_year":[{"year":2025,"cited_by_count":12},{"year":2024,"cited_by_count":11},{"year":2023,"cited_by_count":35},{"year":2022,"cited_by_count":27},{"year":2021,"cited_by_count":83},{"year":2020,"cited_by_count":83},{"year":2019,"cited_by_count":31},{"year":2018,"cited_by_count":4}],"updated_date":"2026-03-06T13:50:29.536080","created_date":"2025-10-10T00:00:00"}
