{"id":"https://openalex.org/W4377116057","doi":"https://doi.org/10.1162/tacl_a_00560","title":"Sub-Character Tokenization for Chinese Pretrained Language Models","display_name":"Sub-Character Tokenization for Chinese Pretrained Language Models","publication_year":2023,"publication_date":"2023-05-18","ids":{"openalex":"https://openalex.org/W4377116057","doi":"https://doi.org/10.1162/tacl_a_00560"},"language":"en","primary_location":{"id":"doi:10.1162/tacl_a_00560","is_oa":true,"landing_page_url":"https://doi.org/10.1162/tacl_a_00560","pdf_url":"https://direct.mit.edu/tacl/article-pdf/doi/10.1162/tacl_a_00560/2102899/tacl_a_00560.pdf","source":{"id":"https://openalex.org/S2729999759","display_name":"Transactions of the Association for Computational Linguistics","issn_l":"2307-387X","issn":["2307-387X"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310320244","host_organization_name":"Association for Computational Linguistics","host_organization_lineage":["https://openalex.org/P4310320244"],"host_organization_lineage_names":["Association for Computational Linguistics"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Transactions of the Association for Computational Linguistics","raw_type":"journal-article"},"type":"article","indexed_in":["crossref","doaj"],"open_access":{"is_oa":true,"oa_status":"diamond","oa_url":"https://direct.mit.edu/tacl/article-pdf/doi/10.1162/tacl_a_00560/2102899/tacl_a_00560.pdf","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5112665488","display_name":"Chenglei Si","orcid":null},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]},{"id":"https://openalex.org/I66946132","display_name":"University of Maryland, College Park","ror":"https://ror.org/047s2c258","country_code":"US","type":"education","lineage":["https://openalex.org/I66946132"]}],"countries":["CN","US"],"is_corresponding":true,"raw_author_name":"Chenglei Si","raw_affiliation_strings":["NLP Group, DCST, IAI, BNRIST, Tsinghua University, Beijing, China. clsi@terpmail.umd.edu","University of Maryland, College Park, MD, USA. clsi@terpmail.umd.edu"],"affiliations":[{"raw_affiliation_string":"NLP Group, DCST, IAI, BNRIST, Tsinghua University, Beijing, China. clsi@terpmail.umd.edu","institution_ids":["https://openalex.org/I99065089"]},{"raw_affiliation_string":"University of Maryland, College Park, MD, USA. clsi@terpmail.umd.edu","institution_ids":["https://openalex.org/I66946132"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101584823","display_name":"Zhengyan Zhang","orcid":"https://orcid.org/0000-0003-2988-0083"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Zhengyan Zhang","raw_affiliation_strings":["NLP Group, DCST, IAI, BNRIST, Tsinghua University, Beijing, China. zy-z19@mails.tsinghua.edu.cn"],"affiliations":[{"raw_affiliation_string":"NLP Group, DCST, IAI, BNRIST, Tsinghua University, Beijing, China. zy-z19@mails.tsinghua.edu.cn","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5004381973","display_name":"Yingfa Chen","orcid":null},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Yingfa Chen","raw_affiliation_strings":["NLP Group, DCST, IAI, BNRIST, Tsinghua University, Beijing, China. yingfa-c18@mails.tsinghua.edu.cn"],"affiliations":[{"raw_affiliation_string":"NLP Group, DCST, IAI, BNRIST, Tsinghua University, Beijing, China. yingfa-c18@mails.tsinghua.edu.cn","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5030183493","display_name":"Fanchao Qi","orcid":"https://orcid.org/0000-0002-4400-4033"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Fanchao Qi","raw_affiliation_strings":["NLP Group, DCST, IAI, BNRIST, Tsinghua University, Beijing, China. qfc17@mails.tsinghua.edu.cn"],"affiliations":[{"raw_affiliation_string":"NLP Group, DCST, IAI, BNRIST, Tsinghua University, Beijing, China. qfc17@mails.tsinghua.edu.cn","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100775958","display_name":"Xiaozhi Wang","orcid":"https://orcid.org/0000-0002-5727-143X"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Xiaozhi Wang","raw_affiliation_strings":["NLP Group, DCST, IAI, BNRIST, Tsinghua University, Beijing, China. wangxz20@mails.tsinghua.edu.cn"],"affiliations":[{"raw_affiliation_string":"NLP Group, DCST, IAI, BNRIST, Tsinghua University, Beijing, China. wangxz20@mails.tsinghua.edu.cn","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100320723","display_name":"Zhiyuan Liu","orcid":"https://orcid.org/0000-0002-7709-2543"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Zhiyuan Liu","raw_affiliation_strings":["NLP Group, DCST, IAI, BNRIST, Tsinghua University, Beijing, China. liuzy@tsinghua.edu.cn"],"affiliations":[{"raw_affiliation_string":"NLP Group, DCST, IAI, BNRIST, Tsinghua University, Beijing, China. liuzy@tsinghua.edu.cn","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5115592503","display_name":"Yasheng Wang","orcid":null},"institutions":[{"id":"https://openalex.org/I2250955327","display_name":"Huawei Technologies (China)","ror":"https://ror.org/00cmhce21","country_code":"CN","type":"company","lineage":["https://openalex.org/I2250955327"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Yasheng Wang","raw_affiliation_strings":["Huawei Noah\u2019s Ark Lab, Hong Kong, China. wangyasheng@huawei.com"],"affiliations":[{"raw_affiliation_string":"Huawei Noah\u2019s Ark Lab, Hong Kong, China. wangyasheng@huawei.com","institution_ids":["https://openalex.org/I2250955327"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100426155","display_name":"Qun Liu","orcid":"https://orcid.org/0000-0002-1179-290X"},"institutions":[{"id":"https://openalex.org/I2250955327","display_name":"Huawei Technologies (China)","ror":"https://ror.org/00cmhce21","country_code":"CN","type":"company","lineage":["https://openalex.org/I2250955327"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Qun Liu","raw_affiliation_strings":["Huawei Noah\u2019s Ark Lab, Hong Kong, China. qun.liu@huawei.com"],"affiliations":[{"raw_affiliation_string":"Huawei Noah\u2019s Ark Lab, Hong Kong, China. qun.liu@huawei.com","institution_ids":["https://openalex.org/I2250955327"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5046448314","display_name":"Maosong Sun","orcid":"https://orcid.org/0000-0002-6011-6115"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Maosong Sun","raw_affiliation_strings":["NLP Group, DCST, IAI, BNRIST, Tsinghua University, Beijing, China. sms@tsinghua.edu.cn"],"affiliations":[{"raw_affiliation_string":"NLP Group, DCST, IAI, BNRIST, Tsinghua University, Beijing, China. sms@tsinghua.edu.cn","institution_ids":["https://openalex.org/I99065089"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":9,"corresponding_author_ids":["https://openalex.org/A5004381973","https://openalex.org/A5030183493","https://openalex.org/A5046448314","https://openalex.org/A5100320723","https://openalex.org/A5100426155","https://openalex.org/A5100775958","https://openalex.org/A5101584823","https://openalex.org/A5112665488","https://openalex.org/A5115592503"],"corresponding_institution_ids":["https://openalex.org/I2250955327","https://openalex.org/I66946132","https://openalex.org/I99065089"],"apc_list":null,"apc_paid":null,"fwci":2.4646,"has_fulltext":true,"cited_by_count":14,"citation_normalized_percentile":{"value":0.9103608,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":96,"max":99},"biblio":{"volume":"11","issue":null,"first_page":"469","last_page":"487"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.994700014591217,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.9033612608909607},{"id":"https://openalex.org/keywords/lexical-analysis","display_name":"Lexical analysis","score":0.8711994886398315},{"id":"https://openalex.org/keywords/pronunciation","display_name":"Pronunciation","score":0.6787282228469849},{"id":"https://openalex.org/keywords/homophone","display_name":"Homophone","score":0.6386653780937195},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.6368923187255859},{"id":"https://openalex.org/keywords/character","display_name":"Character (mathematics)","score":0.6310688853263855},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.6097263693809509},{"id":"https://openalex.org/keywords/security-token","display_name":"Security token","score":0.48028039932250977},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.47091054916381836},{"id":"https://openalex.org/keywords/text-segmentation","display_name":"Text segmentation","score":0.4696677625179291},{"id":"https://openalex.org/keywords/word","display_name":"Word (group theory)","score":0.4471196234226227},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.4429466128349304},{"id":"https://openalex.org/keywords/pinyin","display_name":"Pinyin","score":0.4317699670791626},{"id":"https://openalex.org/keywords/chinese-characters","display_name":"Chinese characters","score":0.38490426540374756},{"id":"https://openalex.org/keywords/segmentation","display_name":"Segmentation","score":0.3141787648200989},{"id":"https://openalex.org/keywords/linguistics","display_name":"Linguistics","score":0.19100961089134216}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.9033612608909607},{"id":"https://openalex.org/C176982825","wikidata":"https://www.wikidata.org/wiki/Q835922","display_name":"Lexical analysis","level":2,"score":0.8711994886398315},{"id":"https://openalex.org/C2780844864","wikidata":"https://www.wikidata.org/wiki/Q184377","display_name":"Pronunciation","level":2,"score":0.6787282228469849},{"id":"https://openalex.org/C160253069","wikidata":"https://www.wikidata.org/wiki/Q221079","display_name":"Homophone","level":2,"score":0.6386653780937195},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.6368923187255859},{"id":"https://openalex.org/C2780861071","wikidata":"https://www.wikidata.org/wiki/Q1062934","display_name":"Character (mathematics)","level":2,"score":0.6310688853263855},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6097263693809509},{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.48028039932250977},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.47091054916381836},{"id":"https://openalex.org/C98501671","wikidata":"https://www.wikidata.org/wiki/Q1948408","display_name":"Text segmentation","level":3,"score":0.4696677625179291},{"id":"https://openalex.org/C90805587","wikidata":"https://www.wikidata.org/wiki/Q10944557","display_name":"Word (group theory)","level":2,"score":0.4471196234226227},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.4429466128349304},{"id":"https://openalex.org/C2781095461","wikidata":"https://www.wikidata.org/wiki/Q42222","display_name":"Pinyin","level":3,"score":0.4317699670791626},{"id":"https://openalex.org/C2781051154","wikidata":"https://www.wikidata.org/wiki/Q8201","display_name":"Chinese characters","level":2,"score":0.38490426540374756},{"id":"https://openalex.org/C89600930","wikidata":"https://www.wikidata.org/wiki/Q1423946","display_name":"Segmentation","level":2,"score":0.3141787648200989},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.19100961089134216},{"id":"https://openalex.org/C38652104","wikidata":"https://www.wikidata.org/wiki/Q3510521","display_name":"Computer security","level":1,"score":0.0},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.0},{"id":"https://openalex.org/C2524010","wikidata":"https://www.wikidata.org/wiki/Q8087","display_name":"Geometry","level":1,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1162/tacl_a_00560","is_oa":true,"landing_page_url":"https://doi.org/10.1162/tacl_a_00560","pdf_url":"https://direct.mit.edu/tacl/article-pdf/doi/10.1162/tacl_a_00560/2102899/tacl_a_00560.pdf","source":{"id":"https://openalex.org/S2729999759","display_name":"Transactions of the Association for Computational Linguistics","issn_l":"2307-387X","issn":["2307-387X"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310320244","host_organization_name":"Association for Computational Linguistics","host_organization_lineage":["https://openalex.org/P4310320244"],"host_organization_lineage_names":["Association for Computational Linguistics"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Transactions of the Association for Computational Linguistics","raw_type":"journal-article"},{"id":"pmh:oai:doaj.org/article:5e1c7178b0ed4851a6922b1218a03d89","is_oa":true,"landing_page_url":"https://doaj.org/article/5e1c7178b0ed4851a6922b1218a03d89","pdf_url":null,"source":{"id":"https://openalex.org/S112646816","display_name":"SHILAP Revista de lepidopterolog\u00eda","issn_l":"0300-5267","issn":["0300-5267","2340-4078"],"is_oa":true,"is_in_doaj":true,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"Transactions of the Association for Computational Linguistics, Vol 11, Pp 469-487 (2023)","raw_type":"article"}],"best_oa_location":{"id":"doi:10.1162/tacl_a_00560","is_oa":true,"landing_page_url":"https://doi.org/10.1162/tacl_a_00560","pdf_url":"https://direct.mit.edu/tacl/article-pdf/doi/10.1162/tacl_a_00560/2102899/tacl_a_00560.pdf","source":{"id":"https://openalex.org/S2729999759","display_name":"Transactions of the Association for Computational Linguistics","issn_l":"2307-387X","issn":["2307-387X"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310320244","host_organization_name":"Association for Computational Linguistics","host_organization_lineage":["https://openalex.org/P4310320244"],"host_organization_lineage_names":["Association for Computational Linguistics"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Transactions of the Association for Computational Linguistics","raw_type":"journal-article"},"sustainable_development_goals":[{"display_name":"Quality Education","score":0.8100000023841858,"id":"https://metadata.un.org/sdg/4"}],"awards":[{"id":"https://openalex.org/G1121271761","display_name":null,"funder_award_id":"Program","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G1614471940","display_name":null,"funder_award_id":"2020AAA0","funder_id":"https://openalex.org/F4320335777","funder_display_name":"National Key Research and Development Program of China"},{"id":"https://openalex.org/G2087396116","display_name":null,"funder_award_id":"China","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G2565984771","display_name":null,"funder_award_id":"62236004","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G261026068","display_name":null,"funder_award_id":"2020AAA0106500","funder_id":"https://openalex.org/F4320335777","funder_display_name":"National Key Research and Development Program of China"},{"id":"https://openalex.org/G3317480652","display_name":null,"funder_award_id":"Science","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G391238517","display_name":null,"funder_award_id":", and","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G5994120800","display_name":null,"funder_award_id":"Natural","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G6449033798","display_name":null,"funder_award_id":"NSFC No.","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"}],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"},{"id":"https://openalex.org/F4320335777","display_name":"National Key Research and Development Program of China","ror":null}],"has_content":{"pdf":true,"grobid_xml":false},"content_urls":{"pdf":"https://content.openalex.org/works/W4377116057.pdf"},"referenced_works_count":67,"referenced_works":["https://openalex.org/W2010576059","https://openalex.org/W2121879602","https://openalex.org/W2131988669","https://openalex.org/W2559281960","https://openalex.org/W2560939934","https://openalex.org/W2759366113","https://openalex.org/W2788009253","https://openalex.org/W2889968917","https://openalex.org/W2897356710","https://openalex.org/W2938830017","https://openalex.org/W2944852028","https://openalex.org/W2949884065","https://openalex.org/W2951286828","https://openalex.org/W2962739339","https://openalex.org/W2962784628","https://openalex.org/W2963250244","https://openalex.org/W2963979492","https://openalex.org/W2964144280","https://openalex.org/W2965373594","https://openalex.org/W2971871542","https://openalex.org/W2986753851","https://openalex.org/W3014328670","https://openalex.org/W3015253856","https://openalex.org/W3021636956","https://openalex.org/W3023143096","https://openalex.org/W3095771422","https://openalex.org/W3098065087","https://openalex.org/W3099911888","https://openalex.org/W3101918878","https://openalex.org/W3102725307","https://openalex.org/W3106031450","https://openalex.org/W3124386064","https://openalex.org/W3135427360","https://openalex.org/W3164045210","https://openalex.org/W3170962005","https://openalex.org/W3171008115","https://openalex.org/W3171291687","https://openalex.org/W3173175058","https://openalex.org/W3174396451","https://openalex.org/W3174418826","https://openalex.org/W3176617251","https://openalex.org/W3177365697","https://openalex.org/W3187018546","https://openalex.org/W3210120707","https://openalex.org/W4285208773","https://openalex.org/W4303647954","https://openalex.org/W4385570900","https://openalex.org/W6631435780","https://openalex.org/W6635306185","https://openalex.org/W6635469476","https://openalex.org/W6682691769","https://openalex.org/W6736595860","https://openalex.org/W6739901393","https://openalex.org/W6755207826","https://openalex.org/W6758737940","https://openalex.org/W6761910064","https://openalex.org/W6765039553","https://openalex.org/W6766673545","https://openalex.org/W6767852069","https://openalex.org/W6768021236","https://openalex.org/W6771917389","https://openalex.org/W6779068807","https://openalex.org/W6785415556","https://openalex.org/W6789719078","https://openalex.org/W6802852670","https://openalex.org/W6845834095","https://openalex.org/W6850258700"],"related_works":["https://openalex.org/W2360372934","https://openalex.org/W4385493412","https://openalex.org/W3159297355","https://openalex.org/W1575804004","https://openalex.org/W2386864954","https://openalex.org/W380382270","https://openalex.org/W3123716854","https://openalex.org/W1788792945","https://openalex.org/W2101906057","https://openalex.org/W2371267009"],"abstract_inverted_index":{"Abstract":[0],"Tokenization":[1],"is":[2],"fundamental":[3],"to":[4,143,168],"pretrained":[5],"language":[6],"models":[7,150,165],"(PLMs).":[8],"Existing":[9],"tokenization":[10,138],"methods":[11],"for":[12,56],"Chinese":[13,31,69,127],"PLMs":[14],"typically":[15],"treat":[16],"each":[17,68],"character":[18,41,70],"as":[19],"an":[20],"indivisible":[21],"token.":[22],"However,":[23],"they":[24],"ignore":[25],"the":[26,30,40,45,63,84,88,118,130,136,147],"unique":[27],"feature":[28],"of":[29],"writing":[32],"system":[33],"where":[34],"additional":[35],"linguistic":[36],"information":[37],"exists":[38],"below":[39],"level,":[42],"i.e.,":[43],"at":[44,166],"sub-character":[46,54],"level.":[47],"To":[48],"utilize":[49],"such":[50],"information,":[51],"we":[52,60],"propose":[53],"(SubChar":[55],"short)":[57],"tokenization.":[58],"Specifically,":[59],"first":[61],"encode":[62,126],"input":[64],"text":[65,90],"by":[66],"converting":[67],"into":[71,112,129],"a":[72],"short":[73],"sequence":[74],"based":[75,86],"on":[76,87,157],"its":[77],"glyph":[78],"or":[79],"pronunciation,":[80],"and":[81,134,164],"then":[82],"construct":[83],"vocabulary":[85],"encoded":[89],"with":[91,152],"sub-word":[92],"segmentation.":[93],"Experimental":[94],"results":[95],"show":[96],"that":[97],"SubChar":[98,123,153],"tokenizers":[99,124,154],"have":[100],"two":[101],"main":[102],"advantages":[103],"over":[104],"existing":[105],"tokenizers:":[106],"1)":[107],"They":[108],"can":[109,125],"tokenize":[110],"inputs":[111],"much":[113],"shorter":[114],"sequences,":[115],"thus":[116],"improving":[117],"computational":[119],"efficiency.":[120],"2)":[121],"Pronunciation-based":[122],"homophones":[128],"same":[131,137,148],"transliteration":[132],"sequences":[133],"produce":[135],"output,":[139],"hence":[140],"being":[141],"robust":[142],"homophone":[144],"typos.":[145],"At":[146],"time,":[149],"trained":[151],"perform":[155],"competitively":[156],"downstream":[158],"tasks.":[159],"We":[160],"release":[161],"our":[162],"code":[163],"https://github.com/thunlp/SubCharTokenization":[167],"facilitate":[169],"future":[170],"work.":[171]},"counts_by_year":[{"year":2025,"cited_by_count":8},{"year":2024,"cited_by_count":3},{"year":2023,"cited_by_count":3}],"updated_date":"2026-03-18T14:38:29.013473","created_date":"2025-10-10T00:00:00"}
