{"id":"https://openalex.org/W4320503340","doi":"https://doi.org/10.48550/arxiv.2212.07284","title":"MANTa: Efficient Gradient-Based Tokenization for Robust End-to-End Language Modeling","display_name":"MANTa: Efficient Gradient-Based Tokenization for Robust End-to-End Language Modeling","publication_year":2022,"publication_date":"2022-12-14","ids":{"openalex":"https://openalex.org/W4320503340","doi":"https://doi.org/10.48550/arxiv.2212.07284"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2212.07284","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2212.07284","pdf_url":"https://arxiv.org/pdf/2212.07284","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":null},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2212.07284","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5071654594","display_name":"Nathan Godey","orcid":"https://orcid.org/0000-0001-8319-8128"},"institutions":[{"id":"https://openalex.org/I39804081","display_name":"Sorbonne Universit\u00e9","ror":"https://ror.org/02en5vm52","country_code":"FR","type":"education","lineage":["https://openalex.org/I39804081"]}],"countries":["FR"],"is_corresponding":true,"raw_author_name":"Godey, Nathan","raw_affiliation_strings":["SU - Sorbonne Universite\u0301 (21 rue de l\u2019\u00c9cole de m\u00e9decine - 75006 Paris - France)","ALMAnaCH - Automatic Language Modelling and ANAlysis & Computational Humanities (France)"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"SU - Sorbonne Universite\u0301 (21 rue de l\u2019\u00c9cole de m\u00e9decine - 75006 Paris - France)","institution_ids":["https://openalex.org/I39804081"]},{"raw_affiliation_string":"ALMAnaCH - Automatic Language Modelling and ANAlysis & Computational Humanities (France)","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5049952129","display_name":"Roman Castagn\u00e9","orcid":null},"institutions":[{"id":"https://openalex.org/I39804081","display_name":"Sorbonne Universit\u00e9","ror":"https://ror.org/02en5vm52","country_code":"FR","type":"education","lineage":["https://openalex.org/I39804081"]}],"countries":["FR"],"is_corresponding":false,"raw_author_name":"Castagn\u00e9, Roman","raw_affiliation_strings":["SU - Sorbonne Universite\u0301 (21 rue de l\u2019\u00c9cole de m\u00e9decine - 75006 Paris - France)","ALMAnaCH - Automatic Language Modelling and ANAlysis & Computational Humanities (France)"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"SU - Sorbonne Universite\u0301 (21 rue de l\u2019\u00c9cole de m\u00e9decine - 75006 Paris - France)","institution_ids":["https://openalex.org/I39804081"]},{"raw_affiliation_string":"ALMAnaCH - Automatic Language Modelling and ANAlysis & Computational Humanities (France)","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5037164199","display_name":"\u00c9ric Villemonte de la Clergerie","orcid":"https://orcid.org/0000-0001-6428-9219"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"de la Clergerie, \u00c9ric","raw_affiliation_strings":["ALMAnaCH - Automatic Language Modelling and ANAlysis & Computational Humanities (France)"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"ALMAnaCH - Automatic Language Modelling and ANAlysis & Computational Humanities (France)","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5077663332","display_name":"Beno\u00eet Sagot","orcid":"https://orcid.org/0000-0002-0107-8526"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sagot, Beno\u00eet","raw_affiliation_strings":["ALMAnaCH - Automatic Language Modelling and ANAlysis & Computational Humanities (France)"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"ALMAnaCH - Automatic Language Modelling and ANAlysis & Computational Humanities (France)","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5071654594"],"corresponding_institution_ids":["https://openalex.org/I39804081"],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9997000098228455,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9997000098228455,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9995999932289124,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9836000204086304,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8557645082473755},{"id":"https://openalex.org/keywords/lexical-analysis","display_name":"Lexical analysis","score":0.8294929265975952},{"id":"https://openalex.org/keywords/robustness","display_name":"Robustness (evolution)","score":0.7533503770828247},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.645805835723877},{"id":"https://openalex.org/keywords/byte","display_name":"Byte","score":0.6154047846794128},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4850258231163025},{"id":"https://openalex.org/keywords/end-to-end-principle","display_name":"End-to-end principle","score":0.4460234045982361},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.42653876543045044},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.34699928760528564},{"id":"https://openalex.org/keywords/programming-language","display_name":"Programming language","score":0.1994885504245758}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8557645082473755},{"id":"https://openalex.org/C176982825","wikidata":"https://www.wikidata.org/wiki/Q835922","display_name":"Lexical analysis","level":2,"score":0.8294929265975952},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.7533503770828247},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.645805835723877},{"id":"https://openalex.org/C43364308","wikidata":"https://www.wikidata.org/wiki/Q8799","display_name":"Byte","level":2,"score":0.6154047846794128},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4850258231163025},{"id":"https://openalex.org/C74296488","wikidata":"https://www.wikidata.org/wiki/Q2527392","display_name":"End-to-end principle","level":2,"score":0.4460234045982361},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.42653876543045044},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.34699928760528564},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.1994885504245758},{"id":"https://openalex.org/C13280743","wikidata":"https://www.wikidata.org/wiki/Q131089","display_name":"Geodesy","level":1,"score":0.0},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.0},{"id":"https://openalex.org/C55493867","wikidata":"https://www.wikidata.org/wiki/Q7094","display_name":"Biochemistry","level":1,"score":0.0},{"id":"https://openalex.org/C104317684","wikidata":"https://www.wikidata.org/wiki/Q7187","display_name":"Gene","level":2,"score":0.0},{"id":"https://openalex.org/C205649164","wikidata":"https://www.wikidata.org/wiki/Q1071","display_name":"Geography","level":0,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2212.07284","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2212.07284","pdf_url":"https://arxiv.org/pdf/2212.07284","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":null},{"id":"doi:10.48550/arxiv.2212.07284","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2212.07284","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2212.07284","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2212.07284","pdf_url":"https://arxiv.org/pdf/2212.07284","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":null},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W4385571757","https://openalex.org/W4281478209","https://openalex.org/W2916997151","https://openalex.org/W4378498597","https://openalex.org/W2949174760","https://openalex.org/W3101140821","https://openalex.org/W4287816966","https://openalex.org/W3015650676","https://openalex.org/W4389520445","https://openalex.org/W4387800341"],"abstract_inverted_index":{"Static":[0],"subword":[1,73],"tokenization":[2],"algorithms":[3],"have":[4],"been":[5],"an":[6,85],"essential":[7],"component":[8],"of":[9,63,69,88],"recent":[10],"works":[11],"on":[12,97,107,132],"language":[13,52],"modeling.":[14],"However,":[15],"their":[16],"static":[17],"nature":[18],"results":[19],"in":[20],"important":[21],"flaws":[22],"that":[23,112,125,140],"degrade":[24],"the":[25,51,61,67,133],"models'":[26],"downstream":[27],"performance":[28],"and":[29,66,119],"robustness.":[30],"In":[31,75],"this":[32],"work,":[33],"we":[34,138],"propose":[35],"MANTa,":[36],"a":[37,45,58],"Module":[38],"for":[39],"Adaptive":[40],"Neural":[41],"TokenizAtion.":[42],"MANTa":[43,113,126],"is":[44,79,142],"differentiable":[46],"tokenizer":[47,78],"trained":[48,71],"end-to-end":[49],"with":[50],"model.":[53],"The":[54],"resulting":[55],"system":[56],"offers":[57],"trade-off":[59],"between":[60],"expressiveness":[62],"byte-level":[64,147],"models":[65,70,131],"speed":[68],"using":[72],"tokenization.":[74],"addition,":[76],"our":[77,94],"highly":[80],"explainable":[81],"since":[82],"it":[83,141],"produces":[84],"explicit":[86],"segmentation":[87],"sequences":[89],"into":[90],"blocks.":[91],"We":[92,110,122],"evaluate":[93],"pre-trained":[95],"model":[96],"several":[98],"English":[99],"datasets":[100],"from":[101],"different":[102],"domains":[103],"as":[104,106],"well":[105],"synthetic":[108],"noise.":[109],"find":[111],"improves":[114],"robustness":[115],"to":[116,129],"character":[117],"perturbations":[118],"out-of-domain":[120],"data.":[121],"then":[123],"show":[124,139],"performs":[127],"comparably":[128],"other":[130],"general-domain":[134],"GLUE":[135],"benchmark.":[136],"Finally,":[137],"considerably":[143],"faster":[144],"than":[145],"strictly":[146],"models.":[148]},"counts_by_year":[],"updated_date":"2026-06-10T14:10:52.464848","created_date":"2023-02-14T00:00:00"}
