{"id":"https://openalex.org/W7111348468","doi":"https://doi.org/10.48550/arxiv.2512.06169","title":"Morphologically-Informed Tokenizers for Languages with Non-Concatenative Morphology: A case study of Yolox\u00f3chtil Mixtec ASR","display_name":"Morphologically-Informed Tokenizers for Languages with Non-Concatenative Morphology: A case study of Yolox\u00f3chtil Mixtec ASR","publication_year":2025,"publication_date":"2025-12-05","ids":{"openalex":"https://openalex.org/W7111348468","doi":"https://doi.org/10.48550/arxiv.2512.06169"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2512.06169","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2512.06169","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2512.06169","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Crawford, Chris","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Crawford, Chris","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":1,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.6492000222206116,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.6492000222206116,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.1388999968767166,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10403","display_name":"Phonetics and Phonology Research","score":0.11219999939203262,"subfield":{"id":"https://openalex.org/subfields/3205","display_name":"Experimental and Cognitive Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/word-error-rate","display_name":"Word error rate","score":0.5486999750137329},{"id":"https://openalex.org/keywords/segmentation","display_name":"Segmentation","score":0.5303000211715698},{"id":"https://openalex.org/keywords/word","display_name":"Word (group theory)","score":0.4629000127315521},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.413100004196167},{"id":"https://openalex.org/keywords/annotation","display_name":"Annotation","score":0.4065000116825104},{"id":"https://openalex.org/keywords/sequence","display_name":"Sequence (biology)","score":0.3928000032901764},{"id":"https://openalex.org/keywords/lexical-analysis","display_name":"Lexical analysis","score":0.39089998602867126},{"id":"https://openalex.org/keywords/downstream","display_name":"Downstream (manufacturing)","score":0.3653999865055084},{"id":"https://openalex.org/keywords/character","display_name":"Character (mathematics)","score":0.36250001192092896}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8051999807357788},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5800999999046326},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5579000115394592},{"id":"https://openalex.org/C40969351","wikidata":"https://www.wikidata.org/wiki/Q3516228","display_name":"Word error rate","level":2,"score":0.5486999750137329},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.5467000007629395},{"id":"https://openalex.org/C89600930","wikidata":"https://www.wikidata.org/wiki/Q1423946","display_name":"Segmentation","level":2,"score":0.5303000211715698},{"id":"https://openalex.org/C90805587","wikidata":"https://www.wikidata.org/wiki/Q10944557","display_name":"Word (group theory)","level":2,"score":0.4629000127315521},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.413100004196167},{"id":"https://openalex.org/C2776321320","wikidata":"https://www.wikidata.org/wiki/Q857525","display_name":"Annotation","level":2,"score":0.4065000116825104},{"id":"https://openalex.org/C2778112365","wikidata":"https://www.wikidata.org/wiki/Q3511065","display_name":"Sequence (biology)","level":2,"score":0.3928000032901764},{"id":"https://openalex.org/C176982825","wikidata":"https://www.wikidata.org/wiki/Q835922","display_name":"Lexical analysis","level":2,"score":0.39089998602867126},{"id":"https://openalex.org/C2776207758","wikidata":"https://www.wikidata.org/wiki/Q5303302","display_name":"Downstream (manufacturing)","level":2,"score":0.3653999865055084},{"id":"https://openalex.org/C2780861071","wikidata":"https://www.wikidata.org/wiki/Q1062934","display_name":"Character (mathematics)","level":2,"score":0.36250001192092896},{"id":"https://openalex.org/C158622935","wikidata":"https://www.wikidata.org/wiki/Q660848","display_name":"Nonlinear system","level":2,"score":0.3377000093460083},{"id":"https://openalex.org/C98501671","wikidata":"https://www.wikidata.org/wiki/Q1948408","display_name":"Text segmentation","level":3,"score":0.335999995470047},{"id":"https://openalex.org/C2983335612","wikidata":"https://www.wikidata.org/wiki/Q54277","display_name":"Word processing","level":2,"score":0.32580000162124634},{"id":"https://openalex.org/C774472","wikidata":"https://www.wikidata.org/wiki/Q6760393","display_name":"Margin (machine learning)","level":2,"score":0.31360000371932983},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.28519999980926514},{"id":"https://openalex.org/C103088060","wikidata":"https://www.wikidata.org/wiki/Q1062839","display_name":"Error detection and correction","level":2,"score":0.2838999927043915},{"id":"https://openalex.org/C2778476105","wikidata":"https://www.wikidata.org/wiki/Q628539","display_name":"Workload","level":2,"score":0.2694000005722046},{"id":"https://openalex.org/C125932096","wikidata":"https://www.wikidata.org/wiki/Q205472","display_name":"Zipf's law","level":2,"score":0.26829999685287476},{"id":"https://openalex.org/C2776502983","wikidata":"https://www.wikidata.org/wiki/Q690182","display_name":"Contrast (vision)","level":2,"score":0.26269999146461487},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.26179999113082886},{"id":"https://openalex.org/C3018824978","wikidata":"https://www.wikidata.org/wiki/Q2894891","display_name":"Error analysis","level":2,"score":0.26109999418258667},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.25429999828338623}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2512.06169","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2512.06169","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2512.06169","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2512.06169","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"display_name":"Quality Education","score":0.7061120867729187,"id":"https://metadata.un.org/sdg/4"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"This":[0],"paper":[1],"investigates":[2],"the":[3,13,35,42,81,96,129,145,176,199],"impact":[4],"of":[5,17,21,28,37,44,71,90,137,179,201],"using":[6,25],"morphologically-informed":[7],"tokenizers":[8,120,134,154,172,203],"to":[9,105,160,197],"aid":[10],"and":[11,30,76,108,125,128,157,187],"streamline":[12],"interlinear":[14],"gloss":[15],"annotation":[16],"an":[18,101],"audio":[19],"corpus":[20],"Yolox\u00f3chitl":[22],"Mixtec":[23],"(YM)":[24],"a":[26,45,58,74,88,112,180],"combination":[27],"ASR":[29,103],"text-based":[31],"sequence-to-sequence":[32],"tools,":[33],"with":[34,123,164,184],"goal":[36],"improving":[38],"efficiency":[39],"while":[40],"reducing":[41],"workload":[43],"human":[46],"annotator.":[47],"We":[48,115],"present":[49],"two":[50],"novel":[51,119],"tokenization":[52],"schemes":[53],"that":[54,117,170],"separate":[55],"words":[56],"in":[57,111,135,204],"nonlinear":[59,171],"manner,":[60],"preserving":[61],"information":[62],"about":[63],"tonal":[64],"morphology":[65,178],"as":[66,68],"much":[67],"possible.":[69],"One":[70],"these":[72,118,202],"approaches,":[73],"Segment":[75],"Melody":[77],"tokenizer,":[78,92],"simply":[79],"extracts":[80],"tones":[82],"without":[83],"predicting":[84],"segmentation.":[85],"The":[86],"other,":[87],"Sequence":[89],"Processes":[91],"predicts":[93],"segmentation":[94],"for":[95,175,190],"words,":[97],"which":[98],"could":[99],"allow":[100],"end-to-end":[102],"system":[104],"produce":[106],"segmented":[107],"unsegmented":[109],"transcriptions":[110],"single":[113],"pass.":[114],"find":[116,161],"are":[121,182],"competitive":[122,183],"BPE":[124,186],"Unigram":[126,188],"models,":[127],"Segment-and-Melody":[130],"model":[131],"outperforms":[132],"traditional":[133],"terms":[136],"word":[138],"error":[139,148],"rate":[140],"but":[141],"does":[142],"not":[143],"reach":[144],"same":[146],"character":[147],"rate.":[149],"In":[150],"addition,":[151],"we":[152],"analyze":[153],"on":[155],"morphological":[156],"information-theoretic":[158],"metrics":[159],"predictive":[162],"correlations":[163],"downstream":[165,205],"performance.":[166],"Our":[167],"results":[168],"suggest":[169],"designed":[173],"specifically":[174],"non-concatenative":[177],"language":[181],"conventional":[185],"models":[189],"ASR.":[191],"Further":[192],"research":[193],"will":[194],"be":[195],"necessary":[196],"determine":[198],"applicability":[200],"processing":[206],"tasks.":[207]},"counts_by_year":[],"updated_date":"2025-12-10T02:49:46.989445","created_date":"2025-12-10T00:00:00"}
