{"id":"https://openalex.org/W7154225534","doi":"https://doi.org/10.48550/arxiv.2604.11110","title":"Ti-Audio: The First Multi-Dialectal End-to-End Speech LLM for Tibetan","display_name":"Ti-Audio: The First Multi-Dialectal End-to-End Speech LLM for Tibetan","publication_year":2026,"publication_date":"2026-04-13","ids":{"openalex":"https://openalex.org/W7154225534","doi":"https://doi.org/10.48550/arxiv.2604.11110"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.11110","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.11110","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Preprint"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.11110","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5133610787","display_name":"Jialing Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Jialing","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133593914","display_name":"Yue Zhao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhao, Yue","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133586413","display_name":"Yuhao Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Yuhao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133576295","display_name":"Jing Yu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yu, Jing","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133623568","display_name":"Shaosai Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Shaosai","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133580151","display_name":"Zhanchen Dai","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Dai, Zhanchen","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133561308","display_name":"Benyou Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Benyou","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5133557316","display_name":"Haizhou Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Haizhou","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":0,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.8679999709129333,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.8679999709129333,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.02810000069439411,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.013899999670684338,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/scarcity","display_name":"Scarcity","score":0.6021000146865845},{"id":"https://openalex.org/keywords/leverage","display_name":"Leverage (statistics)","score":0.5598000288009644},{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.42910000681877136},{"id":"https://openalex.org/keywords/acoustic-model","display_name":"Acoustic model","score":0.3246999979019165},{"id":"https://openalex.org/keywords/speech-processing","display_name":"Speech processing","score":0.32420000433921814},{"id":"https://openalex.org/keywords/training-set","display_name":"Training set","score":0.3009999990463257}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7243000268936157},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6128000020980835},{"id":"https://openalex.org/C109747225","wikidata":"https://www.wikidata.org/wiki/Q815758","display_name":"Scarcity","level":2,"score":0.6021000146865845},{"id":"https://openalex.org/C153083717","wikidata":"https://www.wikidata.org/wiki/Q6535263","display_name":"Leverage (statistics)","level":2,"score":0.5598000288009644},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.42910000681877136},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.4221000075340271},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4020000100135803},{"id":"https://openalex.org/C155635449","wikidata":"https://www.wikidata.org/wiki/Q4674699","display_name":"Acoustic model","level":3,"score":0.3246999979019165},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.32420000433921814},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.3009999990463257},{"id":"https://openalex.org/C504749915","wikidata":"https://www.wikidata.org/wiki/Q9010971","display_name":"Speech technology","level":3,"score":0.29420000314712524},{"id":"https://openalex.org/C91863865","wikidata":"https://www.wikidata.org/wiki/Q4349497","display_name":"Speech corpus","level":3,"score":0.2816999852657318},{"id":"https://openalex.org/C177284502","wikidata":"https://www.wikidata.org/wiki/Q1005390","display_name":"Adapter (computing)","level":2,"score":0.27070000767707825},{"id":"https://openalex.org/C18555067","wikidata":"https://www.wikidata.org/wiki/Q8375051","display_name":"Joint (building)","level":2,"score":0.26910001039505005},{"id":"https://openalex.org/C18762648","wikidata":"https://www.wikidata.org/wiki/Q42213","display_name":"Work (physics)","level":2,"score":0.2572000026702881},{"id":"https://openalex.org/C2776182073","wikidata":"https://www.wikidata.org/wiki/Q7575395","display_name":"Speech enhancement","level":3,"score":0.25609999895095825}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.11110","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.11110","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"Preprint"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.11110","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.11110","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Preprint"},"sustainable_development_goals":[{"display_name":"Partnerships for the goals","score":0.41352444887161255,"id":"https://metadata.un.org/sdg/17"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Recent":[0],"advances":[1],"in":[2,19,153],"Speech":[3],"Large":[4],"Language":[5],"Models":[6],"(Speech-LLMs)":[7],"have":[8],"made":[9],"significant":[10],"progress,":[11],"greatly":[12],"enhancing":[13],"multimodal":[14],"interaction":[15],"capabilities.However,":[16],"their":[17],"application":[18],"low-resource":[20,154],"and":[21,44,68,107,132,143],"dialect-diverse":[22],"environments":[23],"still":[24],"faces":[25],"challenges.":[26],"The":[27],"severe":[28],"scarcity":[29,106],"of":[30,50,140,151],"Tibetan":[31,126],"data,":[32],"coupled":[33],"with":[34,89],"the":[35,57,93,138,149],"phonetic":[36],"differences":[37],"among":[38,100],"its":[39],"major":[40],"dialects":[41,102],"(\u00dc-Tsang,":[42],"Amdo,":[43],"Kham),":[45],"is":[46],"a":[47,72,109,145],"prime":[48],"example":[49],"this":[51,115],"challenge.":[52],"This":[53],"paper":[54],"proposes":[55],"Ti-Audio,":[56],"first":[58],"multi-dialectal":[59],"end-to-end":[60],"Speech-LLM":[61,152],"for":[62,128,148],"Tibetan.":[63],"To":[64],"efficiently":[65],"align":[66],"speech":[67,130,133],"text,":[69],"we":[70,96],"introduce":[71],"Dynamic":[73],"Q-Former":[74],"Adapter":[75],"that":[76,120],"extracts":[77],"essential":[78],"acoustic":[79],"features":[80],"from":[81],"variable-length":[82],"speech,":[83],"ensuring":[84],"stable":[85],"cross-modal":[86],"alignment":[87],"even":[88],"limited":[90],"data.":[91],"At":[92],"data":[94,105],"level,":[95],"leverage":[97],"mutual":[98],"assistance":[99],"related":[101],"to":[103,113],"alleviate":[104],"employ":[108],"temperature-based":[110],"sampling":[111],"strategy":[112],"maximize":[114],"synergy.":[116],"Experimental":[117],"results":[118],"demonstrate":[119],"Ti-Audio":[121],"achieves":[122],"state-of-the-art":[123],"performance":[124],"on":[125],"benchmarks":[127],"automatic":[129],"recognition":[131],"translation.":[134],"Our":[135],"work":[136],"validates":[137],"effectiveness":[139],"cross-dialectal":[141],"cooperation":[142],"provides":[144],"scalable":[146],"paradigm":[147],"development":[150],"scenarios.":[155]},"counts_by_year":[],"updated_date":"2026-07-01T08:55:40.977307","created_date":"2026-04-15T00:00:00"}
