{"id":"https://openalex.org/W7131826554","doi":"https://doi.org/10.48550/arxiv.2602.23068","title":"TADA: A Generative Framework for Speech Modeling via Text-Acoustic Dual Alignment","display_name":"TADA: A Generative Framework for Speech Modeling via Text-Acoustic Dual Alignment","publication_year":2026,"publication_date":"2026-02-26","ids":{"openalex":"https://openalex.org/W7131826554","doi":"https://doi.org/10.48550/arxiv.2602.23068"},"language":null,"primary_location":{"id":"pmh:doi:10.48550/arxiv.2602.23068","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":null,"any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5109470009","display_name":"Trung Dinh Dang","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Dang, Trung","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5086812459","display_name":"Sharath S. Rao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Rao, Sharath","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5127060161","display_name":"Ananya Gupta","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gupta, Ananya","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5067816902","display_name":"Chris Gagne","orcid":"https://orcid.org/0000-0003-2241-5285"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gagne, Christopher","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5045277971","display_name":"Panagiotis Tzirakis","orcid":"https://orcid.org/0000-0001-9449-5339"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tzirakis, Panagiotis","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5030891676","display_name":"Alice Baird","orcid":"https://orcid.org/0000-0002-7003-5650"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Baird, Alice","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5127133495","display_name":"Jakub Piotr C\u0142apa","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"C\u0142apa, Jakub Piotr","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126495879","display_name":"Peter Chin","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chin, Peter","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5050693236","display_name":"A R Cowen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cowen, Alan","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":9,"corresponding_author_ids":["https://openalex.org/A5109470009"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.5580000281333923,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.5580000281333923,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.07240000367164612,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.07069999724626541,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/asynchronous-communication","display_name":"Asynchronous communication","score":0.542900025844574},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.4945000112056732},{"id":"https://openalex.org/keywords/leverage","display_name":"Leverage (statistics)","score":0.4456000030040741},{"id":"https://openalex.org/keywords/context","display_name":"Context (archaeology)","score":0.4293999969959259},{"id":"https://openalex.org/keywords/context-model","display_name":"Context model","score":0.4165000021457672},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.39989998936653137},{"id":"https://openalex.org/keywords/speech-processing","display_name":"Speech processing","score":0.3871999979019165},{"id":"https://openalex.org/keywords/bridge","display_name":"Bridge (graph theory)","score":0.3864000141620636},{"id":"https://openalex.org/keywords/synchronization","display_name":"Synchronization (alternating current)","score":0.37119999527931213},{"id":"https://openalex.org/keywords/security-token","display_name":"Security token","score":0.36399999260902405}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.795799970626831},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6072999835014343},{"id":"https://openalex.org/C151319957","wikidata":"https://www.wikidata.org/wiki/Q752739","display_name":"Asynchronous communication","level":2,"score":0.542900025844574},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.4945000112056732},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.45680001378059387},{"id":"https://openalex.org/C153083717","wikidata":"https://www.wikidata.org/wiki/Q6535263","display_name":"Leverage (statistics)","level":2,"score":0.4456000030040741},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.4293999969959259},{"id":"https://openalex.org/C183322885","wikidata":"https://www.wikidata.org/wiki/Q17007702","display_name":"Context model","level":3,"score":0.4165000021457672},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.39989998936653137},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.3871999979019165},{"id":"https://openalex.org/C100776233","wikidata":"https://www.wikidata.org/wiki/Q2532492","display_name":"Bridge (graph theory)","level":2,"score":0.3864000141620636},{"id":"https://openalex.org/C2778562939","wikidata":"https://www.wikidata.org/wiki/Q1298791","display_name":"Synchronization (alternating current)","level":3,"score":0.37119999527931213},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3702000081539154},{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.36399999260902405},{"id":"https://openalex.org/C2780226545","wikidata":"https://www.wikidata.org/wiki/Q6888030","display_name":"Modality (human\u2013computer interaction)","level":2,"score":0.3564999997615814},{"id":"https://openalex.org/C167966045","wikidata":"https://www.wikidata.org/wiki/Q5532625","display_name":"Generative model","level":3,"score":0.34630000591278076},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.3434000015258789},{"id":"https://openalex.org/C23224414","wikidata":"https://www.wikidata.org/wiki/Q176769","display_name":"Hidden Markov model","level":2,"score":0.3422999978065491},{"id":"https://openalex.org/C88485024","wikidata":"https://www.wikidata.org/wiki/Q1054571","display_name":"Cepstrum","level":2,"score":0.34060001373291016},{"id":"https://openalex.org/C13895895","wikidata":"https://www.wikidata.org/wiki/Q3270773","display_name":"Speech coding","level":2,"score":0.31949999928474426},{"id":"https://openalex.org/C2778112365","wikidata":"https://www.wikidata.org/wiki/Q3511065","display_name":"Sequence (biology)","level":2,"score":0.3154999911785126},{"id":"https://openalex.org/C2776230583","wikidata":"https://www.wikidata.org/wiki/Q1322198","display_name":"Spoken language","level":2,"score":0.3140999972820282},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.31349998712539673},{"id":"https://openalex.org/C39890363","wikidata":"https://www.wikidata.org/wiki/Q36108","display_name":"Generative grammar","level":2,"score":0.3100999891757965},{"id":"https://openalex.org/C77618280","wikidata":"https://www.wikidata.org/wiki/Q1155772","display_name":"Scheme (mathematics)","level":2,"score":0.3093000054359436},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.29910001158714294},{"id":"https://openalex.org/C14999030","wikidata":"https://www.wikidata.org/wiki/Q16346","display_name":"Speech synthesis","level":2,"score":0.2953000068664551},{"id":"https://openalex.org/C2780980858","wikidata":"https://www.wikidata.org/wiki/Q110022","display_name":"Dual (grammatical number)","level":2,"score":0.2906999886035919},{"id":"https://openalex.org/C204201278","wikidata":"https://www.wikidata.org/wiki/Q1332614","display_name":"Voice activity detection","level":3,"score":0.2822999954223633},{"id":"https://openalex.org/C2780704645","wikidata":"https://www.wikidata.org/wiki/Q9251458","display_name":"Observer (physics)","level":2,"score":0.2603999972343445},{"id":"https://openalex.org/C48677424","wikidata":"https://www.wikidata.org/wiki/Q6888088","display_name":"Mode (computer interface)","level":2,"score":0.2565999925136566},{"id":"https://openalex.org/C104247578","wikidata":"https://www.wikidata.org/wiki/Q4779368","display_name":"Aperiodic graph","level":2,"score":0.25429999828338623},{"id":"https://openalex.org/C174348530","wikidata":"https://www.wikidata.org/wiki/Q188635","display_name":"Bridging (networking)","level":2,"score":0.2502000033855438}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:doi:10.48550/arxiv.2602.23068","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},{"id":"doi:10.48550/arxiv.2602.23068","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2602.23068","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:doi:10.48550/arxiv.2602.23068","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"sustainable_development_goals":[{"score":0.4061051905155182,"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Modern":[0],"Text-to-Speech":[1],"(TTS)":[2],"systems":[3,19,165],"increasingly":[4],"leverage":[5],"Large":[6],"Language":[7],"Model":[8],"(LLM)":[9],"architectures":[10],"to":[11,122,142],"achieve":[12],"scalable,":[13],"high-fidelity,":[14],"zero-shot":[15],"generation.":[16],"However,":[17],"these":[18,93],"typically":[20],"rely":[21],"on":[22],"fixed-frame-rate":[23],"acoustic":[24,78],"tokenization,":[25],"resulting":[26],"in":[27,51,58,105],"speech":[28,125],"sequences":[29],"that":[30,72,92,134,154],"are":[31],"significantly":[32,178],"longer":[33],"than,":[34],"and":[35,53,80,100,139,163,171],"asynchronous":[36],"with":[37,114,160],"their":[38],"corresponding":[39],"text.":[40],"Beyond":[41],"computational":[42],"inefficiency,":[43],"this":[44,64],"sequence":[45],"length":[46],"disparity":[47],"often":[48],"triggers":[49],"hallucinations":[50,170],"TTS":[52,162],"amplifies":[54],"the":[55,120,128,145],"modality":[56,126],"gap":[57,146],"spoken":[59],"language":[60,112],"modeling":[61,86],"(SLM).":[62],"In":[63],"paper,":[65],"we":[66],"propose":[67],"a":[68,106,110,115,177],"novel":[69],"tokenization":[70],"scheme":[71],"establishes":[73],"one-to-one":[74],"synchronization":[75],"between":[76],"continuous":[77],"features":[79],"text":[81],"tokens,":[82],"enabling":[83],"unified,":[84],"single-stream":[85],"within":[87,127],"an":[88],"LLM.":[89],"We":[90],"demonstrate":[91],"synchronous":[94],"tokens":[95],"maintain":[96],"high-fidelity":[97],"audio":[98],"reconstruction":[99],"can":[101],"be":[102],"effectively":[103],"modeled":[104],"latent":[107],"space":[108],"by":[109],"large":[111],"model":[113],"flow":[116],"matching":[117],"head.":[118],"Moreover,":[119],"ability":[121],"seamlessly":[123],"toggle":[124],"context":[129],"enables":[130],"text-only":[131,138,148],"guidance--a":[132],"technique":[133],"blends":[135],"logits":[136],"from":[137],"text-speech":[140],"modes":[141],"flexibly":[143],"bridge":[144],"toward":[147],"LLM":[149],"intelligence.":[150],"Experimental":[151],"results":[152],"indicate":[153],"our":[155],"approach":[156],"achieves":[157],"performance":[158],"competitive":[159],"state-of-the-art":[161],"SLM":[164],"while":[166],"virtually":[167],"eliminating":[168],"content":[169],"preserving":[172],"linguistic":[173],"integrity,":[174],"all":[175],"at":[176],"reduced":[179],"inference":[180],"cost.":[181]},"counts_by_year":[],"updated_date":"2026-04-04T16:13:02.066488","created_date":"2026-02-28T00:00:00"}
