{"id":"https://openalex.org/W7162539133","doi":"https://doi.org/10.48550/arxiv.2605.27258","title":"PilotTTS: A Disciplined Modular Recipe for Competitive Speech Synthesis","display_name":"PilotTTS: A Disciplined Modular Recipe for Competitive Speech Synthesis","publication_year":2026,"publication_date":"2026-05-26","ids":{"openalex":"https://openalex.org/W7162539133","doi":"https://doi.org/10.48550/arxiv.2605.27258"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.27258","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.27258","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.27258","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5137147112","display_name":"Bowen Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Bowen","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5000235693","display_name":"Shaotong Guo","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Guo, Shaotong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137150533","display_name":"Zhen Wang","orcid":"https://orcid.org/0009-0005-5376-8325"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Zhen","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137149731","display_name":"Yang Xiang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xiang, Yang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5078798920","display_name":"Mingli Jin","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jin, Mingli","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137128394","display_name":"Yihang Lin","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lin, Yihang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137163973","display_name":"Jiahui Zhao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhao, Jiahui","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5023506400","display_name":"Weibo Xiong","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xiong, Weibo","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137188437","display_name":"Dongrui Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Dongrui","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137135225","display_name":"Keming Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Keming","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137170142","display_name":"Yunze Gao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gao, Yunze","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5110499906","display_name":"Yuze Zhou","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lin, Zeyang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137109198","display_name":"Zeyang Lin","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhou, Yuze","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5137138363","display_name":"Yue Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Yue","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":14,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.47130000591278076,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.47130000591278076,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12380","display_name":"Authorship Attribution and Profiling","score":0.07320000231266022,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12488","display_name":"Mental Health via Writing","score":0.06689999997615814,"subfield":{"id":"https://openalex.org/subfields/3207","display_name":"Social Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/pipeline","display_name":"Pipeline (software)","score":0.8276000022888184},{"id":"https://openalex.org/keywords/paralanguage","display_name":"Paralanguage","score":0.7221999764442444},{"id":"https://openalex.org/keywords/modular-design","display_name":"Modular design","score":0.574400007724762},{"id":"https://openalex.org/keywords/recipe","display_name":"Recipe","score":0.5263000130653381},{"id":"https://openalex.org/keywords/code","display_name":"Code (set theory)","score":0.4618000090122223},{"id":"https://openalex.org/keywords/speech-synthesis","display_name":"Speech synthesis","score":0.42100000381469727},{"id":"https://openalex.org/keywords/similarity","display_name":"Similarity (geometry)","score":0.4185999929904938},{"id":"https://openalex.org/keywords/quality","display_name":"Quality (philosophy)","score":0.39169999957084656}],"concepts":[{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.8276000022888184},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7674000263214111},{"id":"https://openalex.org/C133378560","wikidata":"https://www.wikidata.org/wiki/Q1753225","display_name":"Paralanguage","level":2,"score":0.7221999764442444},{"id":"https://openalex.org/C101468663","wikidata":"https://www.wikidata.org/wiki/Q1620158","display_name":"Modular design","level":2,"score":0.574400007724762},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5390999913215637},{"id":"https://openalex.org/C2778671685","wikidata":"https://www.wikidata.org/wiki/Q219239","display_name":"Recipe","level":2,"score":0.5263000130653381},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.4618000090122223},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.45080000162124634},{"id":"https://openalex.org/C14999030","wikidata":"https://www.wikidata.org/wiki/Q16346","display_name":"Speech synthesis","level":2,"score":0.42100000381469727},{"id":"https://openalex.org/C103278499","wikidata":"https://www.wikidata.org/wiki/Q254465","display_name":"Similarity (geometry)","level":3,"score":0.4185999929904938},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3961000144481659},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.39169999957084656},{"id":"https://openalex.org/C123657996","wikidata":"https://www.wikidata.org/wiki/Q12271","display_name":"Architecture","level":2,"score":0.35249999165534973},{"id":"https://openalex.org/C2778355321","wikidata":"https://www.wikidata.org/wiki/Q17079427","display_name":"Identity (music)","level":2,"score":0.35100001096725464},{"id":"https://openalex.org/C2780366754","wikidata":"https://www.wikidata.org/wiki/Q7494857","display_name":"Speech translation","level":3,"score":0.3327000141143799},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.313400000333786},{"id":"https://openalex.org/C16910744","wikidata":"https://www.wikidata.org/wiki/Q7705759","display_name":"Test data","level":2,"score":0.3061000108718872},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.27869999408721924},{"id":"https://openalex.org/C67186912","wikidata":"https://www.wikidata.org/wiki/Q367664","display_name":"Data modeling","level":2,"score":0.2685999870300293},{"id":"https://openalex.org/C504749915","wikidata":"https://www.wikidata.org/wiki/Q9010971","display_name":"Speech technology","level":3,"score":0.26179999113082886},{"id":"https://openalex.org/C2164484","wikidata":"https://www.wikidata.org/wiki/Q5170150","display_name":"Core (optical fiber)","level":2,"score":0.2578999996185303},{"id":"https://openalex.org/C98025372","wikidata":"https://www.wikidata.org/wiki/Q477538","display_name":"Systems architecture","level":3,"score":0.25619998574256897}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.27258","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.27258","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.27258","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.27258","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Building":[0],"state-of-the-art":[1],"text-to-speech":[2],"(TTS)":[3],"systems":[4,155],"typically":[5],"demands":[6],"millions":[7],"of":[8,10,53,132,138],"hours":[9,52],"proprietary":[11],"data":[12,44,54,68,165],"and":[13,42,76,78,116,142,152,170],"complex":[14],"multi-stage":[15,67],"architectures,":[16],"creating":[17],"substantial":[18],"barriers":[19],"for":[20],"resource-constrained":[21],"research":[22],"teams.":[23],"In":[24],"this":[25],"report,":[26],"we":[27],"present":[28],"PilotTTS,":[29],"a":[30,65,80,100,136],"lightweight":[31],"autoregressive":[32],"TTS":[33],"system":[34],"that":[35,84],"achieves":[36,128],"competitive":[37],"performance":[38],"through":[39],"minimalist":[40],"architecture":[41,83],"rigorous":[43],"engineering.":[45],"PilotTTS":[46,103,127],"is":[47],"trained":[48,156],"on":[49,134,140,147,157],"only":[50],"200K":[51],"processed":[55],"entirely":[56],"with":[57],"open-source":[58],"tools.":[59],"Specifically,":[60],"our":[61],"contributions":[62],"are:":[63],"(1)":[64],"reproducible":[66],"processing":[69],"pipeline":[70,166],"covering":[71],"quality":[72],"assessment,":[73],"label":[74],"annotation,":[75],"filtering,":[77],"(2)":[79],"compact":[81],"model":[82],"employs":[85],"Q-Former-based":[86],"conditioning":[87],"to":[88],"decouple":[89],"speaker":[90,145],"identity":[91],"from":[92],"speaking":[93],"style":[94],"via":[95],"cross-sample":[96],"paired":[97],"training.":[98],"Within":[99],"unified":[101],"framework,":[102],"supports":[104],"zero-shot":[105],"voice":[106],"cloning,":[107],"emotion":[108],"synthesis":[109,113,119],"(11":[110],"categories),":[111,115],"paralinguistic":[112],"(4":[114],"Chinese":[117],"dialect":[118],"(14":[120],"dialects).":[121],"On":[122],"the":[123,129,143,163],"Seed-TTS":[124],"Eval":[125],"benchmark,":[126],"lowest":[130],"WER":[131],"1.50%":[133],"test-en,":[135],"CER":[137],"0.87%":[139],"test-zh,":[141],"highest":[144],"similarity":[146],"both":[148],"test":[149],"sets":[150],"(0.862":[151],"0.815),":[153],"outperforming":[154],"significantly":[158],"larger":[159],"datasets.":[160],"We":[161],"release":[162],"complete":[164],"recipe,":[167],"pretrained":[168],"weights,":[169],"code":[171],"at":[172],"https://github.com/AMAPVOICE/PilotTTS.":[173]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-05-28T00:00:00"}
