{"id":"https://openalex.org/W4408353088","doi":"https://doi.org/10.1109/icassp49660.2025.10889461","title":"DPI-TTS: Directional Patch Interaction for Fast-Converging and Style Temporal Modeling in Text-to-Speech","display_name":"DPI-TTS: Directional Patch Interaction for Fast-Converging and Style Temporal Modeling in Text-to-Speech","publication_year":2025,"publication_date":"2025-03-12","ids":{"openalex":"https://openalex.org/W4408353088","doi":"https://doi.org/10.1109/icassp49660.2025.10889461"},"language":"en","primary_location":{"id":"doi:10.1109/icassp49660.2025.10889461","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp49660.2025.10889461","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2025 - 2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5115076772","display_name":"Xin Qi","orcid":"https://orcid.org/0000-0001-9279-7296"},"institutions":[{"id":"https://openalex.org/I4210112150","display_name":"Institute of Automation","ror":"https://ror.org/022c3hy66","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210112150"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xin Qi","raw_affiliation_strings":["Chinese Academy of Science,Institute of Automation,Beijing,China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Chinese Academy of Science,Institute of Automation,Beijing,China","institution_ids":["https://openalex.org/I4210112150"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5073918837","display_name":"Ruibo Fu","orcid":"https://orcid.org/0000-0001-9598-1881"},"institutions":[{"id":"https://openalex.org/I4210112150","display_name":"Institute of Automation","ror":"https://ror.org/022c3hy66","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210112150"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Ruibo Fu","raw_affiliation_strings":["Chinese Academy of Science,Institute of Automation,Beijing,China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Chinese Academy of Science,Institute of Automation,Beijing,China","institution_ids":["https://openalex.org/I4210112150"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5111856667","display_name":"Zhengqi Wen","orcid":null},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhengqi Wen","raw_affiliation_strings":["Tsinghua University,Beijing National Research Center for Information Science and Technology,Beijing,China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Tsinghua University,Beijing National Research Center for Information Science and Technology,Beijing,China","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100453420","display_name":"Tao Wang","orcid":"https://orcid.org/0000-0001-8099-9704"},"institutions":[{"id":"https://openalex.org/I4210112150","display_name":"Institute of Automation","ror":"https://ror.org/022c3hy66","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210112150"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Tao Wang","raw_affiliation_strings":["Chinese Academy of Science,Institute of Automation,Beijing,China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Chinese Academy of Science,Institute of Automation,Beijing,China","institution_ids":["https://openalex.org/I4210112150"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5028353824","display_name":"Chunyu Qiang","orcid":"https://orcid.org/0009-0007-2290-3074"},"institutions":[{"id":"https://openalex.org/I4210112150","display_name":"Institute of Automation","ror":"https://ror.org/022c3hy66","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210112150"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Chunyu Qiang","raw_affiliation_strings":["Chinese Academy of Science,Institute of Automation,Beijing,China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Chinese Academy of Science,Institute of Automation,Beijing,China","institution_ids":["https://openalex.org/I4210112150"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100744502","display_name":"Jianhua Tao","orcid":"https://orcid.org/0000-0002-0477-587X"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jianhua Tao","raw_affiliation_strings":["Tsinghua University,Beijing National Research Center for Information Science and Technology,Beijing,China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Tsinghua University,Beijing National Research Center for Information Science and Technology,Beijing,China","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5050682332","display_name":"Chenxing Li","orcid":"https://orcid.org/0000-0002-3997-8212"},"institutions":[{"id":"https://openalex.org/I2250653659","display_name":"Tencent (China)","ror":"https://ror.org/00hhjss72","country_code":"CN","type":"company","lineage":["https://openalex.org/I2250653659"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Chenxing Li","raw_affiliation_strings":["Tencent,AI Lab,Beijing,China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Tencent,AI Lab,Beijing,China","institution_ids":["https://openalex.org/I2250653659"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102848539","display_name":"Yi Lu","orcid":"https://orcid.org/0000-0001-7652-313X"},"institutions":[{"id":"https://openalex.org/I4210112150","display_name":"Institute of Automation","ror":"https://ror.org/022c3hy66","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210112150"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yi Lu","raw_affiliation_strings":["Chinese Academy of Science,Institute of Automation,Beijing,China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Chinese Academy of Science,Institute of Automation,Beijing,China","institution_ids":["https://openalex.org/I4210112150"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5111241695","display_name":"Shuchen Shi","orcid":null},"institutions":[{"id":"https://openalex.org/I4210112150","display_name":"Institute of Automation","ror":"https://ror.org/022c3hy66","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210112150"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Shuchen Shi","raw_affiliation_strings":["Chinese Academy of Science,Institute of Automation,Beijing,China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Chinese Academy of Science,Institute of Automation,Beijing,China","institution_ids":["https://openalex.org/I4210112150"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100443935","display_name":"Zhiyong Wang","orcid":"https://orcid.org/0000-0002-8043-0312"},"institutions":[{"id":"https://openalex.org/I4210112150","display_name":"Institute of Automation","ror":"https://ror.org/022c3hy66","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210112150"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhiyong Wang","raw_affiliation_strings":["Chinese Academy of Science,Institute of Automation,Beijing,China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Chinese Academy of Science,Institute of Automation,Beijing,China","institution_ids":["https://openalex.org/I4210112150"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5111252012","display_name":"Xiaopeng Wang","orcid":null},"institutions":[{"id":"https://openalex.org/I4210112150","display_name":"Institute of Automation","ror":"https://ror.org/022c3hy66","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210112150"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xiaopeng Wang","raw_affiliation_strings":["Chinese Academy of Science,Institute of Automation,Beijing,China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Chinese Academy of Science,Institute of Automation,Beijing,China","institution_ids":["https://openalex.org/I4210112150"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5044573835","display_name":"Yuankun Xie","orcid":"https://orcid.org/0000-0002-8366-9011"},"institutions":[{"id":"https://openalex.org/I4210112150","display_name":"Institute of Automation","ror":"https://ror.org/022c3hy66","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210112150"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yuankun Xie","raw_affiliation_strings":["Chinese Academy of Science,Institute of Automation,Beijing,China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Chinese Academy of Science,Institute of Automation,Beijing,China","institution_ids":["https://openalex.org/I4210112150"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5113270837","display_name":"Yukun Liu","orcid":null},"institutions":[{"id":"https://openalex.org/I4210112150","display_name":"Institute of Automation","ror":"https://ror.org/022c3hy66","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210112150"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yukun Liu","raw_affiliation_strings":["Chinese Academy of Science,Institute of Automation,Beijing,China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Chinese Academy of Science,Institute of Automation,Beijing,China","institution_ids":["https://openalex.org/I4210112150"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5104324475","display_name":"Xuefei Liu","orcid":null},"institutions":[{"id":"https://openalex.org/I4210112150","display_name":"Institute of Automation","ror":"https://ror.org/022c3hy66","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210112150"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xuefei Liu","raw_affiliation_strings":["Chinese Academy of Science,Institute of Automation,Beijing,China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Chinese Academy of Science,Institute of Automation,Beijing,China","institution_ids":["https://openalex.org/I4210112150"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5101679484","display_name":"Guanjun Li","orcid":"https://orcid.org/0000-0002-8200-5360"},"institutions":[{"id":"https://openalex.org/I4210112150","display_name":"Institute of Automation","ror":"https://ror.org/022c3hy66","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210112150"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Guanjun Li","raw_affiliation_strings":["Chinese Academy of Science,Institute of Automation,Beijing,China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Chinese Academy of Science,Institute of Automation,Beijing,China","institution_ids":["https://openalex.org/I4210112150"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":15,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":1.7588,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.85570113,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":95,"max":98},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"5"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9890000224113464,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9890000224113464,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9797999858856201,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9724000096321106,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7041667103767395},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.6130313873291016},{"id":"https://openalex.org/keywords/style","display_name":"Style (visual arts)","score":0.552083432674408},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.3324007987976074},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.3305777907371521},{"id":"https://openalex.org/keywords/history","display_name":"History","score":0.04949057102203369}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7041667103767395},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6130313873291016},{"id":"https://openalex.org/C2776445246","wikidata":"https://www.wikidata.org/wiki/Q1792644","display_name":"Style (visual arts)","level":2,"score":0.552083432674408},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3324007987976074},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3305777907371521},{"id":"https://openalex.org/C95457728","wikidata":"https://www.wikidata.org/wiki/Q309","display_name":"History","level":0,"score":0.04949057102203369},{"id":"https://openalex.org/C166957645","wikidata":"https://www.wikidata.org/wiki/Q23498","display_name":"Archaeology","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icassp49660.2025.10889461","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp49660.2025.10889461","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2025 - 2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/10","score":0.4099999964237213,"display_name":"Reduced inequalities"}],"awards":[],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"},{"id":"https://openalex.org/F4320335869","display_name":"National Social Science Fund of China","ror":null}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":30,"referenced_works":["https://openalex.org/W1901129140","https://openalex.org/W2964243274","https://openalex.org/W3198213150","https://openalex.org/W4224931676","https://openalex.org/W4225566824","https://openalex.org/W4285605725","https://openalex.org/W4304099317","https://openalex.org/W4387968164","https://openalex.org/W4388979610","https://openalex.org/W4390872297","https://openalex.org/W4391020683","https://openalex.org/W4391021751","https://openalex.org/W4401609042","https://openalex.org/W4402389607","https://openalex.org/W6610566761","https://openalex.org/W6640963894","https://openalex.org/W6739901393","https://openalex.org/W6763832098","https://openalex.org/W6778823374","https://openalex.org/W6783182287","https://openalex.org/W6795261426","https://openalex.org/W6796464841","https://openalex.org/W6838630680","https://openalex.org/W6840257003","https://openalex.org/W6851724922","https://openalex.org/W6862059674","https://openalex.org/W6869275737","https://openalex.org/W6869691399","https://openalex.org/W6917585676","https://openalex.org/W6936113694"],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2899084033","https://openalex.org/W2748952813","https://openalex.org/W2390279801","https://openalex.org/W4391913857","https://openalex.org/W2358668433","https://openalex.org/W4396701345","https://openalex.org/W2376932109","https://openalex.org/W2001405890","https://openalex.org/W3204019825"],"abstract_inverted_index":{"In":[0],"recent":[1],"years,":[2],"speech":[3,30],"diffusion":[4],"models":[5,16,31],"have":[6,23],"advanced":[7],"rapidly.":[8],"Alongside":[9],"the":[10,19,40,90,93,118,128],"widely":[11],"used":[12],"U-Net":[13],"architecture,":[14],"transformer-based":[15],"such":[17],"as":[18,35],"Diffusion":[20],"Transformer":[21],"(DiT)":[22],"also":[24],"gained":[25],"attention.":[26],"However,":[27],"current":[28],"DiT":[29,64],"treat":[32],"Mel":[33],"spectrograms":[34],"general":[36],"images,":[37],"which":[38,61],"overlooks":[39],"specific":[41],"acoustic":[42,87],"properties":[43],"of":[44,92],"speech.":[45,95],"To":[46],"address":[47],"these":[48],"limitations,":[49],"we":[50,97],"propose":[51],"a":[52,75,99],"method":[53,104,116],"called":[54],"Directional":[55],"Patch":[56],"Interaction":[57],"for":[58],"Text-to-Speech":[59],"(DPI-TTS),":[60],"builds":[62],"on":[63],"and":[65,125],"achieves":[66],"fast":[67],"training":[68,119],"without":[69],"compromising":[70],"accuracy.":[71],"Notably,":[72],"DPI-TTS":[73],"employs":[74],"low-to-high":[76],"frequency,":[77],"frame-by-frame":[78],"progressive":[79],"inference":[80],"approach":[81],"that":[82,105,114],"aligns":[83],"more":[84],"closely":[85],"with":[86],"properties,":[88],"enhancing":[89],"naturalness":[91],"generated":[94],"Additionally,":[96],"introduce":[98],"fine-grained":[100],"style":[101,109],"temporal":[102],"modeling":[103],"further":[106],"improves":[107],"speaker":[108],"similarity.":[110],"Experimental":[111],"results":[112],"demonstrate":[113],"our":[115],"increases":[117],"speed":[120],"by":[121],"nearly":[122],"2":[123],"times":[124],"significantly":[126],"outperforms":[127],"baseline":[129],"models.":[130]},"counts_by_year":[{"year":2026,"cited_by_count":1}],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-10T00:00:00"}
