{"id":"https://openalex.org/W4226021270","doi":"https://doi.org/10.1109/asru51503.2021.9687904","title":"Low-Latency Incremental Text-to-Speech Synthesis with Distilled Context Prediction Network","display_name":"Low-Latency Incremental Text-to-Speech Synthesis with Distilled Context Prediction Network","publication_year":2021,"publication_date":"2021-12-13","ids":{"openalex":"https://openalex.org/W4226021270","doi":"https://doi.org/10.1109/asru51503.2021.9687904"},"language":"en","primary_location":{"id":"doi:10.1109/asru51503.2021.9687904","is_oa":false,"landing_page_url":"https://doi.org/10.1109/asru51503.2021.9687904","pdf_url":null,"source":{"id":"https://openalex.org/S4363606113","display_name":"2021 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2021 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5025983709","display_name":"Takaaki Saeki","orcid":"https://orcid.org/0000-0001-6003-768X"},"institutions":[{"id":"https://openalex.org/I74801974","display_name":"The University of Tokyo","ror":"https://ror.org/057zh3y96","country_code":"JP","type":"education","lineage":["https://openalex.org/I74801974"]}],"countries":["JP"],"is_corresponding":true,"raw_author_name":"Takaaki Saeki","raw_affiliation_strings":["Graduate School of Information Science and Technology, The University of Tokyo,Japan","Graduate School of Information Science and Technology, The University of Tokyo, Japan"],"affiliations":[{"raw_affiliation_string":"Graduate School of Information Science and Technology, The University of Tokyo,Japan","institution_ids":["https://openalex.org/I74801974"]},{"raw_affiliation_string":"Graduate School of Information Science and Technology, The University of Tokyo, Japan","institution_ids":["https://openalex.org/I74801974"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5013050263","display_name":"Shinnosuke Takamichi","orcid":"https://orcid.org/0000-0003-0520-7847"},"institutions":[{"id":"https://openalex.org/I74801974","display_name":"The University of Tokyo","ror":"https://ror.org/057zh3y96","country_code":"JP","type":"education","lineage":["https://openalex.org/I74801974"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Shinnosuke Takamichi","raw_affiliation_strings":["Graduate School of Information Science and Technology, The University of Tokyo,Japan","Graduate School of Information Science and Technology, The University of Tokyo, Japan"],"affiliations":[{"raw_affiliation_string":"Graduate School of Information Science and Technology, The University of Tokyo,Japan","institution_ids":["https://openalex.org/I74801974"]},{"raw_affiliation_string":"Graduate School of Information Science and Technology, The University of Tokyo, Japan","institution_ids":["https://openalex.org/I74801974"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5003814223","display_name":"Hiroshi Saruwatari","orcid":"https://orcid.org/0000-0003-0876-5617"},"institutions":[{"id":"https://openalex.org/I74801974","display_name":"The University of Tokyo","ror":"https://ror.org/057zh3y96","country_code":"JP","type":"education","lineage":["https://openalex.org/I74801974"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Hiroshi Saruwatari","raw_affiliation_strings":["Graduate School of Information Science and Technology, The University of Tokyo,Japan","Graduate School of Information Science and Technology, The University of Tokyo, Japan"],"affiliations":[{"raw_affiliation_string":"Graduate School of Information Science and Technology, The University of Tokyo,Japan","institution_ids":["https://openalex.org/I74801974"]},{"raw_affiliation_string":"Graduate School of Information Science and Technology, The University of Tokyo, Japan","institution_ids":["https://openalex.org/I74801974"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5025983709"],"corresponding_institution_ids":["https://openalex.org/I74801974"],"apc_list":null,"apc_paid":null,"fwci":0.377,"has_fulltext":false,"cited_by_count":7,"citation_normalized_percentile":{"value":0.62824827,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":89,"max":98},"biblio":{"volume":null,"issue":null,"first_page":"749","last_page":"756"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12031","display_name":"Speech and dialogue systems","score":0.9987999796867371,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9980000257492065,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8451339602470398},{"id":"https://openalex.org/keywords/latency","display_name":"Latency (audio)","score":0.7624216675758362},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.6512298583984375},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.641899049282074},{"id":"https://openalex.org/keywords/context","display_name":"Context (archaeology)","score":0.574065089225769},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.5733014345169067},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.516809344291687},{"id":"https://openalex.org/keywords/context-model","display_name":"Context model","score":0.4832618832588196},{"id":"https://openalex.org/keywords/speech-synthesis","display_name":"Speech synthesis","score":0.47466567158699036},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.46279263496398926},{"id":"https://openalex.org/keywords/embedding","display_name":"Embedding","score":0.44218388199806213},{"id":"https://openalex.org/keywords/machine-learning","display_name":"Machine learning","score":0.3367305397987366}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8451339602470398},{"id":"https://openalex.org/C82876162","wikidata":"https://www.wikidata.org/wiki/Q17096504","display_name":"Latency (audio)","level":2,"score":0.7624216675758362},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.6512298583984375},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.641899049282074},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.574065089225769},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5733014345169067},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.516809344291687},{"id":"https://openalex.org/C183322885","wikidata":"https://www.wikidata.org/wiki/Q17007702","display_name":"Context model","level":3,"score":0.4832618832588196},{"id":"https://openalex.org/C14999030","wikidata":"https://www.wikidata.org/wiki/Q16346","display_name":"Speech synthesis","level":2,"score":0.47466567158699036},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.46279263496398926},{"id":"https://openalex.org/C41608201","wikidata":"https://www.wikidata.org/wiki/Q980509","display_name":"Embedding","level":2,"score":0.44218388199806213},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3367305397987366},{"id":"https://openalex.org/C86803240","wikidata":"https://www.wikidata.org/wiki/Q420","display_name":"Biology","level":0,"score":0.0},{"id":"https://openalex.org/C151730666","wikidata":"https://www.wikidata.org/wiki/Q7205","display_name":"Paleontology","level":1,"score":0.0},{"id":"https://openalex.org/C76155785","wikidata":"https://www.wikidata.org/wiki/Q418","display_name":"Telecommunications","level":1,"score":0.0},{"id":"https://openalex.org/C2781238097","wikidata":"https://www.wikidata.org/wiki/Q175026","display_name":"Object (grammar)","level":2,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/asru51503.2021.9687904","is_oa":false,"landing_page_url":"https://doi.org/10.1109/asru51503.2021.9687904","pdf_url":null,"source":{"id":"https://openalex.org/S4363606113","display_name":"2021 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2021 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Quality Education","score":0.8399999737739563,"id":"https://metadata.un.org/sdg/4"}],"awards":[],"funders":[{"id":"https://openalex.org/F4320320907","display_name":"Japan Science and Technology Corporation","ror":"https://ror.org/00097mb19"}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":54,"referenced_works":["https://openalex.org/W1494198834","https://openalex.org/W1522301498","https://openalex.org/W1647671624","https://openalex.org/W1821462560","https://openalex.org/W1975163393","https://openalex.org/W2129336405","https://openalex.org/W2134797427","https://openalex.org/W2141708418","https://openalex.org/W2212680675","https://openalex.org/W2493916176","https://openalex.org/W2526425061","https://openalex.org/W2565875961","https://openalex.org/W2794490148","https://openalex.org/W2896457183","https://openalex.org/W2903739847","https://openalex.org/W2924902521","https://openalex.org/W2946200149","https://openalex.org/W2962780374","https://openalex.org/W2963300588","https://openalex.org/W2963609956","https://openalex.org/W2964243274","https://openalex.org/W2966983573","https://openalex.org/W2972895078","https://openalex.org/W3033411150","https://openalex.org/W3034560159","https://openalex.org/W3048023795","https://openalex.org/W3083224111","https://openalex.org/W3092316169","https://openalex.org/W3095389792","https://openalex.org/W3100608856","https://openalex.org/W3104081910","https://openalex.org/W3105422997","https://openalex.org/W3114579299","https://openalex.org/W3156563027","https://openalex.org/W3160248152","https://openalex.org/W3161782335","https://openalex.org/W3197407562","https://openalex.org/W4287761884","https://openalex.org/W4288112596","https://openalex.org/W6631190155","https://openalex.org/W6636915900","https://openalex.org/W6638523607","https://openalex.org/W6679262520","https://openalex.org/W6679909955","https://openalex.org/W6736996214","https://openalex.org/W6750226204","https://openalex.org/W6750489868","https://openalex.org/W6755207826","https://openalex.org/W6755592152","https://openalex.org/W6760732026","https://openalex.org/W6763832098","https://openalex.org/W6778823374","https://openalex.org/W6779337556","https://openalex.org/W6785521828"],"related_works":["https://openalex.org/W3107474891","https://openalex.org/W4320486724","https://openalex.org/W2963058055","https://openalex.org/W142374489","https://openalex.org/W2794438528","https://openalex.org/W2359001871","https://openalex.org/W3197304116","https://openalex.org/W3184187848","https://openalex.org/W2596494451","https://openalex.org/W4226021270"],"abstract_inverted_index":{"Incremental":[0],"text-to-speech":[1],"(TTS)":[2],"synthesis":[3,169],"generates":[4],"utterances":[5],"in":[6],"small":[7],"linguistic":[8],"units":[9],"for":[10,41,59,70],"the":[11,42,60,73,92,105,131,142,173,182],"sake":[12],"of":[13,54,68,101,135,160,177,184],"real-time":[14,188],"and":[15,164],"low-latency":[16],"applications.":[17,189],"We":[18,109],"previously":[19],"proposed":[20,143],"an":[21,85],"incremental":[22,86,168],"TTS":[23,87],"method":[24,47,56,88,144,186],"that":[25,53,57,89,141,159],"leverages":[26],"a":[27,55,65,97,114,120,126],"large":[28],"pre-trained":[29],"language":[30,74,107],"model":[31,75,123],"to":[32,52,152,158,187],"take":[33],"unobserved":[34,93],"future":[35,61,94],"context":[36,95,116,132],"into":[37,119],"account":[38],"without":[39],"waiting":[40],"subsequent":[43],"segment.":[44],"Although":[45],"this":[46,81],"achieves":[48],"comparable":[49,154],"speech":[50,156],"quality":[51,157],"waits":[58],"context,":[62],"it":[63,165],"entails":[64],"huge":[66],"amount":[67],"processing":[69],"sampling":[71,102],"from":[72,104,113],"at":[76],"each":[77],"time":[78,151],"step.":[79],"In":[80],"paper,":[82],"we":[83],"propose":[84],"directly":[90],"predicts":[91],"with":[96],"lightweight":[98],"model,":[99],"instead":[100],"words":[103],"large-scale":[106],"model.":[108],"perform":[110,167],"knowledge":[111],"distillation":[112],"GPT2-based":[115],"prediction":[117],"network":[118],"simple":[121],"recurrent":[122],"by":[124],"minimizing":[125],"teacher-student":[127],"loss":[128],"defined":[129],"between":[130],"embedding":[133],"vectors":[134],"those":[136],"models.":[137],"Experimental":[138],"results":[139],"show":[140],"requires":[145],"about":[146],"ten":[147],"times":[148],"less":[149],"inference":[150],"achieve":[153],"synthetic":[155],"our":[161,185],"previous":[162],"method,":[163],"can":[166],"much":[170],"faster":[171],"than":[172],"average":[174],"speaking":[175],"speed":[176],"human":[178],"English":[179],"speakers,":[180],"demonstrating":[181],"availability":[183]},"counts_by_year":[{"year":2025,"cited_by_count":4},{"year":2024,"cited_by_count":1},{"year":2023,"cited_by_count":1},{"year":2022,"cited_by_count":1}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
