{"id":"https://openalex.org/W2402539796","doi":"https://doi.org/10.1109/icassp.2016.7472730","title":"From HMMS to DNNS: Where do the improvements come from?","display_name":"From HMMS to DNNS: Where do the improvements come from?","publication_year":2016,"publication_date":"2016-03-01","ids":{"openalex":"https://openalex.org/W2402539796","doi":"https://doi.org/10.1109/icassp.2016.7472730","mag":"2402539796"},"language":"en","primary_location":{"id":"doi:10.1109/icassp.2016.7472730","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp.2016.7472730","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2016 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5110238677","display_name":"Oliver Watts","orcid":null},"institutions":[{"id":"https://openalex.org/I98677209","display_name":"University of Edinburgh","ror":"https://ror.org/01nrxwf90","country_code":"GB","type":"education","lineage":["https://openalex.org/I98677209"]}],"countries":["GB"],"is_corresponding":true,"raw_author_name":"Oliver Watts","raw_affiliation_strings":["The Centre for Speech Technology Research, University of Edinburgh, United Kingdom"],"affiliations":[{"raw_affiliation_string":"The Centre for Speech Technology Research, University of Edinburgh, United Kingdom","institution_ids":["https://openalex.org/I98677209"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5091432739","display_name":"Gustav Eje Henter","orcid":"https://orcid.org/0000-0002-1643-1054"},"institutions":[{"id":"https://openalex.org/I98677209","display_name":"University of Edinburgh","ror":"https://ror.org/01nrxwf90","country_code":"GB","type":"education","lineage":["https://openalex.org/I98677209"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Gustav Eje Henter","raw_affiliation_strings":["The Centre for Speech Technology Research, University of Edinburgh, United Kingdom"],"affiliations":[{"raw_affiliation_string":"The Centre for Speech Technology Research, University of Edinburgh, United Kingdom","institution_ids":["https://openalex.org/I98677209"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5038417409","display_name":"Thomas Merritt","orcid":null},"institutions":[{"id":"https://openalex.org/I98677209","display_name":"University of Edinburgh","ror":"https://ror.org/01nrxwf90","country_code":"GB","type":"education","lineage":["https://openalex.org/I98677209"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Thomas Merritt","raw_affiliation_strings":["The Centre for Speech Technology Research, University of Edinburgh, United Kingdom"],"affiliations":[{"raw_affiliation_string":"The Centre for Speech Technology Research, University of Edinburgh, United Kingdom","institution_ids":["https://openalex.org/I98677209"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102765381","display_name":"Zhizheng Wu","orcid":"https://orcid.org/0009-0001-1192-9857"},"institutions":[{"id":"https://openalex.org/I98677209","display_name":"University of Edinburgh","ror":"https://ror.org/01nrxwf90","country_code":"GB","type":"education","lineage":["https://openalex.org/I98677209"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Zhizheng Wu","raw_affiliation_strings":["The Centre for Speech Technology Research, University of Edinburgh, United Kingdom"],"affiliations":[{"raw_affiliation_string":"The Centre for Speech Technology Research, University of Edinburgh, United Kingdom","institution_ids":["https://openalex.org/I98677209"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5062516688","display_name":"Simon King","orcid":"https://orcid.org/0000-0002-2694-2843"},"institutions":[{"id":"https://openalex.org/I98677209","display_name":"University of Edinburgh","ror":"https://ror.org/01nrxwf90","country_code":"GB","type":"education","lineage":["https://openalex.org/I98677209"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Simon King","raw_affiliation_strings":["The Centre for Speech Technology Research, University of Edinburgh, United Kingdom"],"affiliations":[{"raw_affiliation_string":"The Centre for Speech Technology Research, University of Edinburgh, United Kingdom","institution_ids":["https://openalex.org/I98677209"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5110238677"],"corresponding_institution_ids":["https://openalex.org/I98677209"],"apc_list":null,"apc_paid":null,"fwci":12.96633644,"has_fulltext":false,"cited_by_count":67,"citation_normalized_percentile":{"value":0.99406486,"is_in_top_1_percent":true,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":89,"max":100},"biblio":{"volume":null,"issue":null,"first_page":"5505","last_page":"5509"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9997000098228455,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9983999729156494,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/naturalness","display_name":"Naturalness","score":0.8614716529846191},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7804121971130371},{"id":"https://openalex.org/keywords/hidden-markov-model","display_name":"Hidden Markov model","score":0.7384311556816101},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.5792416334152222},{"id":"https://openalex.org/keywords/deep-neural-networks","display_name":"Deep neural networks","score":0.5532318949699402},{"id":"https://openalex.org/keywords/frame","display_name":"Frame (networking)","score":0.5428478717803955},{"id":"https://openalex.org/keywords/decision-tree","display_name":"Decision tree","score":0.5277156233787537},{"id":"https://openalex.org/keywords/focus","display_name":"Focus (optics)","score":0.5171229839324951},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.48736101388931274},{"id":"https://openalex.org/keywords/parametric-statistics","display_name":"Parametric statistics","score":0.4627513885498047},{"id":"https://openalex.org/keywords/state","display_name":"State (computer science)","score":0.43749162554740906},{"id":"https://openalex.org/keywords/artificial-neural-network","display_name":"Artificial neural network","score":0.42609328031539917},{"id":"https://openalex.org/keywords/machine-learning","display_name":"Machine learning","score":0.4110979437828064},{"id":"https://openalex.org/keywords/algorithm","display_name":"Algorithm","score":0.10512563586235046},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.08404451608657837}],"concepts":[{"id":"https://openalex.org/C134537474","wikidata":"https://www.wikidata.org/wiki/Q17144832","display_name":"Naturalness","level":2,"score":0.8614716529846191},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7804121971130371},{"id":"https://openalex.org/C23224414","wikidata":"https://www.wikidata.org/wiki/Q176769","display_name":"Hidden Markov model","level":2,"score":0.7384311556816101},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5792416334152222},{"id":"https://openalex.org/C2984842247","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep neural networks","level":3,"score":0.5532318949699402},{"id":"https://openalex.org/C126042441","wikidata":"https://www.wikidata.org/wiki/Q1324888","display_name":"Frame (networking)","level":2,"score":0.5428478717803955},{"id":"https://openalex.org/C84525736","wikidata":"https://www.wikidata.org/wiki/Q831366","display_name":"Decision tree","level":2,"score":0.5277156233787537},{"id":"https://openalex.org/C192209626","wikidata":"https://www.wikidata.org/wiki/Q190909","display_name":"Focus (optics)","level":2,"score":0.5171229839324951},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.48736101388931274},{"id":"https://openalex.org/C117251300","wikidata":"https://www.wikidata.org/wiki/Q1849855","display_name":"Parametric statistics","level":2,"score":0.4627513885498047},{"id":"https://openalex.org/C48103436","wikidata":"https://www.wikidata.org/wiki/Q599031","display_name":"State (computer science)","level":2,"score":0.43749162554740906},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.42609328031539917},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.4110979437828064},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.10512563586235046},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.08404451608657837},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0},{"id":"https://openalex.org/C76155785","wikidata":"https://www.wikidata.org/wiki/Q418","display_name":"Telecommunications","level":1,"score":0.0},{"id":"https://openalex.org/C105795698","wikidata":"https://www.wikidata.org/wiki/Q12483","display_name":"Statistics","level":1,"score":0.0},{"id":"https://openalex.org/C120665830","wikidata":"https://www.wikidata.org/wiki/Q14620","display_name":"Optics","level":1,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1109/icassp.2016.7472730","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp.2016.7472730","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2016 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},{"id":"pmh:oai:pure.ed.ac.uk:publications/befaa174-b0ab-428f-9e37-5ec795486a8c","is_oa":false,"landing_page_url":"https://www.research.ed.ac.uk/en/publications/befaa174-b0ab-428f-9e37-5ec795486a8c","pdf_url":null,"source":{"id":"https://openalex.org/S4306400320","display_name":"Edinburgh Research Explorer (University of Edinburgh)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I98677209","host_organization_name":"University of Edinburgh","host_organization_lineage":["https://openalex.org/I98677209"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":""}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Peace, Justice and strong institutions","id":"https://metadata.un.org/sdg/16","score":0.8199999928474426}],"awards":[{"id":"https://openalex.org/G4055593462","display_name":null,"funder_award_id":"EP/I031022/1","funder_id":"https://openalex.org/F4320334627","funder_display_name":"Engineering and Physical Sciences Research Council"}],"funders":[{"id":"https://openalex.org/F4320334627","display_name":"Engineering and Physical Sciences Research Council","ror":"https://ror.org/0439y7842"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":31,"referenced_works":["https://openalex.org/W192649799","https://openalex.org/W202879582","https://openalex.org/W1499332833","https://openalex.org/W1502723613","https://openalex.org/W1544516254","https://openalex.org/W1982456116","https://openalex.org/W1990505856","https://openalex.org/W1998362482","https://openalex.org/W2049686551","https://openalex.org/W2059610484","https://openalex.org/W2102003408","https://openalex.org/W2129142580","https://openalex.org/W2134973740","https://openalex.org/W2152175008","https://openalex.org/W2157867825","https://openalex.org/W2188827208","https://openalex.org/W2294797155","https://openalex.org/W2394658756","https://openalex.org/W2400063444","https://openalex.org/W2401318443","https://openalex.org/W2404881427","https://openalex.org/W2405144467","https://openalex.org/W2806733747","https://openalex.org/W4231807801","https://openalex.org/W6608197479","https://openalex.org/W6675380101","https://openalex.org/W6696843773","https://openalex.org/W6712026646","https://openalex.org/W6713281572","https://openalex.org/W6713860685","https://openalex.org/W6752004635"],"related_works":["https://openalex.org/W2029561777","https://openalex.org/W1554502231","https://openalex.org/W172797710","https://openalex.org/W2945105049","https://openalex.org/W4387098302","https://openalex.org/W2626699140","https://openalex.org/W3165080709","https://openalex.org/W2948317131","https://openalex.org/W4288365855","https://openalex.org/W2111961547"],"abstract_inverted_index":{"Deep":[0],"neural":[1],"networks":[2],"(DNNs)":[3],"have":[4,31],"recently":[5],"been":[6,32],"the":[7,35,48,53,68,72,81,106,109,115,175],"focus":[8],"of":[9,37,47,67,108,117,136,170],"much":[10,46,60],"text-to-speech":[11],"research":[12],"as":[13],"a":[14,134,141,147],"replacement":[15],"for":[16,102],"decision":[17,73,153],"trees":[18,74,85,154],"and":[19,58,83,157],"hidden":[20],"Markov":[21],"models":[22],"(HMMs)":[23],"in":[24,75],"statistical":[25],"parametric":[26],"synthesis":[27],"systems.":[28,69,176],"Performance":[29],"improvements":[30],"reported;":[33],"however,":[34],"configuration":[36],"systems":[38,77,95,137],"evaluated":[39],"makes":[40],"it":[41],"impossible":[42],"to":[43,52,63,88,98,161,181,187],"judge":[44],"how":[45,59],"improvement":[49,178],"is":[50,61,144,179],"due":[51,62],"new":[54],"machine":[55],"learning":[56,121],"methods,":[57],"other":[64],"novel":[65],"aspects":[66],"Specifically,":[70],"whereas":[71],"HMM-based":[76],"typically":[78],"operate":[79],"at":[80,105,146],"state-level,":[82],"separate":[84,90,127],"are":[86,96],"used":[87],"handle":[89],"acoustic":[91,110],"streams,":[92],"most":[93],"DNN-based":[94],"trained":[97],"make":[99],"predictions":[100,163],"simultaneously":[101],"all":[103],"streams":[104],"level":[107],"frame.":[111],"This":[112],"paper":[113],"isolates":[114],"influence":[116],"three":[118],"factors":[119],"(machine":[120],"method;":[122],"state":[123],"vs.":[124,128],"frame":[125],"predictions;":[126],"combined":[129],"stream":[130],"predictions)":[131],"by":[132,174],"building":[133],"continuum":[135],"along":[138],"which":[139],"only":[140],"single":[142],"factor":[143],"varied":[145],"time.":[148],"We":[149],"find":[150],"that":[151],"replacing":[152],"with":[155],"DNNs":[156],"moving":[158],"from":[159,183,185],"state-level":[160],"frame-level":[162],"both":[164],"significantly":[165],"improve":[166],"listeners'":[167],"naturalness":[168],"ratings":[169],"synthetic":[171],"speech":[172],"produced":[173],"No":[177],"found":[180],"result":[182],"switching":[184],"separate-stream":[186],"combined-stream":[188],"predictions.":[189]},"counts_by_year":[{"year":2025,"cited_by_count":1},{"year":2023,"cited_by_count":1},{"year":2022,"cited_by_count":5},{"year":2021,"cited_by_count":6},{"year":2020,"cited_by_count":8},{"year":2019,"cited_by_count":13},{"year":2018,"cited_by_count":12},{"year":2017,"cited_by_count":13},{"year":2016,"cited_by_count":8}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
