{"id":"https://openalex.org/W1491120356","doi":"https://doi.org/10.1109/icassp.2015.7178916","title":"Multi-task deep neural network acoustic models with model adaptation using discriminative speaker identity for whisper recognition","display_name":"Multi-task deep neural network acoustic models with model adaptation using discriminative speaker identity for whisper recognition","publication_year":2015,"publication_date":"2015-04-01","ids":{"openalex":"https://openalex.org/W1491120356","doi":"https://doi.org/10.1109/icassp.2015.7178916","mag":"1491120356"},"language":"en","primary_location":{"id":"doi:10.1109/icassp.2015.7178916","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp.2015.7178916","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2015 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100647035","display_name":"Jingjie Li","orcid":"https://orcid.org/0000-0002-8006-7824"},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Jingjie Li","raw_affiliation_strings":["National Engineering Laboratory of Speech and Language Information Processing, University of Science and Technology of China","National Engineering Laboratory of Speech and Language Information Processing, University of Science and Technology of China, China"],"affiliations":[{"raw_affiliation_string":"National Engineering Laboratory of Speech and Language Information Processing, University of Science and Technology of China","institution_ids":["https://openalex.org/I126520041"]},{"raw_affiliation_string":"National Engineering Laboratory of Speech and Language Information Processing, University of Science and Technology of China, China","institution_ids":["https://openalex.org/I126520041"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5000620878","display_name":"Ian McLoughlin","orcid":"https://orcid.org/0000-0001-7111-2008"},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Ian McLoughlin","raw_affiliation_strings":["National Engineering Laboratory of Speech and Language Information Processing, University of Science and Technology of China","National Engineering Laboratory of Speech and Language Information Processing, University of Science and Technology of China, China"],"affiliations":[{"raw_affiliation_string":"National Engineering Laboratory of Speech and Language Information Processing, University of Science and Technology of China","institution_ids":["https://openalex.org/I126520041"]},{"raw_affiliation_string":"National Engineering Laboratory of Speech and Language Information Processing, University of Science and Technology of China, China","institution_ids":["https://openalex.org/I126520041"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101409897","display_name":"Cong Liu","orcid":"https://orcid.org/0000-0002-2665-7153"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cong Liu","raw_affiliation_strings":["iFlytek Research","Flytek Research, China"],"affiliations":[{"raw_affiliation_string":"iFlytek Research","institution_ids":[]},{"raw_affiliation_string":"Flytek Research, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5083184776","display_name":"Shaofei Xue","orcid":"https://orcid.org/0000-0001-6199-7965"},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Shaofei Xue","raw_affiliation_strings":["National Engineering Laboratory of Speech and Language Information Processing, University of Science and Technology of China","National Engineering Laboratory of Speech and Language Information Processing, University of Science and Technology of China, China"],"affiliations":[{"raw_affiliation_string":"National Engineering Laboratory of Speech and Language Information Processing, University of Science and Technology of China","institution_ids":["https://openalex.org/I126520041"]},{"raw_affiliation_string":"National Engineering Laboratory of Speech and Language Information Processing, University of Science and Technology of China, China","institution_ids":["https://openalex.org/I126520041"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5050660824","display_name":"Si Wei","orcid":"https://orcid.org/0009-0009-5748-699X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Si Wei","raw_affiliation_strings":["iFlytek Research","Flytek Research, China"],"affiliations":[{"raw_affiliation_string":"iFlytek Research","institution_ids":[]},{"raw_affiliation_string":"Flytek Research, China","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5100647035"],"corresponding_institution_ids":["https://openalex.org/I126520041"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":2,"citation_normalized_percentile":{"value":0.01474987,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":89,"max":94},"biblio":{"volume":null,"issue":null,"first_page":"4969","last_page":"4973"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9986000061035156,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9918000102043152,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.845861554145813},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.7969421744346619},{"id":"https://openalex.org/keywords/discriminative-model","display_name":"Discriminative model","score":0.7507474422454834},{"id":"https://openalex.org/keywords/mixture-model","display_name":"Mixture model","score":0.6200034618377686},{"id":"https://openalex.org/keywords/hidden-markov-model","display_name":"Hidden Markov model","score":0.60215824842453},{"id":"https://openalex.org/keywords/word-error-rate","display_name":"Word error rate","score":0.6019788384437561},{"id":"https://openalex.org/keywords/artificial-neural-network","display_name":"Artificial neural network","score":0.48030751943588257},{"id":"https://openalex.org/keywords/speaker-recognition","display_name":"Speaker recognition","score":0.4518893361091614},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.45091328024864197},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.4341660737991333},{"id":"https://openalex.org/keywords/acoustic-model","display_name":"Acoustic model","score":0.4293777346611023},{"id":"https://openalex.org/keywords/vocabulary","display_name":"Vocabulary","score":0.4170233905315399},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.39299729466438293},{"id":"https://openalex.org/keywords/speech-processing","display_name":"Speech processing","score":0.2625616192817688}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.845861554145813},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.7969421744346619},{"id":"https://openalex.org/C97931131","wikidata":"https://www.wikidata.org/wiki/Q5282087","display_name":"Discriminative model","level":2,"score":0.7507474422454834},{"id":"https://openalex.org/C61224824","wikidata":"https://www.wikidata.org/wiki/Q2260434","display_name":"Mixture model","level":2,"score":0.6200034618377686},{"id":"https://openalex.org/C23224414","wikidata":"https://www.wikidata.org/wiki/Q176769","display_name":"Hidden Markov model","level":2,"score":0.60215824842453},{"id":"https://openalex.org/C40969351","wikidata":"https://www.wikidata.org/wiki/Q3516228","display_name":"Word error rate","level":2,"score":0.6019788384437561},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.48030751943588257},{"id":"https://openalex.org/C133892786","wikidata":"https://www.wikidata.org/wiki/Q1145189","display_name":"Speaker recognition","level":2,"score":0.4518893361091614},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.45091328024864197},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.4341660737991333},{"id":"https://openalex.org/C155635449","wikidata":"https://www.wikidata.org/wiki/Q4674699","display_name":"Acoustic model","level":3,"score":0.4293777346611023},{"id":"https://openalex.org/C2777601683","wikidata":"https://www.wikidata.org/wiki/Q6499736","display_name":"Vocabulary","level":2,"score":0.4170233905315399},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.39299729466438293},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.2625616192817688},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C187736073","wikidata":"https://www.wikidata.org/wiki/Q2920921","display_name":"Management","level":1,"score":0.0},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.0},{"id":"https://openalex.org/C162324750","wikidata":"https://www.wikidata.org/wiki/Q8134","display_name":"Economics","level":0,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1109/icassp.2015.7178916","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp.2015.7178916","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2015 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},{"id":"pmh:oai:kar.kent.ac.uk:55019","is_oa":false,"landing_page_url":"https://doi.org/10.1109/ICASSP.2015.7178916>)","pdf_url":null,"source":{"id":"https://openalex.org/S4377196264","display_name":"Kent Academic Repository (University of Kent)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I20581793","host_organization_name":"University of Kent","host_organization_lineage":["https://openalex.org/I20581793"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"acceptedVersion","is_accepted":true,"is_published":false,"raw_source_name":null,"raw_type":"PeerReviewed"}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Reduced inequalities","id":"https://metadata.un.org/sdg/10","score":0.7200000286102295}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":24,"referenced_works":["https://openalex.org/W217970951","https://openalex.org/W1978660892","https://openalex.org/W1985371235","https://openalex.org/W1995735739","https://openalex.org/W2022047569","https://openalex.org/W2025198378","https://openalex.org/W2055589033","https://openalex.org/W2063689849","https://openalex.org/W2079623482","https://openalex.org/W2090320273","https://openalex.org/W2098196171","https://openalex.org/W2105948329","https://openalex.org/W2140442506","https://openalex.org/W2147768505","https://openalex.org/W2154169084","https://openalex.org/W2160130475","https://openalex.org/W2160800342","https://openalex.org/W2160815625","https://openalex.org/W2294108103","https://openalex.org/W2303272654","https://openalex.org/W2400434184","https://openalex.org/W6608710415","https://openalex.org/W6670225552","https://openalex.org/W6696757691"],"related_works":["https://openalex.org/W4324119469","https://openalex.org/W2164868312","https://openalex.org/W2160650576","https://openalex.org/W1197719229","https://openalex.org/W2381158726","https://openalex.org/W1992796048","https://openalex.org/W4245698648","https://openalex.org/W2405257913","https://openalex.org/W2129090883","https://openalex.org/W3133710586"],"abstract_inverted_index":{"This":[0],"paper":[1],"presents":[2],"a":[3,137,163,196,214],"study":[4],"on":[5,118,131],"large":[6],"vocabulary":[7],"continuous":[8],"whisper":[9,79,85,139,146,174],"automatic":[10],"recognition":[11],"(wLVCSR).":[12],"wLVCSR":[13,37],"provides":[14],"the":[15,34,47,55,82,119,148,173,177,180,193,210,220],"ability":[16],"to":[17,46,57,63,73,109,122,192],"use":[18],"ASR":[19],"equipment":[20],"in":[21,78,128],"public":[22],"places":[23],"without":[24],"concern":[25],"for":[26,88,94,183],"disturbing":[27],"others":[28],"or":[29],"leaking":[30],"private":[31],"information.":[32,135],"However":[33],"task":[35],"of":[36,49,61,84,145,179,195,204],"is":[38,90,116,190],"much":[39,65,91],"more":[40],"challenging":[41],"than":[42,67,93],"normal":[43,68,95,184],"LVCSR":[44],"due":[45],"absence":[48],"pitch":[50],"which":[51,189],"not":[52],"only":[53,171],"causes":[54],"signal":[56],"noise":[58],"ratio":[59],"(SNR)":[60],"whispers":[62,129],"be":[64],"lower":[66],"speech":[69,185,205],"but":[70],"also":[71],"leads":[72],"flatness":[74],"and":[75,125],"formant":[76],"shifts":[77],"spectra.":[80],"Furthermore,":[81],"amount":[83],"data":[86],"available":[87],"training":[89],"less":[92],"speech.":[96],"In":[97],"this":[98,208],"paper,":[99],"multi-task":[100,120,151],"deep":[101],"neural":[102],"network":[103],"(DNN)":[104],"acoustic":[105],"models":[106],"are":[107],"deployed":[108],"solve":[110],"these":[111],"problems.":[112],"Moreover,":[113],"model":[114,153,182],"adaptation":[115],"performed":[117],"DNN":[121,152,198,212],"normalize":[123],"speaker":[124,133],"environmental":[126],"variability":[127],"based":[130],"discriminative":[132],"identity":[134],"On":[136],"Mandarin":[138],"dictation":[140],"task,":[141],"with":[142,200],"55":[143],"hours":[144,203],"data,":[147],"proposed":[149,181],"SI":[150],"can":[154,186],"achieve":[155],"56.7%":[156],"character":[157],"error":[158],"rate":[159],"(CER)":[160],"improvement":[161],"over":[162,219],"baseline":[164],"Gaussian":[165],"Mixture":[166],"Model":[167],"(GMM),":[168],"discriminatively":[169],"trained":[170,199],"using":[172],"data.":[175,206],"Besides,":[176],"CER":[178,217],"reach":[187],"15.2%,":[188],"close":[191],"performance":[194],"state-of-the-art":[197],"one":[201],"thousand":[202],"From":[207],"baseline,":[209],"model-adapted":[211],"gains":[213],"further":[215],"10.9%":[216],"reduction":[218],"generic":[221],"model.":[222]},"counts_by_year":[{"year":2022,"cited_by_count":1},{"year":2020,"cited_by_count":1}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
