{"id":"https://openalex.org/W4388118206","doi":"https://doi.org/10.23919/eusipco58844.2023.10289823","title":"W2N-AVSC: Audiovisual Extension For Whisper-To-Normal Speech Conversion","display_name":"W2N-AVSC: Audiovisual Extension For Whisper-To-Normal Speech Conversion","publication_year":2023,"publication_date":"2023-09-04","ids":{"openalex":"https://openalex.org/W4388118206","doi":"https://doi.org/10.23919/eusipco58844.2023.10289823"},"language":"en","primary_location":{"id":"doi:10.23919/eusipco58844.2023.10289823","is_oa":false,"landing_page_url":"http://dx.doi.org/10.23919/eusipco58844.2023.10289823","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2023 31st European Signal Processing Conference (EUSIPCO)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5101792863","display_name":"Shogo Seki","orcid":"https://orcid.org/0000-0001-8284-188X"},"institutions":[{"id":"https://openalex.org/I2251713219","display_name":"NTT (Japan)","ror":"https://ror.org/00berct97","country_code":"JP","type":"company","lineage":["https://openalex.org/I2251713219"]}],"countries":["JP"],"is_corresponding":true,"raw_author_name":"Shogo Seki","raw_affiliation_strings":["NTT Corporation,NTT Communication Science Laboratories,Japan","NTT Communication Science Laboratories, NTT Corporation, Japan"],"affiliations":[{"raw_affiliation_string":"NTT Corporation,NTT Communication Science Laboratories,Japan","institution_ids":["https://openalex.org/I2251713219"]},{"raw_affiliation_string":"NTT Communication Science Laboratories, NTT Corporation, Japan","institution_ids":["https://openalex.org/I2251713219"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Kanami","orcid":null},"institutions":[{"id":"https://openalex.org/I161296585","display_name":"Tokyo University of Science","ror":"https://ror.org/05sj3n476","country_code":"JP","type":"education","lineage":["https://openalex.org/I161296585"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Kanami","raw_affiliation_strings":["Graduate school of information science and technology, the university of tokyo,Japan","Graduate school of information science and technology, the university of tokyo, Japan"],"affiliations":[{"raw_affiliation_string":"Graduate school of information science and technology, the university of tokyo,Japan","institution_ids":["https://openalex.org/I161296585"]},{"raw_affiliation_string":"Graduate school of information science and technology, the university of tokyo, Japan","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103961833","display_name":"Imamura","orcid":null},"institutions":[{"id":"https://openalex.org/I161296585","display_name":"Tokyo University of Science","ror":"https://ror.org/05sj3n476","country_code":"JP","type":"education","lineage":["https://openalex.org/I161296585"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Imamura","raw_affiliation_strings":["Graduate school of information science and technology, the university of tokyo,Japan","Graduate school of information science and technology, the university of tokyo, Japan"],"affiliations":[{"raw_affiliation_string":"Graduate school of information science and technology, the university of tokyo,Japan","institution_ids":["https://openalex.org/I161296585"]},{"raw_affiliation_string":"Graduate school of information science and technology, the university of tokyo, Japan","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5001243214","display_name":"Hirokazu Kameoka","orcid":"https://orcid.org/0000-0003-3102-0162"},"institutions":[{"id":"https://openalex.org/I2251713219","display_name":"NTT (Japan)","ror":"https://ror.org/00berct97","country_code":"JP","type":"company","lineage":["https://openalex.org/I2251713219"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Hirokazu Kameoka","raw_affiliation_strings":["NTT Corporation,NTT Communication Science Laboratories,Japan","NTT Communication Science Laboratories, NTT Corporation, Japan"],"affiliations":[{"raw_affiliation_string":"NTT Corporation,NTT Communication Science Laboratories,Japan","institution_ids":["https://openalex.org/I2251713219"]},{"raw_affiliation_string":"NTT Communication Science Laboratories, NTT Corporation, Japan","institution_ids":["https://openalex.org/I2251713219"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5020693766","display_name":"Takuhiro Kaneko","orcid":"https://orcid.org/0009-0000-8016-5144"},"institutions":[{"id":"https://openalex.org/I2251713219","display_name":"NTT (Japan)","ror":"https://ror.org/00berct97","country_code":"JP","type":"company","lineage":["https://openalex.org/I2251713219"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Takuhiro Kaneko","raw_affiliation_strings":["NTT Corporation,NTT Communication Science Laboratories,Japan","NTT Communication Science Laboratories, NTT Corporation, Japan"],"affiliations":[{"raw_affiliation_string":"NTT Corporation,NTT Communication Science Laboratories,Japan","institution_ids":["https://openalex.org/I2251713219"]},{"raw_affiliation_string":"NTT Communication Science Laboratories, NTT Corporation, Japan","institution_ids":["https://openalex.org/I2251713219"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5106710403","display_name":"Kou Tanaka","orcid":"https://orcid.org/0009-0003-7107-607X"},"institutions":[{"id":"https://openalex.org/I2251713219","display_name":"NTT (Japan)","ror":"https://ror.org/00berct97","country_code":"JP","type":"company","lineage":["https://openalex.org/I2251713219"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Kou Tanaka","raw_affiliation_strings":["NTT Corporation,NTT Communication Science Laboratories,Japan","NTT Communication Science Laboratories, NTT Corporation, Japan"],"affiliations":[{"raw_affiliation_string":"NTT Corporation,NTT Communication Science Laboratories,Japan","institution_ids":["https://openalex.org/I2251713219"]},{"raw_affiliation_string":"NTT Communication Science Laboratories, NTT Corporation, Japan","institution_ids":["https://openalex.org/I2251713219"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5054467679","display_name":"Noboru Harada","orcid":"https://orcid.org/0000-0002-1759-4533"},"institutions":[{"id":"https://openalex.org/I2251713219","display_name":"NTT (Japan)","ror":"https://ror.org/00berct97","country_code":"JP","type":"company","lineage":["https://openalex.org/I2251713219"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Noboru Harada","raw_affiliation_strings":["NTT Corporation,NTT Communication Science Laboratories,Japan","NTT Communication Science Laboratories, NTT Corporation, Japan"],"affiliations":[{"raw_affiliation_string":"NTT Corporation,NTT Communication Science Laboratories,Japan","institution_ids":["https://openalex.org/I2251713219"]},{"raw_affiliation_string":"NTT Communication Science Laboratories, NTT Corporation, Japan","institution_ids":["https://openalex.org/I2251713219"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5101792863"],"corresponding_institution_ids":["https://openalex.org/I2251713219"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.14481208,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"296","last_page":"300"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9962000250816345,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9904999732971191,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/extension","display_name":"Extension (predicate logic)","score":0.6301500797271729},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6023076176643372},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.5961732864379883},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.5606318116188049},{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.5205243825912476},{"id":"https://openalex.org/keywords/autoencoder","display_name":"Autoencoder","score":0.459838330745697},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.3275344967842102},{"id":"https://openalex.org/keywords/artificial-neural-network","display_name":"Artificial neural network","score":0.1785687804222107},{"id":"https://openalex.org/keywords/engineering","display_name":"Engineering","score":0.09333699941635132}],"concepts":[{"id":"https://openalex.org/C2778029271","wikidata":"https://www.wikidata.org/wiki/Q5421931","display_name":"Extension (predicate logic)","level":2,"score":0.6301500797271729},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6023076176643372},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5961732864379883},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.5606318116188049},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.5205243825912476},{"id":"https://openalex.org/C101738243","wikidata":"https://www.wikidata.org/wiki/Q786435","display_name":"Autoencoder","level":3,"score":0.459838330745697},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3275344967842102},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.1785687804222107},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.09333699941635132},{"id":"https://openalex.org/C199539241","wikidata":"https://www.wikidata.org/wiki/Q7748","display_name":"Law","level":1,"score":0.0},{"id":"https://openalex.org/C94625758","wikidata":"https://www.wikidata.org/wiki/Q7163","display_name":"Politics","level":2,"score":0.0},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.0},{"id":"https://openalex.org/C17744445","wikidata":"https://www.wikidata.org/wiki/Q36442","display_name":"Political science","level":0,"score":0.0},{"id":"https://openalex.org/C201995342","wikidata":"https://www.wikidata.org/wiki/Q682496","display_name":"Systems engineering","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.23919/eusipco58844.2023.10289823","is_oa":false,"landing_page_url":"http://dx.doi.org/10.23919/eusipco58844.2023.10289823","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2023 31st European Signal Processing Conference (EUSIPCO)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","score":0.6499999761581421,"display_name":"Quality Education"}],"awards":[{"id":"https://openalex.org/G8447587007","display_name":null,"funder_award_id":"JPMJCR19A3","funder_id":"https://openalex.org/F4320317041","funder_display_name":"Collaborative Research in Engineering, Science and Technology Centre"}],"funders":[{"id":"https://openalex.org/F4320317041","display_name":"Collaborative Research in Engineering, Science and Technology Centre","ror":null},{"id":"https://openalex.org/F4320320907","display_name":"Japan Science and Technology Corporation","ror":"https://ror.org/00097mb19"}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":42,"referenced_works":["https://openalex.org/W96541173","https://openalex.org/W1628791547","https://openalex.org/W2015143272","https://openalex.org/W2041470347","https://openalex.org/W2108501770","https://openalex.org/W2166943505","https://openalex.org/W2575406636","https://openalex.org/W2613904329","https://openalex.org/W2726515241","https://openalex.org/W2807126412","https://openalex.org/W2808631503","https://openalex.org/W2885905848","https://openalex.org/W2889061305","https://openalex.org/W2945478979","https://openalex.org/W2946555236","https://openalex.org/W2963539064","https://openalex.org/W2963887950","https://openalex.org/W2964171275","https://openalex.org/W2972394484","https://openalex.org/W2972471621","https://openalex.org/W2981087920","https://openalex.org/W2987496713","https://openalex.org/W3035626590","https://openalex.org/W3092028330","https://openalex.org/W3102628737","https://openalex.org/W3137758952","https://openalex.org/W3160305627","https://openalex.org/W4232282348","https://openalex.org/W4288054834","https://openalex.org/W4296070365","https://openalex.org/W4320013936","https://openalex.org/W4367164141","https://openalex.org/W4391602018","https://openalex.org/W6640963894","https://openalex.org/W6675944832","https://openalex.org/W6684352069","https://openalex.org/W6732408451","https://openalex.org/W6737778391","https://openalex.org/W6762533536","https://openalex.org/W6769867024","https://openalex.org/W6783867762","https://openalex.org/W6785417915"],"related_works":["https://openalex.org/W3013693939","https://openalex.org/W2159052453","https://openalex.org/W2566616303","https://openalex.org/W3131327266","https://openalex.org/W2734887215","https://openalex.org/W4297051394","https://openalex.org/W2752972570","https://openalex.org/W2145836866","https://openalex.org/W2803255133","https://openalex.org/W2909431601"],"abstract_inverted_index":{"In":[0,50],"this":[1],"paper,":[2],"we":[3,54,90,124,155],"extend":[4],"a":[5,27,61,126],"method":[6],"of":[7,40,96,117,133,160],"converting":[8],"speaking":[9,16,48],"styles":[10],"for":[11,81],"whispered":[12,41,118,153],"speech,":[13],"i.e.,":[14],"whisper-to-normal":[15],"style":[17],"conversion":[18,30,84,116],"(W2N-SC).":[19],"W2N-SC":[20,70],"problem":[21],"is":[22],"similar":[23],"but":[24],"different":[25,47],"from":[26],"regular":[28],"voice":[29],"(VC)":[31],"task":[32],"and":[33,43,59,144,151],"more":[34],"challenging":[35],"due":[36],"to":[37,74,121],"the":[38,44,56,88,99,102,115,131],"characteristics":[39],"speech":[42],"deal":[45],"with":[46],"styles.":[49],"our":[51],"previous":[52],"study,":[53],"addressed":[55],"task-specific":[57],"difficulties":[58],"developed":[60],"variational":[62],"autoencoder":[63],"(VAE)-based":[64],"non-parallel":[65],"approach":[66],"called":[67],"W2N-SC.":[68,97,170],"While":[69],"demonstrated":[71],"superior":[72],"performance":[73],"other":[75],"parallel-data-free":[76],"approaches,":[77],"there":[78],"remains":[79],"room":[80],"improvement":[82],"in":[83,114,137,142],"quality.":[85],"To":[86],"overcome":[87],"limitation,":[89],"propose":[91],"W2N-AVSC,":[92,123],"an":[93,157],"audiovisual":[94,128],"extension":[95],"Unlike":[98],"conventional":[100],"W2N-SC,":[101],"proposed":[103],"W2N-AVSC":[104,165],"can":[105],"take":[106],"visual":[107,161],"information,":[108,162],"e.g.,":[109],"lip":[110],"movements,":[111],"into":[112],"account":[113],"speech.":[119],"Furthermore,":[120],"perform":[122],"develop":[125],"new":[127],"dataset":[129],"recording":[130],"faces":[132],"speakers":[134],"reading":[135],"texts":[136],"various":[138],"ways,":[139],"such":[140],"as":[141],"normals":[143],"whispers.":[145],"Through":[146],"experimental":[147],"evaluations":[148],"using":[149],"clean":[150],"noisy":[152],"inputs,":[154],"reveal":[156],"effective":[158],"representation":[159],"demonstrating":[163],"that":[164],"perceptually":[166],"performs":[167],"better":[168],"than":[169]},"counts_by_year":[],"updated_date":"2025-12-23T23:11:35.936235","created_date":"2025-10-10T00:00:00"}
