{"id":"https://openalex.org/W4399168695","doi":"https://doi.org/10.1109/taslp.2024.3407511","title":"Waveform-Domain Speech Enhancement Using Spectrogram Encoding for Robust Speech Recognition","display_name":"Waveform-Domain Speech Enhancement Using Spectrogram Encoding for Robust Speech Recognition","publication_year":2024,"publication_date":"2024-01-01","ids":{"openalex":"https://openalex.org/W4399168695","doi":"https://doi.org/10.1109/taslp.2024.3407511"},"language":"en","primary_location":{"id":"doi:10.1109/taslp.2024.3407511","is_oa":false,"landing_page_url":"https://doi.org/10.1109/taslp.2024.3407511","pdf_url":null,"source":{"id":"https://openalex.org/S4210169297","display_name":"IEEE/ACM Transactions on Audio Speech and Language Processing","issn_l":"2329-9290","issn":["2329-9290","2329-9304"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE/ACM Transactions on Audio, Speech, and Language Processing","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://repository.kulib.kyoto-u.ac.jp/dspace/bitstream/2433/287858/1/TASLP.2024.3407511.pdf","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5088361896","display_name":"Hao Shi","orcid":"https://orcid.org/0000-0003-3373-2147"},"institutions":[{"id":"https://openalex.org/I22299242","display_name":"Kyoto University","ror":"https://ror.org/02kpeqv85","country_code":"JP","type":"education","lineage":["https://openalex.org/I22299242"]},{"id":"https://openalex.org/I39012071","display_name":"Kyoto College of Graduate Studies for Informatics","ror":"https://ror.org/05mzj8a56","country_code":"JP","type":"education","lineage":["https://openalex.org/I39012071"]}],"countries":["JP"],"is_corresponding":true,"raw_author_name":"Hao Shi","raw_affiliation_strings":["Graduate School of Informatics, Kyoto University, Kyoto, Japan"],"affiliations":[{"raw_affiliation_string":"Graduate School of Informatics, Kyoto University, Kyoto, Japan","institution_ids":["https://openalex.org/I39012071","https://openalex.org/I22299242"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102851028","display_name":"Masato Mimura","orcid":"https://orcid.org/0000-0002-2403-0680"},"institutions":[{"id":"https://openalex.org/I22299242","display_name":"Kyoto University","ror":"https://ror.org/02kpeqv85","country_code":"JP","type":"education","lineage":["https://openalex.org/I22299242"]},{"id":"https://openalex.org/I39012071","display_name":"Kyoto College of Graduate Studies for Informatics","ror":"https://ror.org/05mzj8a56","country_code":"JP","type":"education","lineage":["https://openalex.org/I39012071"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Masato Mimura","raw_affiliation_strings":["Graduate School of Informatics, Kyoto University, Kyoto, Japan"],"affiliations":[{"raw_affiliation_string":"Graduate School of Informatics, Kyoto University, Kyoto, Japan","institution_ids":["https://openalex.org/I39012071","https://openalex.org/I22299242"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5038044080","display_name":"Tatsuya Kawahara","orcid":"https://orcid.org/0000-0002-2686-2296"},"institutions":[{"id":"https://openalex.org/I22299242","display_name":"Kyoto University","ror":"https://ror.org/02kpeqv85","country_code":"JP","type":"education","lineage":["https://openalex.org/I22299242"]},{"id":"https://openalex.org/I39012071","display_name":"Kyoto College of Graduate Studies for Informatics","ror":"https://ror.org/05mzj8a56","country_code":"JP","type":"education","lineage":["https://openalex.org/I39012071"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Tatsuya Kawahara","raw_affiliation_strings":["Graduate School of Informatics, Kyoto University, Kyoto, Japan"],"affiliations":[{"raw_affiliation_string":"Graduate School of Informatics, Kyoto University, Kyoto, Japan","institution_ids":["https://openalex.org/I39012071","https://openalex.org/I22299242"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5088361896"],"corresponding_institution_ids":["https://openalex.org/I22299242","https://openalex.org/I39012071"],"apc_list":null,"apc_paid":null,"fwci":6.5598,"has_fulltext":true,"cited_by_count":19,"citation_normalized_percentile":{"value":0.9750371,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":98,"max":99},"biblio":{"volume":"32","issue":null,"first_page":"3049","last_page":"3060"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9983000159263611,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10326","display_name":"Indoor and Outdoor Localization Technologies","score":0.9966999888420105,"subfield":{"id":"https://openalex.org/subfields/2208","display_name":"Electrical and Electronic Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/spectrogram","display_name":"Spectrogram","score":0.9686042070388794},{"id":"https://openalex.org/keywords/waveform","display_name":"Waveform","score":0.7860127687454224},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7242686748504639},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.7202531695365906},{"id":"https://openalex.org/keywords/robustness","display_name":"Robustness (evolution)","score":0.6968138813972473},{"id":"https://openalex.org/keywords/encoder","display_name":"Encoder","score":0.6541811227798462},{"id":"https://openalex.org/keywords/encoding","display_name":"Encoding (memory)","score":0.5717430114746094},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4818231165409088},{"id":"https://openalex.org/keywords/time-domain","display_name":"Time domain","score":0.46218809485435486},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.4565926790237427},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.45101314783096313},{"id":"https://openalex.org/keywords/frequency-domain","display_name":"Frequency domain","score":0.4472193419933319},{"id":"https://openalex.org/keywords/computer-vision","display_name":"Computer vision","score":0.10859981179237366}],"concepts":[{"id":"https://openalex.org/C45273575","wikidata":"https://www.wikidata.org/wiki/Q578970","display_name":"Spectrogram","level":2,"score":0.9686042070388794},{"id":"https://openalex.org/C197424946","wikidata":"https://www.wikidata.org/wiki/Q1165717","display_name":"Waveform","level":3,"score":0.7860127687454224},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7242686748504639},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.7202531695365906},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.6968138813972473},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.6541811227798462},{"id":"https://openalex.org/C125411270","wikidata":"https://www.wikidata.org/wiki/Q18653","display_name":"Encoding (memory)","level":2,"score":0.5717430114746094},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4818231165409088},{"id":"https://openalex.org/C103824480","wikidata":"https://www.wikidata.org/wiki/Q185889","display_name":"Time domain","level":2,"score":0.46218809485435486},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.4565926790237427},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.45101314783096313},{"id":"https://openalex.org/C19118579","wikidata":"https://www.wikidata.org/wiki/Q786423","display_name":"Frequency domain","level":2,"score":0.4472193419933319},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.10859981179237366},{"id":"https://openalex.org/C554190296","wikidata":"https://www.wikidata.org/wiki/Q47528","display_name":"Radar","level":2,"score":0.0},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.0},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C76155785","wikidata":"https://www.wikidata.org/wiki/Q418","display_name":"Telecommunications","level":1,"score":0.0},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0},{"id":"https://openalex.org/C55493867","wikidata":"https://www.wikidata.org/wiki/Q7094","display_name":"Biochemistry","level":1,"score":0.0},{"id":"https://openalex.org/C104317684","wikidata":"https://www.wikidata.org/wiki/Q7187","display_name":"Gene","level":2,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1109/taslp.2024.3407511","is_oa":false,"landing_page_url":"https://doi.org/10.1109/taslp.2024.3407511","pdf_url":null,"source":{"id":"https://openalex.org/S4210169297","display_name":"IEEE/ACM Transactions on Audio Speech and Language Processing","issn_l":"2329-9290","issn":["2329-9290","2329-9304"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE/ACM Transactions on Audio, Speech, and Language Processing","raw_type":"journal-article"},{"id":"pmh:oai:repository.kulib.kyoto-u.ac.jp:2433/287858","is_oa":true,"landing_page_url":"http://hdl.handle.net/2433/287858","pdf_url":"https://repository.kulib.kyoto-u.ac.jp/dspace/bitstream/2433/287858/1/TASLP.2024.3407511.pdf","source":{"id":"https://openalex.org/S4306401454","display_name":"Kyoto University Research Information Repository (Kyoto University)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I22299242","host_organization_name":"Kyoto University","host_organization_lineage":["https://openalex.org/I22299242"],"host_organization_lineage_names":[],"type":"repository"},"license":"public-domain","license_id":"https://openalex.org/licenses/public-domain","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Journal Article"}],"best_oa_location":{"id":"pmh:oai:repository.kulib.kyoto-u.ac.jp:2433/287858","is_oa":true,"landing_page_url":"http://hdl.handle.net/2433/287858","pdf_url":"https://repository.kulib.kyoto-u.ac.jp/dspace/bitstream/2433/287858/1/TASLP.2024.3407511.pdf","source":{"id":"https://openalex.org/S4306401454","display_name":"Kyoto University Research Information Repository (Kyoto University)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I22299242","host_organization_name":"Kyoto University","host_organization_lineage":["https://openalex.org/I22299242"],"host_organization_lineage_names":[],"type":"repository"},"license":"public-domain","license_id":"https://openalex.org/licenses/public-domain","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Journal Article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4399168695.pdf","grobid_xml":"https://content.openalex.org/works/W4399168695.grobid-xml"},"referenced_works_count":51,"referenced_works":["https://openalex.org/W1494198834","https://openalex.org/W1544785557","https://openalex.org/W1897240248","https://openalex.org/W1985090574","https://openalex.org/W1992475611","https://openalex.org/W2044893557","https://openalex.org/W2051057783","https://openalex.org/W2078528584","https://openalex.org/W2095072097","https://openalex.org/W2096779346","https://openalex.org/W2115730999","https://openalex.org/W2144404214","https://openalex.org/W2166841940","https://openalex.org/W2219249508","https://openalex.org/W2255466643","https://openalex.org/W2405774341","https://openalex.org/W2766219058","https://openalex.org/W2889442120","https://openalex.org/W2892009249","https://openalex.org/W2936774411","https://openalex.org/W2940275453","https://openalex.org/W2943554574","https://openalex.org/W2952218014","https://openalex.org/W2962892438","https://openalex.org/W2971417062","https://openalex.org/W2972389417","https://openalex.org/W2972592847","https://openalex.org/W3015912787","https://openalex.org/W3032514799","https://openalex.org/W3096408984","https://openalex.org/W3096641561","https://openalex.org/W3097777922","https://openalex.org/W3097945073","https://openalex.org/W3099330747","https://openalex.org/W3109196171","https://openalex.org/W3151851237","https://openalex.org/W3161480375","https://openalex.org/W3206531472","https://openalex.org/W3209984917","https://openalex.org/W3213726885","https://openalex.org/W4221138681","https://openalex.org/W4221156109","https://openalex.org/W4225302959","https://openalex.org/W4226390724","https://openalex.org/W4245919820","https://openalex.org/W4372267368","https://openalex.org/W4385822727","https://openalex.org/W6688816777","https://openalex.org/W6782120700","https://openalex.org/W6809874571","https://openalex.org/W6996806968"],"related_works":["https://openalex.org/W2129146436","https://openalex.org/W2064323827","https://openalex.org/W2032507829","https://openalex.org/W2782295999","https://openalex.org/W2162306796","https://openalex.org/W1970292246","https://openalex.org/W2016162169","https://openalex.org/W4247952185","https://openalex.org/W1895367623","https://openalex.org/W1642462315"],"abstract_inverted_index":{"While":[0],"waveform-domain":[1,44],"speech":[2,75],"enhancement":[3,27],"(SE)":[4],"has":[5,71],"been":[6],"extensively":[7],"investigated":[8],"in":[9,16,56,111,130],"recent":[10],"years":[11],"and":[12,25,98,127,138],"achieves":[13],"state-of-the-art":[14],"performance":[15,110],"many":[17],"datasets,":[18],"spectrogram-based":[19],"SE":[20,96],"tends":[21],"to":[22,39,93],"show":[23,107],"robust":[24,64,84],"stable":[26],"behavior.":[28],"In":[29],"this":[30],"paper,":[31],"we":[32,80],"propose":[33],"a":[34,72,145],"waveform-spectrogram":[35],"hybrid":[36],"method":[37],"(WaveSpecEnc)":[38],"improve":[40,81],"the":[41,48,95,104,119,131,139],"robustness":[42],"of":[43],"SE.":[45],"WaveSpecEnc":[46],"refines":[47],"corresponding":[49],"temporal":[50],"feature":[51],"map":[52],"by":[53,86],"spectrogram":[54,89],"encoding":[55,90],"each":[57],"encoder":[58,133],"layer.":[59],"Incorporating":[60],"spectral":[61],"information":[62,91],"provides":[63],"human":[65],"hearing":[66],"experience":[67],"performance.":[68],"However,":[69],"it":[70,82],"minor":[73],"automatic":[74],"recognition":[76],"(ASR)":[77],"improvement.":[78],"Thus,":[79],"for":[83],"ASR":[85,99,109,147],"further":[87],"utilizing":[88],"(WaveSpecEnc+)":[92],"both":[94],"front-end":[97],"back-end.":[100],"Experimental":[101],"results":[102],"using":[103,149],"CHiME-4":[105],"dataset":[106],"that":[108],"real":[112],"evaluation":[113],"sets":[114],"is":[115,135,141],"consistently":[116],"improved":[117],"with":[118,144],"proposed":[120],"method,":[121],"which":[122],"outperformed":[123],"others,":[124],"including":[125],"DEMUCS":[126],"Conv-Tasnet.":[128],"Refining":[129],"shallow":[132],"layers":[134],"very":[136],"effective,":[137],"effect":[140],"confirmed":[142],"even":[143],"strong":[146],"baseline":[148],"WavLM.":[150]},"counts_by_year":[{"year":2026,"cited_by_count":2},{"year":2025,"cited_by_count":12},{"year":2024,"cited_by_count":5}],"updated_date":"2026-04-04T16:13:02.066488","created_date":"2025-10-10T00:00:00"}
