{"id":"https://openalex.org/W4405974170","doi":"https://doi.org/10.1109/msp.2024.3486469","title":"Module-Based End-to-End Distant Speech Processing: A case study of far-field automatic speech recognition","display_name":"Module-Based End-to-End Distant Speech Processing: A case study of far-field automatic speech recognition","publication_year":2024,"publication_date":"2024-11-01","ids":{"openalex":"https://openalex.org/W4405974170","doi":"https://doi.org/10.1109/msp.2024.3486469"},"language":"en","primary_location":{"id":"doi:10.1109/msp.2024.3486469","is_oa":false,"landing_page_url":"https://doi.org/10.1109/msp.2024.3486469","pdf_url":null,"source":{"id":"https://openalex.org/S120977877","display_name":"IEEE Signal Processing Magazine","issn_l":"1053-5888","issn":["1053-5888","1558-0792"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Signal Processing Magazine","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5050058892","display_name":"Xuankai Chang","orcid":"https://orcid.org/0000-0002-5221-5412"},"institutions":[{"id":"https://openalex.org/I74973139","display_name":"Carnegie Mellon University","ror":"https://ror.org/05x2bcf33","country_code":"US","type":"education","lineage":["https://openalex.org/I74973139"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Xuankai Chang","raw_affiliation_strings":["Carnegie Mellon University, Pittsburgh, PA, USA"],"raw_orcid":"https://orcid.org/0000-0002-5221-5412","affiliations":[{"raw_affiliation_string":"Carnegie Mellon University, Pittsburgh, PA, USA","institution_ids":["https://openalex.org/I74973139"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5001291873","display_name":"Shinji Watanabe","orcid":"https://orcid.org/0000-0002-5970-8631"},"institutions":[{"id":"https://openalex.org/I74973139","display_name":"Carnegie Mellon University","ror":"https://ror.org/05x2bcf33","country_code":"US","type":"education","lineage":["https://openalex.org/I74973139"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Shinji Watanabe","raw_affiliation_strings":["Carnegie Mellon University, Pittsburgh, PA, USA"],"raw_orcid":"https://orcid.org/0000-0002-5970-8631","affiliations":[{"raw_affiliation_string":"Carnegie Mellon University, Pittsburgh, PA, USA","institution_ids":["https://openalex.org/I74973139"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5023868166","display_name":"Marc Delcroix","orcid":"https://orcid.org/0000-0002-5175-7834"},"institutions":[{"id":"https://openalex.org/I2251713219","display_name":"NTT (Japan)","ror":"https://ror.org/00berct97","country_code":"JP","type":"company","lineage":["https://openalex.org/I2251713219"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Marc Delcroix","raw_affiliation_strings":["NTT Communication Science Laboratories, Kyoto, Japan"],"raw_orcid":"https://orcid.org/0000-0002-5175-7834","affiliations":[{"raw_affiliation_string":"NTT Communication Science Laboratories, Kyoto, Japan","institution_ids":["https://openalex.org/I2251713219"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5070624983","display_name":"Tsubasa Ochiai","orcid":"https://orcid.org/0000-0002-2519-2032"},"institutions":[{"id":"https://openalex.org/I2251713219","display_name":"NTT (Japan)","ror":"https://ror.org/00berct97","country_code":"JP","type":"company","lineage":["https://openalex.org/I2251713219"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Tsubasa Ochiai","raw_affiliation_strings":["NTT Communication Science Laboratories, Kyoto, Japan"],"raw_orcid":"https://orcid.org/0000-0002-2519-2032","affiliations":[{"raw_affiliation_string":"NTT Communication Science Laboratories, Kyoto, Japan","institution_ids":["https://openalex.org/I2251713219"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5071937621","display_name":"Wangyou Zhang","orcid":"https://orcid.org/0000-0003-4500-3515"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Wangyou Zhang","raw_affiliation_strings":["Shanghai Jiao Tong University, Shanghai, China"],"raw_orcid":"https://orcid.org/0000-0003-4500-3515","affiliations":[{"raw_affiliation_string":"Shanghai Jiao Tong University, Shanghai, China","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100341993","display_name":"Yanmin Qian","orcid":"https://orcid.org/0000-0002-0314-3790"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yanmin Qian","raw_affiliation_strings":["Shanghai Jiao Tong University, Shanghai, China"],"raw_orcid":"https://orcid.org/0000-0002-0314-3790","affiliations":[{"raw_affiliation_string":"Shanghai Jiao Tong University, Shanghai, China","institution_ids":["https://openalex.org/I183067930"]}]}],"institutions":[],"countries_distinct_count":3,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5050058892"],"corresponding_institution_ids":["https://openalex.org/I74973139"],"apc_list":null,"apc_paid":null,"fwci":1.3153,"has_fulltext":false,"cited_by_count":4,"citation_normalized_percentile":{"value":0.82097227,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":91,"max":99},"biblio":{"volume":"41","issue":"6","first_page":"39","last_page":"50"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9577999711036682,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9577999711036682,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9383000135421753,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10320","display_name":"Neural Networks and Applications","score":0.9003999829292297,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.7810268998146057},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7654356956481934},{"id":"https://openalex.org/keywords/speech-processing","display_name":"Speech processing","score":0.7316197752952576},{"id":"https://openalex.org/keywords/end-to-end-principle","display_name":"End-to-end principle","score":0.604161262512207},{"id":"https://openalex.org/keywords/signal-processing","display_name":"Signal processing","score":0.5954551100730896},{"id":"https://openalex.org/keywords/audio-signal-processing","display_name":"Audio signal processing","score":0.5334981083869934},{"id":"https://openalex.org/keywords/audio-mining","display_name":"Audio mining","score":0.5329001545906067},{"id":"https://openalex.org/keywords/field","display_name":"Field (mathematics)","score":0.47078269720077515},{"id":"https://openalex.org/keywords/voice-activity-detection","display_name":"Voice activity detection","score":0.4510171711444855},{"id":"https://openalex.org/keywords/speech-coding","display_name":"Speech coding","score":0.42429831624031067},{"id":"https://openalex.org/keywords/digital-signal-processing","display_name":"Digital signal processing","score":0.32921627163887024},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.3236328661441803},{"id":"https://openalex.org/keywords/audio-signal","display_name":"Audio signal","score":0.30801254510879517},{"id":"https://openalex.org/keywords/computer-hardware","display_name":"Computer hardware","score":0.11895212531089783}],"concepts":[{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.7810268998146057},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7654356956481934},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.7316197752952576},{"id":"https://openalex.org/C74296488","wikidata":"https://www.wikidata.org/wiki/Q2527392","display_name":"End-to-end principle","level":2,"score":0.604161262512207},{"id":"https://openalex.org/C104267543","wikidata":"https://www.wikidata.org/wiki/Q208163","display_name":"Signal processing","level":3,"score":0.5954551100730896},{"id":"https://openalex.org/C127220857","wikidata":"https://www.wikidata.org/wiki/Q2719318","display_name":"Audio signal processing","level":4,"score":0.5334981083869934},{"id":"https://openalex.org/C157968479","wikidata":"https://www.wikidata.org/wiki/Q3079876","display_name":"Audio mining","level":4,"score":0.5329001545906067},{"id":"https://openalex.org/C9652623","wikidata":"https://www.wikidata.org/wiki/Q190109","display_name":"Field (mathematics)","level":2,"score":0.47078269720077515},{"id":"https://openalex.org/C204201278","wikidata":"https://www.wikidata.org/wiki/Q1332614","display_name":"Voice activity detection","level":3,"score":0.4510171711444855},{"id":"https://openalex.org/C13895895","wikidata":"https://www.wikidata.org/wiki/Q3270773","display_name":"Speech coding","level":2,"score":0.42429831624031067},{"id":"https://openalex.org/C84462506","wikidata":"https://www.wikidata.org/wiki/Q173142","display_name":"Digital signal processing","level":2,"score":0.32921627163887024},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3236328661441803},{"id":"https://openalex.org/C64922751","wikidata":"https://www.wikidata.org/wiki/Q4650799","display_name":"Audio signal","level":3,"score":0.30801254510879517},{"id":"https://openalex.org/C9390403","wikidata":"https://www.wikidata.org/wiki/Q3966","display_name":"Computer hardware","level":1,"score":0.11895212531089783},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.0},{"id":"https://openalex.org/C202444582","wikidata":"https://www.wikidata.org/wiki/Q837863","display_name":"Pure mathematics","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/msp.2024.3486469","is_oa":false,"landing_page_url":"https://doi.org/10.1109/msp.2024.3486469","pdf_url":null,"source":{"id":"https://openalex.org/S120977877","display_name":"IEEE Signal Processing Magazine","issn_l":"1053-5888","issn":["1053-5888","1558-0792"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Signal Processing Magazine","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":40,"referenced_works":["https://openalex.org/W1485161427","https://openalex.org/W1966812932","https://openalex.org/W1973669708","https://openalex.org/W1989364685","https://openalex.org/W1991139021","https://openalex.org/W2042141988","https://openalex.org/W2060108923","https://openalex.org/W2128653836","https://openalex.org/W2130361043","https://openalex.org/W2146324387","https://openalex.org/W2160815625","https://openalex.org/W2165712214","https://openalex.org/W2221409856","https://openalex.org/W2288645994","https://openalex.org/W2508393166","https://openalex.org/W2510867321","https://openalex.org/W2734774145","https://openalex.org/W2803322398","https://openalex.org/W2939690918","https://openalex.org/W2952218014","https://openalex.org/W2962866211","https://openalex.org/W2964058413","https://openalex.org/W2982471419","https://openalex.org/W3008762051","https://openalex.org/W3020336359","https://openalex.org/W3086154751","https://openalex.org/W3094821064","https://openalex.org/W3133902371","https://openalex.org/W3160207687","https://openalex.org/W3211278025","https://openalex.org/W4233392025","https://openalex.org/W4281492411","https://openalex.org/W4312356750","https://openalex.org/W4367281387","https://openalex.org/W4372267974","https://openalex.org/W4386763469","https://openalex.org/W6735168207","https://openalex.org/W6781130467","https://openalex.org/W6847363464","https://openalex.org/W6856665624"],"related_works":["https://openalex.org/W2496295964","https://openalex.org/W1976952689","https://openalex.org/W2336887028","https://openalex.org/W1911859126","https://openalex.org/W642007152","https://openalex.org/W1611900921","https://openalex.org/W2064012922","https://openalex.org/W1620668332","https://openalex.org/W2033602688","https://openalex.org/W2156141367"],"abstract_inverted_index":{"Distant":[0],"speech":[1,9,33,52,123],"processing":[2,126],"is":[3,78,136],"a":[4,89,114],"critical":[5],"downstream":[6,159],"application":[7],"in":[8,63,72,100,110],"and":[10,27,40,124,156,163],"audio":[11,165],"signal":[12,125,166],"processing.":[13,167],"Traditionally,":[14],"researchers":[15],"have":[16],"addressed":[17],"this":[18,129],"challenge":[19],"by":[20],"breaking":[21],"it":[22,135],"down":[23],"into":[24],"distinct":[25],"subproblems":[26],"encompassing":[28],"the":[29,46,70,73,80,111,132,145],"extraction":[30],"of":[31,48,75,82,113,148,161],"clean":[32],"signals":[34],"from":[35],"noisy":[36],"inputs,":[37],"feature":[38],"extraction,":[39],"transcription.":[41],"This":[42,142],"approach":[43,130],"led":[44],"to":[45,66,94],"development":[47,147],"modular":[49,115],"distant":[50],"automatic":[51],"recognition":[53],"(DASR)":[54],"models,":[55],"which":[56],"are":[57],"often":[58,121],"designed":[59],"with":[60],"multiple":[61],"stages":[62],"cascade,":[64],"corresponding":[65],"specific":[67],"subproblems.":[68],"Recently,":[69],"surge":[71],"capabilities":[74],"deep":[76],"learning":[77],"propelling":[79],"popularity":[81],"purely":[83],"end-to-end":[84],"(E2E)":[85],"models":[86,155],"that":[87],"employ":[88],"single":[90],"large":[91],"neural":[92],"network":[93],"tackle":[95],"an":[96,101,106,139],"entire":[97],"DASR":[98,149],"task":[99],"extremely":[102],"data-driven":[103,164],"manner.":[104],"However,":[105],"alternative":[107],"paradigm":[108],"persists":[109],"form":[112],"model":[116],"design,":[117],"where":[118],"we":[119],"can":[120],"leverage":[122],"models.":[127],"Although":[128],"mirrors":[131],"multistage":[133],"model,":[134],"trained":[137],"through":[138],"E2E":[140,153],"process.":[141],"article":[143],"overviews":[144],"recent":[146],"systems,":[150],"focusing":[151],"on":[152],"module-based":[154],"showcasing":[157],"successful":[158],"applications":[160],"model-based":[162]},"counts_by_year":[{"year":2026,"cited_by_count":3},{"year":2025,"cited_by_count":1}],"updated_date":"2025-12-19T19:40:27.379048","created_date":"2025-01-02T00:00:00"}
