{"id":"https://openalex.org/W2808998745","doi":"https://doi.org/10.21437/interspeech.2018-2414","title":"End-to-End Speech Recognition from the Raw Waveform","display_name":"End-to-End Speech Recognition from the Raw Waveform","publication_year":2018,"publication_date":"2018-08-28","ids":{"openalex":"https://openalex.org/W2808998745","doi":"https://doi.org/10.21437/interspeech.2018-2414","mag":"2808998745"},"language":"en","primary_location":{"id":"doi:10.21437/interspeech.2018-2414","is_oa":false,"landing_page_url":"https://doi.org/10.21437/interspeech.2018-2414","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Interspeech 2018","raw_type":"proceedings-article"},"type":"preprint","indexed_in":["arxiv","crossref","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/1806.07098","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5047639590","display_name":"Neil Zeghidour","orcid":"https://orcid.org/0000-0001-6896-3987"},"institutions":[{"id":"https://openalex.org/I277688954","display_name":"Universit\u00e9 Paris-Saclay","ror":"https://ror.org/03xjwb503","country_code":"FR","type":"education","lineage":["https://openalex.org/I277688954"]}],"countries":["FR"],"is_corresponding":true,"raw_author_name":"Neil Zeghidour","raw_affiliation_strings":["Universit\u00e9 Paris-Saclay"],"affiliations":[{"raw_affiliation_string":"Universit\u00e9 Paris-Saclay","institution_ids":["https://openalex.org/I277688954"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5084360449","display_name":"Nicolas Usunier","orcid":"https://orcid.org/0000-0002-9324-1457"},"institutions":[{"id":"https://openalex.org/I2252078561","display_name":"Meta (Israel)","ror":"https://ror.org/02388em19","country_code":"IL","type":"company","lineage":["https://openalex.org/I2252078561","https://openalex.org/I4210114444"]}],"countries":["IL"],"is_corresponding":false,"raw_author_name":"Nicolas Usunier","raw_affiliation_strings":["[Facebook AI Research, Paris]"],"affiliations":[{"raw_affiliation_string":"[Facebook AI Research, Paris]","institution_ids":["https://openalex.org/I2252078561"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5041907084","display_name":"Gabriel Synnaeve","orcid":"https://orcid.org/0000-0003-1715-3356"},"institutions":[{"id":"https://openalex.org/I4210104430","display_name":"Laboratoire d'Informatique de Grenoble","ror":"https://ror.org/01c8rcg82","country_code":"FR","type":"facility","lineage":["https://openalex.org/I106785703","https://openalex.org/I1294671590","https://openalex.org/I1294671590","https://openalex.org/I1326498283","https://openalex.org/I4210104430","https://openalex.org/I4210159245","https://openalex.org/I899635006","https://openalex.org/I899635006"]}],"countries":["FR"],"is_corresponding":false,"raw_author_name":"Gabriel Synnaeve","raw_affiliation_strings":["Laboratoire d'Informatique de Grenoble"],"affiliations":[{"raw_affiliation_string":"Laboratoire d'Informatique de Grenoble","institution_ids":["https://openalex.org/I4210104430"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5053915453","display_name":"Ronan Collobert","orcid":null},"institutions":[{"id":"https://openalex.org/I7495430","display_name":"Idiap Research Institute","ror":"https://ror.org/05932h694","country_code":"CH","type":"facility","lineage":["https://openalex.org/I7495430"]}],"countries":["CH"],"is_corresponding":false,"raw_author_name":"Ronan Collobert","raw_affiliation_strings":["IDIAP RESEARCH INSTITUTE"],"affiliations":[{"raw_affiliation_string":"IDIAP RESEARCH INSTITUTE","institution_ids":["https://openalex.org/I7495430"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5007620149","display_name":"Emmanuel Dupoux","orcid":"https://orcid.org/0000-0002-7814-2952"},"institutions":[{"id":"https://openalex.org/I4210151031","display_name":"Laboratoire de Sciences Cognitives et Psycholinguistique","ror":"https://ror.org/05fvhm231","country_code":"FR","type":"facility","lineage":["https://openalex.org/I1294671590","https://openalex.org/I1294671590","https://openalex.org/I2746051580","https://openalex.org/I29607241","https://openalex.org/I4210096427","https://openalex.org/I4210151031","https://openalex.org/I90669466"]}],"countries":["FR"],"is_corresponding":false,"raw_author_name":"Emmanuel Dupoux","raw_affiliation_strings":["Laboratoire de sciences cognitives et psycholinguistique"],"affiliations":[{"raw_affiliation_string":"Laboratoire de sciences cognitives et psycholinguistique","institution_ids":["https://openalex.org/I4210151031"]}]}],"institutions":[],"countries_distinct_count":3,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5047639590"],"corresponding_institution_ids":["https://openalex.org/I277688954"],"apc_list":null,"apc_paid":null,"fwci":2.18418705,"has_fulltext":false,"cited_by_count":12,"citation_normalized_percentile":{"value":0.8857945,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":89,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"781","last_page":"785"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9968000054359436,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9941999912261963,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.691847562789917},{"id":"https://openalex.org/keywords/waveform","display_name":"Waveform","score":0.6785414218902588},{"id":"https://openalex.org/keywords/normalization","display_name":"Normalization (sociology)","score":0.6699920296669006},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.6319723725318909},{"id":"https://openalex.org/keywords/initialization","display_name":"Initialization","score":0.62800133228302},{"id":"https://openalex.org/keywords/pipeline","display_name":"Pipeline (software)","score":0.5402969121932983},{"id":"https://openalex.org/keywords/filter-bank","display_name":"Filter bank","score":0.47619855403900146},{"id":"https://openalex.org/keywords/filter","display_name":"Filter (signal processing)","score":0.43534818291664124},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.43057894706726074},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.4179359972476959},{"id":"https://openalex.org/keywords/telecommunications","display_name":"Telecommunications","score":0.08473888039588928},{"id":"https://openalex.org/keywords/computer-vision","display_name":"Computer vision","score":0.08366701006889343}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.691847562789917},{"id":"https://openalex.org/C197424946","wikidata":"https://www.wikidata.org/wiki/Q1165717","display_name":"Waveform","level":3,"score":0.6785414218902588},{"id":"https://openalex.org/C136886441","wikidata":"https://www.wikidata.org/wiki/Q926129","display_name":"Normalization (sociology)","level":2,"score":0.6699920296669006},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6319723725318909},{"id":"https://openalex.org/C114466953","wikidata":"https://www.wikidata.org/wiki/Q6034165","display_name":"Initialization","level":2,"score":0.62800133228302},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.5402969121932983},{"id":"https://openalex.org/C100515483","wikidata":"https://www.wikidata.org/wiki/Q3268235","display_name":"Filter bank","level":3,"score":0.47619855403900146},{"id":"https://openalex.org/C106131492","wikidata":"https://www.wikidata.org/wiki/Q3072260","display_name":"Filter (signal processing)","level":2,"score":0.43534818291664124},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.43057894706726074},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.4179359972476959},{"id":"https://openalex.org/C76155785","wikidata":"https://www.wikidata.org/wiki/Q418","display_name":"Telecommunications","level":1,"score":0.08473888039588928},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.08366701006889343},{"id":"https://openalex.org/C144024400","wikidata":"https://www.wikidata.org/wiki/Q21201","display_name":"Sociology","level":0,"score":0.0},{"id":"https://openalex.org/C554190296","wikidata":"https://www.wikidata.org/wiki/Q47528","display_name":"Radar","level":2,"score":0.0},{"id":"https://openalex.org/C19165224","wikidata":"https://www.wikidata.org/wiki/Q23404","display_name":"Anthropology","level":1,"score":0.0},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.0}],"mesh":[],"locations_count":5,"locations":[{"id":"doi:10.21437/interspeech.2018-2414","is_oa":false,"landing_page_url":"https://doi.org/10.21437/interspeech.2018-2414","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Interspeech 2018","raw_type":"proceedings-article"},{"id":"pmh:oai:arXiv.org:1806.07098","is_oa":true,"landing_page_url":"http://arxiv.org/abs/1806.07098","pdf_url":"https://arxiv.org/pdf/1806.07098","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},{"id":"mag:2808998745","is_oa":true,"landing_page_url":"https://arxiv.org/pdf/1806.07098.pdf","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"arXiv (Cornell University)","raw_type":null},{"id":"pmh:oai:HAL:hal-01888739v1","is_oa":true,"landing_page_url":"https://hal.science/hal-01888739","pdf_url":null,"source":null,"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"Interspeech 2018, Sep 2018, Hyderabad, India. &#x27E8;10.21437/Interspeech.2018-2414&#x27E9;","raw_type":"Conference papers"},{"id":"doi:10.48550/arxiv.1806.07098","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.1806.07098","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:1806.07098","is_oa":true,"landing_page_url":"http://arxiv.org/abs/1806.07098","pdf_url":"https://arxiv.org/pdf/1806.07098","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":22,"referenced_works":["https://openalex.org/W1542280630","https://openalex.org/W1634752325","https://openalex.org/W1736701665","https://openalex.org/W2093231248","https://openalex.org/W2102113734","https://openalex.org/W2193413348","https://openalex.org/W2284050935","https://openalex.org/W2331927446","https://openalex.org/W2408093180","https://openalex.org/W2502312327","https://openalex.org/W2508048623","https://openalex.org/W2520160253","https://openalex.org/W2526425061","https://openalex.org/W2567070169","https://openalex.org/W2760599032","https://openalex.org/W2766572290","https://openalex.org/W2767554854","https://openalex.org/W2782451907","https://openalex.org/W2949382160","https://openalex.org/W2950903920","https://openalex.org/W2952998108","https://openalex.org/W2963727906"],"related_works":["https://openalex.org/W2767554854","https://openalex.org/W2127141656","https://openalex.org/W3124414150","https://openalex.org/W1987510030","https://openalex.org/W2102113734","https://openalex.org/W606219753","https://openalex.org/W2969344964","https://openalex.org/W3089432592","https://openalex.org/W3003766820","https://openalex.org/W2935891278","https://openalex.org/W3092832084","https://openalex.org/W2517770866","https://openalex.org/W2352579185","https://openalex.org/W3171442995","https://openalex.org/W2928550135","https://openalex.org/W2975176027","https://openalex.org/W2950705786","https://openalex.org/W3168043446","https://openalex.org/W2903799412","https://openalex.org/W2902340037"],"abstract_inverted_index":{"State-of-the-art":[0],"speech":[1],"recognition":[2],"systems":[3,26],"rely":[4],"on":[5,34,88,108,188],"fixed,":[6],"hand-crafted":[7],"features":[8],"such":[9],"as":[10],"mel-filterbanks":[11,41,187],"to":[12,79,86,126,170],"preprocess":[13],"the":[14,17,30,64,68,89,98,109,116,119,127,144,166,175,182],"waveform":[15],"before":[16],"training":[18,117],"pipeline.":[19],"In":[20,154],"this":[21],"paper,":[22],"we":[23,156],"study":[24],"end-to-end":[25,178],"trained":[27,180],"directly":[28],"from":[29,181],"raw":[31,183],"waveform,":[32],"building":[33],"two":[35,77],"alternatives":[36],"for":[37,139,146],"trainable":[38,111,152,167],"replacements":[39],"of":[40,100,118,165],"that":[42],"use":[43],"a":[44,147,158,189],"convolutional":[45],"architecture.":[46],"The":[47,94,122],"first":[48,95,176],"one":[49,66,124],"is":[50,97,174],"inspired":[51],"by":[52,67],"gammatone":[53],"filterbanks":[54,112,168],"(Hoshen":[55],"et":[56,60,72],"al.,":[57,73],"2015;":[58],"Sainath":[59],"al,":[61],"2015),":[62],"and":[63,82,113,142],"second":[65,123],"scattering":[69],"transform":[70],"(Zeghidour":[71],"2017).":[74],"We":[75],"propose":[76],"modifications":[78,135],"these":[80,132],"architectures":[81],"systematically":[83],"compare":[84],"them":[85],"mel-filterbanks,":[87],"Wall":[90],"Street":[91],"Journal":[92],"dataset.":[93],"modification":[96],"addition":[99],"an":[101],"instance":[102],"normalization":[103],"layer,":[104],"which":[105],"greatly":[106],"improves":[107],"gammatone-based":[110],"speeds":[114],"up":[115],"scattering-based":[120,151],"filterbanks.":[121,153],"relates":[125],"low-pass":[128],"filter":[129],"used":[130],"in":[131,150,161],"approaches.":[133],"These":[134],"consistently":[136],"improve":[137],"performances":[138],"both":[140],"approaches,":[141],"remove":[143],"need":[145],"careful":[148],"initialization":[149],"particular,":[155],"show":[157],"consistent":[159],"improvement":[160],"word":[162],"error":[163],"rate":[164],"relatively":[169],"comparable":[171],"mel-filterbanks.":[172],"It":[173],"time":[177],"models":[179],"signal":[184],"significantly":[185],"outperform":[186],"large":[190],"vocabulary":[191],"task":[192],"under":[193],"clean":[194],"recording":[195],"conditions.":[196]},"counts_by_year":[{"year":2022,"cited_by_count":1},{"year":2021,"cited_by_count":4},{"year":2019,"cited_by_count":6},{"year":2018,"cited_by_count":1}],"updated_date":"2026-02-09T09:26:11.010843","created_date":"2025-10-10T00:00:00"}
