{"id":"https://openalex.org/W3016361963","doi":"https://doi.org/10.1109/taslp.2020.2987429","title":"SpEx: Multi-Scale Time Domain Speaker Extraction Network","display_name":"SpEx: Multi-Scale Time Domain Speaker Extraction Network","publication_year":2020,"publication_date":"2020-01-01","ids":{"openalex":"https://openalex.org/W3016361963","doi":"https://doi.org/10.1109/taslp.2020.2987429","mag":"3016361963"},"language":"en","primary_location":{"id":"doi:10.1109/taslp.2020.2987429","is_oa":true,"landing_page_url":"https://doi.org/10.1109/taslp.2020.2987429","pdf_url":"https://ieeexplore.ieee.org/ielx7/6570655/8938144/09067003.pdf","source":{"id":"https://openalex.org/S4210169297","display_name":"IEEE/ACM Transactions on Audio Speech and Language Processing","issn_l":"2329-9290","issn":["2329-9290","2329-9304"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE/ACM Transactions on Audio, Speech, and Language Processing","raw_type":"journal-article"},"type":"article","indexed_in":["arxiv","crossref"],"open_access":{"is_oa":true,"oa_status":"hybrid","oa_url":"https://ieeexplore.ieee.org/ielx7/6570655/8938144/09067003.pdf","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Chenglin Xu","orcid":"https://orcid.org/0000-0002-1584-6282"},"institutions":[{"id":"https://openalex.org/I172675005","display_name":"Nanyang Technological University","ror":"https://ror.org/02e7b5302","country_code":"SG","type":"education","lineage":["https://openalex.org/I172675005"]}],"countries":["SG"],"is_corresponding":false,"raw_author_name":"Chenglin Xu","raw_affiliation_strings":["School of Computer Science and Engineering and Temasek Laboratories @ NTU, Nanyang Technological University, Singapore, Singapore"],"raw_orcid":"https://orcid.org/0000-0002-1584-6282","affiliations":[{"raw_affiliation_string":"School of Computer Science and Engineering and Temasek Laboratories @ NTU, Nanyang Technological University, Singapore, Singapore","institution_ids":["https://openalex.org/I172675005"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Wei Rao","orcid":"https://orcid.org/0000-0002-7237-0874"},"institutions":[{"id":"https://openalex.org/I165932596","display_name":"National University of Singapore","ror":"https://ror.org/01tgyzw49","country_code":"SG","type":"education","lineage":["https://openalex.org/I165932596"]}],"countries":["SG"],"is_corresponding":false,"raw_author_name":"Wei Rao","raw_affiliation_strings":["Department of Electrical and Computer Engineering, National University of Singapore, Singapore, Singapore"],"raw_orcid":"https://orcid.org/0000-0002-7237-0874","affiliations":[{"raw_affiliation_string":"Department of Electrical and Computer Engineering, National University of Singapore, Singapore, Singapore","institution_ids":["https://openalex.org/I165932596"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Eng Siong Chng","orcid":"https://orcid.org/0000-0001-6257-7399"},"institutions":[{"id":"https://openalex.org/I172675005","display_name":"Nanyang Technological University","ror":"https://ror.org/02e7b5302","country_code":"SG","type":"education","lineage":["https://openalex.org/I172675005"]}],"countries":["SG"],"is_corresponding":false,"raw_author_name":"Eng Siong Chng","raw_affiliation_strings":["School of Computer Science and Engineering and Temasek Laboratories @ NTU, Nanyang Technological University, Singapore, Singapore"],"raw_orcid":"https://orcid.org/0000-0001-6257-7399","affiliations":[{"raw_affiliation_string":"School of Computer Science and Engineering and Temasek Laboratories @ NTU, Nanyang Technological University, Singapore, Singapore","institution_ids":["https://openalex.org/I172675005"]}]},{"author_position":"last","author":{"id":null,"display_name":"Haizhou Li","orcid":"https://orcid.org/0000-0001-9158-9401"},"institutions":[{"id":"https://openalex.org/I165932596","display_name":"National University of Singapore","ror":"https://ror.org/01tgyzw49","country_code":"SG","type":"education","lineage":["https://openalex.org/I165932596"]},{"id":"https://openalex.org/I180437899","display_name":"University of Bremen","ror":"https://ror.org/04ers2y35","country_code":"DE","type":"education","lineage":["https://openalex.org/I180437899"]},{"id":"https://openalex.org/I191208505","display_name":"Xiamen University","ror":"https://ror.org/00mcjh785","country_code":"CN","type":"education","lineage":["https://openalex.org/I191208505"]}],"countries":["CN","DE","SG"],"is_corresponding":false,"raw_author_name":"Haizhou Li","raw_affiliation_strings":["Department of Electrical and Computer Engineering, National University of Singapore, Singapore, Singapore","Kriston AI Lab, Xiamen, China","University of Bremen, Bremen, Germany"],"raw_orcid":"https://orcid.org/0000-0001-9158-9401","affiliations":[{"raw_affiliation_string":"Department of Electrical and Computer Engineering, National University of Singapore, Singapore, Singapore","institution_ids":["https://openalex.org/I165932596"]},{"raw_affiliation_string":"Kriston AI Lab, Xiamen, China","institution_ids":["https://openalex.org/I191208505"]},{"raw_affiliation_string":"University of Bremen, Bremen, Germany","institution_ids":["https://openalex.org/I180437899"]}]}],"institutions":[],"countries_distinct_count":3,"institutions_distinct_count":4,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":9.8848,"has_fulltext":true,"cited_by_count":151,"citation_normalized_percentile":{"value":0.98884686,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":94,"max":100},"biblio":{"volume":"28","issue":null,"first_page":"1370","last_page":"1384"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.7853000164031982,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.7853000164031982,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.15029999613761902,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10667","display_name":"Emotion and Mood Recognition","score":0.007699999958276749,"subfield":{"id":"https://openalex.org/subfields/3205","display_name":"Experimental and Cognitive Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/embedding","display_name":"Embedding","score":0.6344000101089478},{"id":"https://openalex.org/keywords/speaker-recognition","display_name":"Speaker recognition","score":0.5590999722480774},{"id":"https://openalex.org/keywords/encoder","display_name":"Encoder","score":0.4499000012874603},{"id":"https://openalex.org/keywords/voice-activity-detection","display_name":"Voice activity detection","score":0.4498000144958496},{"id":"https://openalex.org/keywords/signal","display_name":"SIGNAL (programming language)","score":0.4262999892234802},{"id":"https://openalex.org/keywords/feature-extraction","display_name":"Feature extraction","score":0.40939998626708984},{"id":"https://openalex.org/keywords/speaker-diarisation","display_name":"Speaker diarisation","score":0.4059000015258789},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.39989998936653137},{"id":"https://openalex.org/keywords/speech-processing","display_name":"Speech processing","score":0.39989998936653137}],"concepts":[{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.7681000232696533},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6769999861717224},{"id":"https://openalex.org/C41608201","wikidata":"https://www.wikidata.org/wiki/Q980509","display_name":"Embedding","level":2,"score":0.6344000101089478},{"id":"https://openalex.org/C133892786","wikidata":"https://www.wikidata.org/wiki/Q1145189","display_name":"Speaker recognition","level":2,"score":0.5590999722480774},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.4499000012874603},{"id":"https://openalex.org/C204201278","wikidata":"https://www.wikidata.org/wiki/Q1332614","display_name":"Voice activity detection","level":3,"score":0.4498000144958496},{"id":"https://openalex.org/C2779843651","wikidata":"https://www.wikidata.org/wiki/Q7390335","display_name":"SIGNAL (programming language)","level":2,"score":0.4262999892234802},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.40939998626708984},{"id":"https://openalex.org/C149838564","wikidata":"https://www.wikidata.org/wiki/Q7574248","display_name":"Speaker diarisation","level":3,"score":0.4059000015258789},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.39989998936653137},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.39989998936653137},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.39340001344680786},{"id":"https://openalex.org/C2776182073","wikidata":"https://www.wikidata.org/wiki/Q7575395","display_name":"Speech enhancement","level":3,"score":0.37929999828338623},{"id":"https://openalex.org/C103824480","wikidata":"https://www.wikidata.org/wiki/Q185889","display_name":"Time domain","level":2,"score":0.366100013256073},{"id":"https://openalex.org/C13895895","wikidata":"https://www.wikidata.org/wiki/Q3270773","display_name":"Speech coding","level":2,"score":0.32690000534057617},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.3075000047683716},{"id":"https://openalex.org/C117978034","wikidata":"https://www.wikidata.org/wiki/Q5422192","display_name":"Extractor","level":2,"score":0.2906999886035919},{"id":"https://openalex.org/C19118579","wikidata":"https://www.wikidata.org/wiki/Q786423","display_name":"Frequency domain","level":2,"score":0.271699994802475},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.2703000009059906},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.26969999074935913},{"id":"https://openalex.org/C36503486","wikidata":"https://www.wikidata.org/wiki/Q11235244","display_name":"Domain (mathematical analysis)","level":2,"score":0.267300009727478},{"id":"https://openalex.org/C103734657","wikidata":"https://www.wikidata.org/wiki/Q2739975","display_name":"PESQ","level":4,"score":0.265500009059906},{"id":"https://openalex.org/C44280652","wikidata":"https://www.wikidata.org/wiki/Q104837","display_name":"Phase (matter)","level":2,"score":0.26010000705718994},{"id":"https://openalex.org/C104267543","wikidata":"https://www.wikidata.org/wiki/Q208163","display_name":"Signal processing","level":3,"score":0.2572999894618988},{"id":"https://openalex.org/C14999030","wikidata":"https://www.wikidata.org/wiki/Q16346","display_name":"Speech synthesis","level":2,"score":0.2549999952316284},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.25220000743865967}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1109/taslp.2020.2987429","is_oa":true,"landing_page_url":"https://doi.org/10.1109/taslp.2020.2987429","pdf_url":"https://ieeexplore.ieee.org/ielx7/6570655/8938144/09067003.pdf","source":{"id":"https://openalex.org/S4210169297","display_name":"IEEE/ACM Transactions on Audio Speech and Language Processing","issn_l":"2329-9290","issn":["2329-9290","2329-9304"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE/ACM Transactions on Audio, Speech, and Language Processing","raw_type":"journal-article"},{"id":"pmh:oai:arXiv.org:2004.08326","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2004.08326","pdf_url":"https://arxiv.org/pdf/2004.08326","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"}],"best_oa_location":{"id":"doi:10.1109/taslp.2020.2987429","is_oa":true,"landing_page_url":"https://doi.org/10.1109/taslp.2020.2987429","pdf_url":"https://ieeexplore.ieee.org/ielx7/6570655/8938144/09067003.pdf","source":{"id":"https://openalex.org/S4210169297","display_name":"IEEE/ACM Transactions on Audio Speech and Language Processing","issn_l":"2329-9290","issn":["2329-9290","2329-9304"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE/ACM Transactions on Audio, Speech, and Language Processing","raw_type":"journal-article"},"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G3191157935","display_name":null,"funder_award_id":"AISG-100E-2018-006","funder_id":"https://openalex.org/F4320320709","funder_display_name":"National Research Foundation Singapore"},{"id":"https://openalex.org/G4092667103","display_name":null,"funder_award_id":"A1687b0033","funder_id":"https://openalex.org/F4320320709","funder_display_name":"National Research Foundation Singapore"}],"funders":[{"id":"https://openalex.org/F4320320671","display_name":"National Research Foundation","ror":"https://ror.org/05s0g1g46"},{"id":"https://openalex.org/F4320320709","display_name":"National Research Foundation Singapore","ror":"https://ror.org/03cpyc314"}],"has_content":{"pdf":true,"grobid_xml":true},"content_urls":{"pdf":"https://content.openalex.org/works/W3016361963.pdf","grobid_xml":"https://content.openalex.org/works/W3016361963.grobid-xml"},"referenced_works_count":80,"referenced_works":["https://openalex.org/W1482149378","https://openalex.org/W1552314771","https://openalex.org/W1790748249","https://openalex.org/W1946152311","https://openalex.org/W1971146038","https://openalex.org/W1986771495","https://openalex.org/W1991139021","https://openalex.org/W1992316897","https://openalex.org/W2050693797","https://openalex.org/W2059119686","https://openalex.org/W2069681747","https://openalex.org/W2082183045","https://openalex.org/W2086725969","https://openalex.org/W2107938580","https://openalex.org/W2123157731","https://openalex.org/W2124149378","https://openalex.org/W2127851351","https://openalex.org/W2130855892","https://openalex.org/W2136372468","https://openalex.org/W2141411743","https://openalex.org/W2147455188","https://openalex.org/W2150415460","https://openalex.org/W2150769028","https://openalex.org/W2156379660","https://openalex.org/W2158904676","https://openalex.org/W2221409856","https://openalex.org/W2331131332","https://openalex.org/W2397725648","https://openalex.org/W2460742184","https://openalex.org/W2529093176","https://openalex.org/W2547553232","https://openalex.org/W2558649592","https://openalex.org/W2563356726","https://openalex.org/W2594607416","https://openalex.org/W2609317876","https://openalex.org/W2734774145","https://openalex.org/W2735663686","https://openalex.org/W2766901645","https://openalex.org/W2787692317","https://openalex.org/W2800022361","https://openalex.org/W2800664709","https://openalex.org/W2888968865","https://openalex.org/W2889029567","https://openalex.org/W2889540509","https://openalex.org/W2891405874","https://openalex.org/W2891833136","https://openalex.org/W2892163332","https://openalex.org/W2895807593","https://openalex.org/W2924115626","https://openalex.org/W2936302822","https://openalex.org/W2938646939","https://openalex.org/W2939771864","https://openalex.org/W2940275453","https://openalex.org/W2946368709","https://openalex.org/W2952218014","https://openalex.org/W2962715207","https://openalex.org/W2962905190","https://openalex.org/W2962935966","https://openalex.org/W2963045393","https://openalex.org/W2964058413","https://openalex.org/W2964238697","https://openalex.org/W2973062255","https://openalex.org/W3008003372","https://openalex.org/W3015199127","https://openalex.org/W4290457325","https://openalex.org/W6603255414","https://openalex.org/W6607486085","https://openalex.org/W6631190155","https://openalex.org/W6633302090","https://openalex.org/W6677759377","https://openalex.org/W6681474505","https://openalex.org/W6682181234","https://openalex.org/W6702602408","https://openalex.org/W6712560600","https://openalex.org/W6733275487","https://openalex.org/W6746567100","https://openalex.org/W6748559479","https://openalex.org/W6754198086","https://openalex.org/W6754743415","https://openalex.org/W6767696903"],"related_works":[],"abstract_inverted_index":{"Speaker":[0],"extraction":[1,25,63],"aims":[2],"to":[3,22,127],"mimic":[4],"humans'":[5],"selective":[6],"auditory":[7],"attention":[8],"by":[9,48,56],"extracting":[10],"a":[11,16,60,133,152,172,177],"target":[12,130,145,161],"speaker's":[13,162],"voice":[14],"from":[15,33,164],"multi-talker":[17],"environment.":[18],"It":[19],"is":[20,45],"common":[21],"perform":[23],"the":[24,30,34,49,68,78,112,116,123,129,140,156,160,165,185,196],"in":[26,199],"frequency-domain,":[27],"and":[28,37,83,108,144,150,176,191,208],"reconstruct":[29],"time-domain":[31,61],"signal":[32,80],"extracted":[35],"magnitude":[36,82],"estimated":[38],"phase":[39,53,84,91],"spectra.":[40,85],"However,":[41],"such":[42],"an":[43,216],"approach":[44],"adversely":[46],"affected":[47],"inherent":[50],"difficulty":[51],"of":[52,76,97,201,211],"estimation.":[54,92],"Inspired":[55],"Conv-TasNet,":[57],"we":[58,89],"propose":[59,171],"speaker":[62,102,106,124,131,134,137,146],"network":[64,95,99],"(SpEx)":[65],"that":[66,184],"converts":[67,115],"mixture":[69,117],"speech":[70,79,104,109,113,118,157,163,212],"into":[71,81,119],"multi-scale":[72,120,141,178],"embedding":[73,121,142,147,167,179],"coefficients":[74,143],"instead":[75],"decomposing":[77],"In":[86],"this":[87],"way,":[88],"avoid":[90],"The":[93,136],"SpEx":[94,187],"consists":[96],"four":[98],"components,":[100],"namely":[101],"encoder,":[103,105],"extractor,":[107],"decoder.":[110],"Specifically,":[111],"encoder":[114,125],"coefficients,":[122],"learns":[126],"represent":[128],"with":[132],"embedding.":[135],"extractor":[138],"takes":[139],"as":[148],"input":[149],"estimates":[151],"receptive":[153],"mask.":[154],"Finally,":[155],"decoder":[158],"reconstructs":[159],"masked":[166],"coefficients.":[168],"We":[169],"also":[170],"multi-task":[173],"learning":[174],"framework":[175],"implementation.":[180],"Experimental":[181],"results":[182],"show":[183],"proposed":[186],"achieves":[188],"37.3%,":[189],"37.7%":[190],"15.0%":[192],"relative":[193],"improvements":[194],"over":[195],"best":[197],"baseline":[198],"terms":[200],"signal-to-distortion":[202],"ratio":[203],"(SDR),":[204],"scale-invariant":[205],"SDR":[206],"(SI-SDR),":[207],"perceptual":[209],"evaluation":[210,218],"quality":[213],"(PESQ)":[214],"under":[215],"open":[217],"condition.":[219]},"counts_by_year":[{"year":2026,"cited_by_count":13},{"year":2025,"cited_by_count":33},{"year":2024,"cited_by_count":40},{"year":2023,"cited_by_count":26},{"year":2022,"cited_by_count":22},{"year":2021,"cited_by_count":15},{"year":2020,"cited_by_count":2}],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2020-04-24T00:00:00"}
