{"id":"https://openalex.org/W4226021932","doi":"https://doi.org/10.21437/interspeech.2022-176","title":"Target Confusion in End-to-end Speaker Extraction: Analysis and Approaches","display_name":"Target Confusion in End-to-end Speaker Extraction: Analysis and Approaches","publication_year":2022,"publication_date":"2022-09-16","ids":{"openalex":"https://openalex.org/W4226021932","doi":"https://doi.org/10.21437/interspeech.2022-176"},"language":"en","primary_location":{"id":"doi:10.21437/interspeech.2022-176","is_oa":false,"landing_page_url":"https://doi.org/10.21437/interspeech.2022-176","pdf_url":null,"source":{"id":"https://openalex.org/S4363604309","display_name":"Interspeech 2022","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Interspeech 2022","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100962843","display_name":"Zifeng Zhao","orcid":null},"institutions":[{"id":"https://openalex.org/I20231570","display_name":"Peking University","ror":"https://ror.org/02v51f717","country_code":"CN","type":"education","lineage":["https://openalex.org/I20231570"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Zifeng Zhao","raw_affiliation_strings":["ADSPLAB, School of ECE, Peking University, Shenzhen, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"ADSPLAB, School of ECE, Peking University, Shenzhen, China","institution_ids":["https://openalex.org/I20231570"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5043555011","display_name":"Dongchao Yang","orcid":"https://orcid.org/0000-0002-8905-224X"},"institutions":[{"id":"https://openalex.org/I20231570","display_name":"Peking University","ror":"https://ror.org/02v51f717","country_code":"CN","type":"education","lineage":["https://openalex.org/I20231570"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Dongchao Yang","raw_affiliation_strings":["ADSPLAB, School of ECE, Peking University, Shenzhen, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"ADSPLAB, School of ECE, Peking University, Shenzhen, China","institution_ids":["https://openalex.org/I20231570"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5038895203","display_name":"Rongzhi Gu","orcid":"https://orcid.org/0000-0003-1861-9170"},"institutions":[{"id":"https://openalex.org/I20231570","display_name":"Peking University","ror":"https://ror.org/02v51f717","country_code":"CN","type":"education","lineage":["https://openalex.org/I20231570"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Rongzhi Gu","raw_affiliation_strings":["ADSPLAB, School of ECE, Peking University, Shenzhen, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"ADSPLAB, School of ECE, Peking University, Shenzhen, China","institution_ids":["https://openalex.org/I20231570"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100340470","display_name":"Haoran Zhang","orcid":"https://orcid.org/0000-0001-5944-8641"},"institutions":[{"id":"https://openalex.org/I20231570","display_name":"Peking University","ror":"https://ror.org/02v51f717","country_code":"CN","type":"education","lineage":["https://openalex.org/I20231570"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Haoran Zhang","raw_affiliation_strings":["ADSPLAB, School of ECE, Peking University, Shenzhen, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"ADSPLAB, School of ECE, Peking University, Shenzhen, China","institution_ids":["https://openalex.org/I20231570"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5002795838","display_name":"Yuexian Zou","orcid":"https://orcid.org/0000-0001-9999-6140"},"institutions":[{"id":"https://openalex.org/I20231570","display_name":"Peking University","ror":"https://ror.org/02v51f717","country_code":"CN","type":"education","lineage":["https://openalex.org/I20231570"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yuexian Zou","raw_affiliation_strings":["ADSPLAB, School of ECE, Peking University, Shenzhen, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"ADSPLAB, School of ECE, Peking University, Shenzhen, China","institution_ids":["https://openalex.org/I20231570"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5100962843"],"corresponding_institution_ids":["https://openalex.org/I20231570"],"apc_list":null,"apc_paid":null,"fwci":1.8776,"has_fulltext":false,"cited_by_count":20,"citation_normalized_percentile":{"value":0.8754107,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":97,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"5333","last_page":"5337"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9987999796867371,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9987999796867371,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9987999796867371,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9879999756813049,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/end-to-end-principle","display_name":"End-to-end principle","score":0.7702186107635498},{"id":"https://openalex.org/keywords/confusion","display_name":"Confusion","score":0.6721457242965698},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.5884155035018921},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.5257447957992554},{"id":"https://openalex.org/keywords/speaker-recognition","display_name":"Speaker recognition","score":0.44365009665489197},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.24444225430488586},{"id":"https://openalex.org/keywords/psychology","display_name":"Psychology","score":0.10137411952018738}],"concepts":[{"id":"https://openalex.org/C74296488","wikidata":"https://www.wikidata.org/wiki/Q2527392","display_name":"End-to-end principle","level":2,"score":0.7702186107635498},{"id":"https://openalex.org/C2781140086","wikidata":"https://www.wikidata.org/wiki/Q557945","display_name":"Confusion","level":2,"score":0.6721457242965698},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5884155035018921},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5257447957992554},{"id":"https://openalex.org/C133892786","wikidata":"https://www.wikidata.org/wiki/Q1145189","display_name":"Speaker recognition","level":2,"score":0.44365009665489197},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.24444225430488586},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.10137411952018738},{"id":"https://openalex.org/C11171543","wikidata":"https://www.wikidata.org/wiki/Q41630","display_name":"Psychoanalysis","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.21437/interspeech.2022-176","is_oa":false,"landing_page_url":"https://doi.org/10.21437/interspeech.2022-176","pdf_url":null,"source":{"id":"https://openalex.org/S4363604309","display_name":"Interspeech 2022","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Interspeech 2022","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","score":0.7900000214576721,"display_name":"Quality Education"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":21,"referenced_works":["https://openalex.org/W1552314771","https://openalex.org/W2460742184","https://openalex.org/W2601450892","https://openalex.org/W2734774145","https://openalex.org/W2802973008","https://openalex.org/W2808631503","https://openalex.org/W2952218014","https://openalex.org/W2962866211","https://openalex.org/W2963371159","https://openalex.org/W2963386851","https://openalex.org/W2964058413","https://openalex.org/W3013020904","https://openalex.org/W3015636705","https://openalex.org/W3024869864","https://openalex.org/W3027008958","https://openalex.org/W3094806148","https://openalex.org/W3097653961","https://openalex.org/W3097797867","https://openalex.org/W3103434036","https://openalex.org/W3162534564","https://openalex.org/W4226115251"],"related_works":["https://openalex.org/W1491159402","https://openalex.org/W4297807400","https://openalex.org/W4313854686","https://openalex.org/W2499802997","https://openalex.org/W3162054169","https://openalex.org/W1813780412","https://openalex.org/W289407349","https://openalex.org/W2029134149","https://openalex.org/W2368768466","https://openalex.org/W2757081366"],"abstract_inverted_index":{"Recently,":[0],"end-to-end":[1],"speaker":[2,35,41,106],"extraction":[3,56],"has":[4],"attracted":[5],"increasing":[6],"attention":[7],"and":[8,51,81,134,168],"shown":[9],"promising":[10],"results.However,":[11],"its":[12],"performance":[13,151],"is":[14,114],"often":[15],"inferior":[16],"to":[17,32,54,64,92,97,116],"that":[18,150],"of":[19,77,101,153,165,172],"a":[20,27,110,146],"blind":[21],"source":[22],"separation":[23,49],"(BSS)":[24],"counterpart":[25],"with":[26],"similar":[28],"network":[29,50],"architecture,":[30],"due":[31],"the":[33,48,60,67,87,99,105,118,129,139,163,170,173],"auxiliary":[34],"encoder":[36],"may":[37,46],"sometimes":[38],"generate":[39],"ambiguous":[40,43],"embeddings.Such":[42],"guidance":[44],"information":[45],"confuse":[47],"hence":[52],"lead":[53],"wrong":[55,119],"results,":[57],"which":[58,138,161],"deteriorates":[59],"overall":[61],"performance.We":[62],"refer":[63],"this":[65,71],"as":[66],"target":[68,141,174],"confusion":[69,125,175],"problem.In":[70],"paper,":[72],"we":[73,90,121],"conduct":[74],"an":[75,79],"analysis":[76],"such":[78],"issue":[80],"solve":[82],"it":[83],"in":[84],"two":[85],"stages.In":[86],"training":[88],"phase,":[89],"propose":[91],"integrate":[93],"metric":[94],"learning":[95],"methods":[96,167],"improve":[98],"distinguishability":[100],"embeddings":[102],"produced":[103],"by":[104,127,145],"encoder.While":[107],"for":[108],"inference,":[109],"novel":[111],"post-filtering":[112],"strategy":[113],"designed":[115],"revise":[117],"results.Specifically,":[120],"first":[122],"identify":[123],"these":[124],"samples":[126],"measuring":[128],"similarities":[130],"between":[131],"output":[132],"estimates":[133],"enrollment":[135],"utterances,":[136],"after":[137],"true":[140],"sources":[142],"are":[143],"recovered":[144],"subtraction":[147],"operation.Experiments":[148],"show":[149],"improvement":[152],"more":[154],"than":[155],"1dB":[156],"SI-SDRi":[157],"can":[158],"be":[159],"brought,":[160],"validates":[162],"effectiveness":[164],"our":[166],"emphasizes":[169],"impact":[171],"problem":[176],"1":[177],".":[178]},"counts_by_year":[{"year":2026,"cited_by_count":2},{"year":2025,"cited_by_count":8},{"year":2024,"cited_by_count":6},{"year":2023,"cited_by_count":4}],"updated_date":"2026-04-23T06:14:38.165362","created_date":"2025-10-10T00:00:00"}
