{"id":"https://openalex.org/W7125604262","doi":"https://doi.org/10.1109/lsp.2026.3657998","title":"Target Speaker Extraction Using Multi-Stage Cross-Attention and Frequency-Wise State Initialization","display_name":"Target Speaker Extraction Using Multi-Stage Cross-Attention and Frequency-Wise State Initialization","publication_year":2026,"publication_date":"2026-01-01","ids":{"openalex":"https://openalex.org/W7125604262","doi":"https://doi.org/10.1109/lsp.2026.3657998"},"language":null,"primary_location":{"id":"doi:10.1109/lsp.2026.3657998","is_oa":false,"landing_page_url":"https://doi.org/10.1109/lsp.2026.3657998","pdf_url":null,"source":{"id":"https://openalex.org/S120629676","display_name":"IEEE Signal Processing Letters","issn_l":"1070-9908","issn":["1070-9908","1558-2361"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Signal Processing Letters","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5123738111","display_name":"Hyeonseung Kim","orcid":null},"institutions":[{"id":"https://openalex.org/I39534123","display_name":"Gwangju Institute of Science and Technology","ror":"https://ror.org/024kbgz78","country_code":"KR","type":"education","lineage":["https://openalex.org/I39534123"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Hyeonseung Kim","raw_affiliation_strings":["Department of Electrical Engineering and Computer Science, Gwangju Institute of Science and Technology, Gwangju, South Korea"],"raw_orcid":"https://orcid.org/0009-0003-0366-4611","affiliations":[{"raw_affiliation_string":"Department of Electrical Engineering and Computer Science, Gwangju Institute of Science and Technology, Gwangju, South Korea","institution_ids":["https://openalex.org/I39534123"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5123745114","display_name":"Jong Won Shin","orcid":null},"institutions":[{"id":"https://openalex.org/I39534123","display_name":"Gwangju Institute of Science and Technology","ror":"https://ror.org/024kbgz78","country_code":"KR","type":"education","lineage":["https://openalex.org/I39534123"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Jong Won Shin","raw_affiliation_strings":["Department of Electrical Engineering and Computer Science, Gwangju Institute of Science and Technology, Gwangju, South Korea"],"raw_orcid":"https://orcid.org/0000-0002-8910-0264","affiliations":[{"raw_affiliation_string":"Department of Electrical Engineering and Computer Science, Gwangju Institute of Science and Technology, Gwangju, South Korea","institution_ids":["https://openalex.org/I39534123"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":2,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.10392167,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"33","issue":null,"first_page":"773","last_page":"777"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.632099986076355,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.632099986076355,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.34279999136924744,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10863","display_name":"Voice and Speech Disorders","score":0.0044999998062849045,"subfield":{"id":"https://openalex.org/subfields/2737","display_name":"Physiology"},"field":{"id":"https://openalex.org/fields/27","display_name":"Medicine"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/initialization","display_name":"Initialization","score":0.7340999841690063},{"id":"https://openalex.org/keywords/speaker-recognition","display_name":"Speaker recognition","score":0.564300000667572},{"id":"https://openalex.org/keywords/computation","display_name":"Computation","score":0.5378000140190125},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.4966000020503998},{"id":"https://openalex.org/keywords/hidden-markov-model","display_name":"Hidden Markov model","score":0.4578000009059906},{"id":"https://openalex.org/keywords/feature-extraction","display_name":"Feature extraction","score":0.4496000111103058},{"id":"https://openalex.org/keywords/frequency-domain","display_name":"Frequency domain","score":0.4223000109195709},{"id":"https://openalex.org/keywords/speech-processing","display_name":"Speech processing","score":0.4027999937534332}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7605000138282776},{"id":"https://openalex.org/C114466953","wikidata":"https://www.wikidata.org/wiki/Q6034165","display_name":"Initialization","level":2,"score":0.7340999841690063},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.7186999917030334},{"id":"https://openalex.org/C133892786","wikidata":"https://www.wikidata.org/wiki/Q1145189","display_name":"Speaker recognition","level":2,"score":0.564300000667572},{"id":"https://openalex.org/C45374587","wikidata":"https://www.wikidata.org/wiki/Q12525525","display_name":"Computation","level":2,"score":0.5378000140190125},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.4966000020503998},{"id":"https://openalex.org/C23224414","wikidata":"https://www.wikidata.org/wiki/Q176769","display_name":"Hidden Markov model","level":2,"score":0.4578000009059906},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.4496000111103058},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4334999918937683},{"id":"https://openalex.org/C19118579","wikidata":"https://www.wikidata.org/wiki/Q786423","display_name":"Frequency domain","level":2,"score":0.4223000109195709},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.4027999937534332},{"id":"https://openalex.org/C103734657","wikidata":"https://www.wikidata.org/wiki/Q2739975","display_name":"PESQ","level":4,"score":0.39399999380111694},{"id":"https://openalex.org/C61224824","wikidata":"https://www.wikidata.org/wiki/Q2260434","display_name":"Mixture model","level":2,"score":0.37400001287460327},{"id":"https://openalex.org/C48103436","wikidata":"https://www.wikidata.org/wiki/Q599031","display_name":"State (computer science)","level":2,"score":0.3409000039100647},{"id":"https://openalex.org/C157138929","wikidata":"https://www.wikidata.org/wiki/Q570","display_name":"Loudspeaker","level":2,"score":0.3402999937534332},{"id":"https://openalex.org/C204201278","wikidata":"https://www.wikidata.org/wiki/Q1332614","display_name":"Voice activity detection","level":3,"score":0.33500000834465027},{"id":"https://openalex.org/C149838564","wikidata":"https://www.wikidata.org/wiki/Q7574248","display_name":"Speaker diarisation","level":3,"score":0.32820001244544983},{"id":"https://openalex.org/C2982762665","wikidata":"https://www.wikidata.org/wiki/Q1145189","display_name":"Speaker verification","level":3,"score":0.31869998574256897},{"id":"https://openalex.org/C117220453","wikidata":"https://www.wikidata.org/wiki/Q5172842","display_name":"Correlation","level":2,"score":0.3001999855041504},{"id":"https://openalex.org/C13895895","wikidata":"https://www.wikidata.org/wiki/Q3270773","display_name":"Speech coding","level":2,"score":0.27810001373291016},{"id":"https://openalex.org/C142433447","wikidata":"https://www.wikidata.org/wiki/Q7806653","display_name":"Time\u2013frequency analysis","level":3,"score":0.27630001306533813},{"id":"https://openalex.org/C183322885","wikidata":"https://www.wikidata.org/wiki/Q17007702","display_name":"Context model","level":3,"score":0.2703999876976013},{"id":"https://openalex.org/C36503486","wikidata":"https://www.wikidata.org/wiki/Q11235244","display_name":"Domain (mathematical analysis)","level":2,"score":0.2551000118255615}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/lsp.2026.3657998","is_oa":false,"landing_page_url":"https://doi.org/10.1109/lsp.2026.3657998","pdf_url":null,"source":{"id":"https://openalex.org/S120629676","display_name":"IEEE Signal Processing Letters","issn_l":"1070-9908","issn":["1070-9908","1558-2361"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Signal Processing Letters","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[{"score":0.814653754234314,"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4"}],"awards":[{"id":"https://openalex.org/G8227940703","display_name":null,"funder_award_id":"IITP-2025-RS-2021-II211835","funder_id":"https://openalex.org/F4320328359","funder_display_name":"Ministry of Science and ICT, South Korea"}],"funders":[{"id":"https://openalex.org/F4320328359","display_name":"Ministry of Science and ICT, South Korea","ror":"https://ror.org/01wpjm123"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":34,"referenced_works":["https://openalex.org/W1552314771","https://openalex.org/W2127851351","https://openalex.org/W2734774145","https://openalex.org/W2938646939","https://openalex.org/W2952218014","https://openalex.org/W2962715207","https://openalex.org/W2964058413","https://openalex.org/W2972541922","https://openalex.org/W3015191643","https://openalex.org/W3015199127","https://openalex.org/W3016361963","https://openalex.org/W3096893582","https://openalex.org/W3097653961","https://openalex.org/W3097797867","https://openalex.org/W3162534564","https://openalex.org/W3198234746","https://openalex.org/W4224936432","https://openalex.org/W4224938485","https://openalex.org/W4372260086","https://openalex.org/W4372271325","https://openalex.org/W4375869119","https://openalex.org/W4385756463","https://openalex.org/W4385822827","https://openalex.org/W4385822829","https://openalex.org/W4392902910","https://openalex.org/W4392902992","https://openalex.org/W4392903066","https://openalex.org/W4393863141","https://openalex.org/W4400105722","https://openalex.org/W4401416441","https://openalex.org/W4406461495","https://openalex.org/W4408352717","https://openalex.org/W4410639153","https://openalex.org/W4413125141"],"related_works":[],"abstract_inverted_index":{"Several":[0],"recent":[1],"target":[2],"speaker":[3,15,23,32,112],"extraction":[4,33],"(TSE)":[5],"models":[6],"directly":[7],"utilize":[8],"enrollment":[9,105,152],"speech":[10,68,106],"without":[11,117],"explicitly":[12],"extracting":[13],"low-dimensional":[14],"embeddings.":[16],"However,":[17],"these":[18],"methods":[19],"typically":[20],"inject":[21],"the":[22,28,31,40,63,75,83,87,91,104,108,125,131,138,151,155,158,169,185,190],"information":[24,42,113],"only":[25],"once":[26],"at":[27],"input":[29],"of":[30,137,157],"network,":[34],"which":[35,65],"may":[36,161],"be":[37,162,179],"insufficient":[38],"because":[39],"conditioning":[41],"can":[43],"become":[44],"diluted":[45],"as":[46],"it":[47],"propagates":[48],"through":[49],"repeated":[50],"separator":[51,97],"blocks.":[52,128],"In":[53,82,129],"this":[54],"letter,":[55],"we":[56],"propose":[57],"a":[58,67],"TSE":[59,85,187],"model":[60,70,188],"built":[61],"upon":[62],"TF-GridNet,":[64],"is":[66],"separation":[69],"performing":[71],"dual-path":[72],"modeling":[73],"in":[74,90,114],"time-frequency":[76],"domain":[77],"with":[78,124,197],"cross-frame":[79],"self-attention":[80,88],"modules.":[81],"proposed":[84,186],"model,":[86],"modules":[89,144],"first":[92],"<inline-formula":[93],"xmlns:mml=\"http://www.w3.org/1998/Math/MathML\"":[94],"xmlns:xlink=\"http://www.w3.org/1999/xlink\"><tex-math":[95],"notation=\"LaTeX\">$M$</tex-math></inline-formula>":[96],"blocks":[98],"are":[99,145],"replaced":[100],"by":[101],"cross-attention":[102],"between":[103],"and":[107,134,171,194],"mixture":[109],"signal,":[110],"providing":[111],"multiple":[115],"stages":[116],"introducing":[118],"additional":[119],"parameters":[120],"or":[121],"computation":[122],"compared":[123],"original":[126],"TF-GridNet":[127],"addition,":[130],"initial":[132],"hidden":[133],"cell":[135],"states":[136],"inter-frame":[139],"long":[140],"short-term":[141],"memory":[142],"(LSTM)":[143],"determined":[146],"for":[147,164],"each":[148,165],"frequency":[149,166],"from":[150],"speech.":[153],"As":[154],"pattern":[156],"temporal":[159],"correlation":[160],"different":[163],"depending":[167],"on":[168],"pitch":[170],"speaking":[172],"style,":[173],"speaker-dependent":[174],"frequency-wise":[175],"state":[176],"initialization":[177],"would":[178],"helpful.":[180],"Experimental":[181],"results":[182],"showed":[183],"that":[184],"demonstrated":[189],"best":[191],"PESQ":[192],"scores":[193],"comparable":[195],"SI-SDRs":[196],"lower":[198],"computational":[199],"complexity.":[200]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-01-25T00:00:00"}
