{"id":"https://openalex.org/W7165687132","doi":"https://doi.org/10.48550/arxiv.2603.12837","title":"Mask2Flow-TSE: Two-Stage Target Speaker Extraction with Masking and Flow Matching","display_name":"Mask2Flow-TSE: Two-Stage Target Speaker Extraction with Masking and Flow Matching","publication_year":2026,"publication_date":"2026-03-13","ids":{"openalex":"https://openalex.org/W7165687132","doi":"https://doi.org/10.48550/arxiv.2603.12837"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.12837","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.12837","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Preprint"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.12837","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5139192076","display_name":"Junwon Moon","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Moon, Junwon","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5139142811","display_name":"Seungbeom Kim","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kim, Seungbeom","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126322125","display_name":"Hansol Park","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Park, Hansol","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129665296","display_name":"Hyunjin Choi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Choi, Hyunjin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5112439272","display_name":"Hoseong Ahn","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ahn, Hoseong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5139130787","display_name":"Heeseung Kim","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kim, Heeseung","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5139205925","display_name":"Kyuhong Shim","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shim, Kyuhong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":0,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.8896999955177307,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.8896999955177307,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.07329999655485153,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10863","display_name":"Voice and Speech Disorders","score":0.007799999788403511,"subfield":{"id":"https://openalex.org/subfields/2737","display_name":"Physiology"},"field":{"id":"https://openalex.org/fields/27","display_name":"Medicine"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.513700008392334},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.44589999318122864},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.43549999594688416},{"id":"https://openalex.org/keywords/generative-model","display_name":"Generative model","score":0.4302000105381012},{"id":"https://openalex.org/keywords/masking","display_name":"Masking (illustration)","score":0.42669999599456787},{"id":"https://openalex.org/keywords/matching","display_name":"Matching (statistics)","score":0.3880999982357025},{"id":"https://openalex.org/keywords/feature-extraction","display_name":"Feature extraction","score":0.35929998755455017},{"id":"https://openalex.org/keywords/convolution","display_name":"Convolution (computer science)","score":0.3544999957084656},{"id":"https://openalex.org/keywords/speaker-recognition","display_name":"Speaker recognition","score":0.3515999913215637}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8118000030517578},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6980999708175659},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.513700008392334},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.44589999318122864},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.43950000405311584},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.43549999594688416},{"id":"https://openalex.org/C167966045","wikidata":"https://www.wikidata.org/wiki/Q5532625","display_name":"Generative model","level":3,"score":0.4302000105381012},{"id":"https://openalex.org/C2777402240","wikidata":"https://www.wikidata.org/wiki/Q6783436","display_name":"Masking (illustration)","level":2,"score":0.42669999599456787},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.3880999982357025},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.35929998755455017},{"id":"https://openalex.org/C45347329","wikidata":"https://www.wikidata.org/wiki/Q5166604","display_name":"Convolution (computer science)","level":3,"score":0.3544999957084656},{"id":"https://openalex.org/C133892786","wikidata":"https://www.wikidata.org/wiki/Q1145189","display_name":"Speaker recognition","level":2,"score":0.3515999913215637},{"id":"https://openalex.org/C3020376581","wikidata":"https://www.wikidata.org/wiki/Q16866784","display_name":"Single stage","level":2,"score":0.32679998874664307},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.3260999917984009},{"id":"https://openalex.org/C104267543","wikidata":"https://www.wikidata.org/wiki/Q208163","display_name":"Signal processing","level":3,"score":0.31610000133514404},{"id":"https://openalex.org/C163716315","wikidata":"https://www.wikidata.org/wiki/Q901177","display_name":"Gaussian","level":2,"score":0.31130000948905945},{"id":"https://openalex.org/C2779843651","wikidata":"https://www.wikidata.org/wiki/Q7390335","display_name":"SIGNAL (programming language)","level":2,"score":0.305400013923645},{"id":"https://openalex.org/C2779231336","wikidata":"https://www.wikidata.org/wiki/Q7534724","display_name":"Sketch","level":2,"score":0.29739999771118164},{"id":"https://openalex.org/C23224414","wikidata":"https://www.wikidata.org/wiki/Q176769","display_name":"Hidden Markov model","level":2,"score":0.28369998931884766},{"id":"https://openalex.org/C14999030","wikidata":"https://www.wikidata.org/wiki/Q16346","display_name":"Speech synthesis","level":2,"score":0.2775999903678894},{"id":"https://openalex.org/C175291020","wikidata":"https://www.wikidata.org/wiki/Q1156822","display_name":"Offset (computer science)","level":2,"score":0.27399998903274536},{"id":"https://openalex.org/C38349280","wikidata":"https://www.wikidata.org/wiki/Q1434290","display_name":"Flow (mathematics)","level":2,"score":0.27250000834465027},{"id":"https://openalex.org/C102894143","wikidata":"https://www.wikidata.org/wiki/Q1323979","display_name":"Monaural","level":2,"score":0.2678999900817871},{"id":"https://openalex.org/C39890363","wikidata":"https://www.wikidata.org/wiki/Q36108","display_name":"Generative grammar","level":2,"score":0.26100000739097595},{"id":"https://openalex.org/C100515483","wikidata":"https://www.wikidata.org/wiki/Q3268235","display_name":"Filter bank","level":3,"score":0.2605000138282776},{"id":"https://openalex.org/C61224824","wikidata":"https://www.wikidata.org/wiki/Q2260434","display_name":"Mixture model","level":2,"score":0.2596000134944916},{"id":"https://openalex.org/C57273362","wikidata":"https://www.wikidata.org/wiki/Q576722","display_name":"Decoding methods","level":2,"score":0.2524000108242035}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.12837","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.12837","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"Preprint"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.12837","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.12837","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Preprint"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Target":[0],"speaker":[1,150],"extraction":[2],"(TSE)":[3],"extracts":[4],"the":[5,38,70,77,110,120,127,134,138,164],"target":[6],"speaker's":[7],"voice":[8],"from":[9,25,104,157,163],"overlapping":[10],"speech":[11,48],"given":[12],"a":[13,66,106,115,142,171],"reference":[14],"utterance.":[15],"Existing":[16],"masking-based":[17,107],"approaches":[18,154],"are":[19],"lightweight":[20,131],"and":[21,61,186],"effective":[22],"but":[23,51],"suffer":[24],"an":[26,81],"inability":[27],"to":[28,33],"synthesize":[29,46,95],"missing":[30],"content,":[31],"leading":[32],"degraded":[34],"perceptual":[35],"quality.":[36],"On":[37],"other":[39],"hand,":[40],"recent":[41],"generative":[42],"TSE":[43,148],"models":[44],"typically":[45],"high-quality":[47,124,168,180],"with":[49,149,182,194],"diffusion,":[50],"require":[52],"numerous":[53],"iterative":[54],"steps":[55,88],"resulting":[56],"in":[57,170],"high":[58],"computational":[59],"costs":[60],"latency.":[62],"We":[63,75],"propose":[64],"Mask2Flow-TSE,":[65],"two-stage":[67],"framework":[68],"combining":[69],"strengths":[71],"of":[72],"both":[73],"paradigms.":[74],"introduce":[76],"deletion/insertion":[78],"(D/I)":[79],"proportion,":[80],"analytical":[82],"tool":[83],"that":[84,155,177],"reveals":[85],"early":[86,112],"flow":[87],"predominantly":[89],"remove":[90],"signal":[91],"components":[92],"rather":[93],"than":[94],"them.":[96],"Based":[97],"on":[98],"this":[99],"finding,":[100],"we":[101],"decouple":[102],"deletion":[103],"insertion:":[105],"module":[108],"handles":[109],"deletion-dominant":[111],"steps,":[113],"while":[114,137,189],"single":[116,172],"flow-matching":[117],"step":[118],"performs":[119],"remaining":[121],"insertion":[122],"for":[123,133,147],"reconstruction.":[125],"Specifically,":[126],"first":[128],"stage":[129,140],"uses":[130],"convolution":[132],"masking":[135],"module,":[136],"second":[139],"employs":[141],"Diffusion":[143],"Transformer":[144],"(DiT)":[145],"adapted":[146],"conditioning.":[151],"Unlike":[152],"prior":[153],"start":[156],"Gaussian":[158],"noise,":[159],"our":[160],"method":[161],"starts":[162],"masked":[165],"spectrogram,":[166],"enabling":[167],"reconstruction":[169],"inference":[173],"step.":[174],"Experiments":[175],"show":[176],"Mask2Flow-TSE":[178],"produces":[179],"extractions":[181],"only":[183],"85M":[184],"parameters":[185],"one-step":[187],"inference,":[188],"preserving":[190],"clean":[191],"single-speaker":[192],"inputs":[193],"minimal":[195],"degradation.":[196]},"counts_by_year":[],"updated_date":"2026-07-01T06:00:48.157686","created_date":"2026-06-24T00:00:00"}
