{"id":"https://openalex.org/W4297841514","doi":"https://doi.org/10.21437/interspeech.2022-10843","title":"Directed speech separation for automatic speech recognition of long form conversational speech","display_name":"Directed speech separation for automatic speech recognition of long form conversational speech","publication_year":2022,"publication_date":"2022-09-16","ids":{"openalex":"https://openalex.org/W4297841514","doi":"https://doi.org/10.21437/interspeech.2022-10843"},"language":"en","primary_location":{"id":"doi:10.21437/interspeech.2022-10843","is_oa":false,"landing_page_url":"https://doi.org/10.21437/interspeech.2022-10843","pdf_url":null,"source":{"id":"https://openalex.org/S4363604309","display_name":"Interspeech 2022","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Interspeech 2022","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5052189588","display_name":"Rohit Paturi","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Rohit Paturi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5047128701","display_name":"Sundararajan Srinivasan","orcid":"https://orcid.org/0000-0002-3387-9889"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sundararajan Srinivasan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5050325468","display_name":"Katrin Kirchhoff","orcid":"https://orcid.org/0000-0002-6645-6030"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Katrin Kirchhoff","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5000143881","display_name":"Daniel Garcia-Romero","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Daniel Garcia-Romero","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5052189588"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.3682,"has_fulltext":false,"cited_by_count":3,"citation_normalized_percentile":{"value":0.51248025,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":91,"max":96},"biblio":{"volume":null,"issue":null,"first_page":"5388","last_page":"5392"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9994999766349792,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9968000054359436,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.7814791798591614},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7492363452911377},{"id":"https://openalex.org/keywords/speech-processing","display_name":"Speech processing","score":0.4753815233707428},{"id":"https://openalex.org/keywords/speech-technology","display_name":"Speech technology","score":0.4320598840713501},{"id":"https://openalex.org/keywords/speech-synthesis","display_name":"Speech synthesis","score":0.4277377426624298},{"id":"https://openalex.org/keywords/voice-activity-detection","display_name":"Voice activity detection","score":0.4161268472671509},{"id":"https://openalex.org/keywords/speech-corpus","display_name":"Speech corpus","score":0.41191366314888},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.39611995220184326},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.3249000310897827}],"concepts":[{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.7814791798591614},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7492363452911377},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.4753815233707428},{"id":"https://openalex.org/C504749915","wikidata":"https://www.wikidata.org/wiki/Q9010971","display_name":"Speech technology","level":3,"score":0.4320598840713501},{"id":"https://openalex.org/C14999030","wikidata":"https://www.wikidata.org/wiki/Q16346","display_name":"Speech synthesis","level":2,"score":0.4277377426624298},{"id":"https://openalex.org/C204201278","wikidata":"https://www.wikidata.org/wiki/Q1332614","display_name":"Voice activity detection","level":3,"score":0.4161268472671509},{"id":"https://openalex.org/C91863865","wikidata":"https://www.wikidata.org/wiki/Q4349497","display_name":"Speech corpus","level":3,"score":0.41191366314888},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.39611995220184326},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3249000310897827}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.21437/interspeech.2022-10843","is_oa":false,"landing_page_url":"https://doi.org/10.21437/interspeech.2022-10843","pdf_url":null,"source":{"id":"https://openalex.org/S4363604309","display_name":"Interspeech 2022","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Interspeech 2022","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":35,"referenced_works":["https://openalex.org/W1524333225","https://openalex.org/W2221409856","https://openalex.org/W2533523411","https://openalex.org/W2734774145","https://openalex.org/W2735663686","https://openalex.org/W2757672955","https://openalex.org/W2884797218","https://openalex.org/W2889503488","https://openalex.org/W2891405874","https://openalex.org/W2943934118","https://openalex.org/W2951130829","https://openalex.org/W2952218014","https://openalex.org/W2962866211","https://openalex.org/W2963266252","https://openalex.org/W2963470929","https://openalex.org/W2964058413","https://openalex.org/W2973062255","https://openalex.org/W3013020904","https://openalex.org/W3015199127","https://openalex.org/W3015700067","https://openalex.org/W3016232124","https://openalex.org/W3027008958","https://openalex.org/W3035268204","https://openalex.org/W3096893582","https://openalex.org/W3097643313","https://openalex.org/W3112929926","https://openalex.org/W3143843080","https://openalex.org/W3162354890","https://openalex.org/W3163652268","https://openalex.org/W3185109982","https://openalex.org/W3191555358","https://openalex.org/W3197381252","https://openalex.org/W3200955206","https://openalex.org/W4287119999","https://openalex.org/W4287240843"],"related_works":["https://openalex.org/W4200068392","https://openalex.org/W2184371793","https://openalex.org/W2550171623","https://openalex.org/W642007152","https://openalex.org/W2537969829","https://openalex.org/W596245619","https://openalex.org/W2008120082","https://openalex.org/W2184127972","https://openalex.org/W4388404911","https://openalex.org/W134179020"],"abstract_inverted_index":{"Many":[0],"of":[1,14,21,23,43,53,78,122],"the":[2,33,44,51,109,120,123,127],"recent":[3],"advances":[4],"in":[5,63,162],"speech":[6,35,55],"separation":[7],"are":[8,81],"primarily":[9],"aimed":[10],"at":[11],"synthetic":[12,84,143],"mixtures":[13,85,144],"short":[15],"audio":[16],"utterances":[17],"with":[18,83,140],"high":[19],"degrees":[20],"overlap.Most":[22],"these":[24,79],"approaches":[25,45],"need":[26,128],"an":[27,113,130],"additional":[28,131],"stitching":[29,65,132],"step":[30],"to":[31,61,90,148],"stitch":[32],"separated":[34,54,124],"chunks":[36,56,68,125],"for":[37,69,129],"long":[38],"form":[39],"audio.":[40],"Since":[41],"most":[42,77],"involve":[46],"Permutation":[47],"Invariant":[48],"training":[49],"(PIT),":[50],"order":[52,121],"is":[57],"nondeterministic":[58],"and":[59,86,142,154],"leads":[60],"difficulty":[62],"accurately":[64],"homogenous":[66],"speaker":[67,99,104],"downstream":[70],"tasks":[71],"like":[72],"Automatic":[73],"Speech":[74],"Recognition":[75],"(ASR).Also,":[76],"models":[80],"trained":[82,102],"do":[87],"not":[88],"generalize":[89],"real":[91,141,149],"conversational":[92],"data.In":[93],"this":[94,152],"paper,":[95],"we":[96,158],"propose":[97],"a":[98,136],"conditioned":[100],"separator":[101],"on":[103,168],"embeddings":[105],"extracted":[106],"directly":[107],"from":[108],"mixed":[110],"signal":[111],"using":[112],"over-clustering":[114],"based":[115],"approach.This":[116],"model":[117,153],"naturally":[118],"regulates":[119],"without":[126],"step.We":[133],"also":[134],"introduce":[135],"data":[137,155],"sampling":[138,156],"strategy":[139],"which":[145],"generalizes":[146],"well":[147],"conversation":[150],"speech.With":[151],"technique,":[157],"show":[159],"significant":[160],"improvements":[161],"speaker-attributed":[163],"word":[164],"error":[165],"rate":[166],"(SA-WER)":[167],"Hub5":[169],"data.":[170]},"counts_by_year":[{"year":2025,"cited_by_count":1},{"year":2023,"cited_by_count":2}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
