{"id":"https://openalex.org/W2738829388","doi":"https://doi.org/10.1109/taslp.2017.2765834","title":"Progressive Joint Modeling in Unsupervised Single-Channel Overlapped Speech Recognition","display_name":"Progressive Joint Modeling in Unsupervised Single-Channel Overlapped Speech Recognition","publication_year":2017,"publication_date":"2017-10-23","ids":{"openalex":"https://openalex.org/W2738829388","doi":"https://doi.org/10.1109/taslp.2017.2765834","mag":"2738829388"},"language":"en","primary_location":{"id":"doi:10.1109/taslp.2017.2765834","is_oa":false,"landing_page_url":"https://doi.org/10.1109/taslp.2017.2765834","pdf_url":null,"source":{"id":"https://openalex.org/S4210169297","display_name":"IEEE/ACM Transactions on Audio Speech and Language Processing","issn_l":"2329-9290","issn":["2329-9290","2329-9304"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE/ACM Transactions on Audio, Speech, and Language Processing","raw_type":"journal-article"},"type":"article","indexed_in":["arxiv","crossref"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/1707.07048","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Zhehuai Chen","orcid":"https://orcid.org/0000-0003-4400-5340"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Zhehuai Chen","raw_affiliation_strings":["Computer Science and Engineering Department, Shanghai Jiao Tong University, Shanghai, China"],"affiliations":[{"raw_affiliation_string":"Computer Science and Engineering Department, Shanghai Jiao Tong University, Shanghai, China","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Jasha Droppo","orcid":null},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jasha Droppo","raw_affiliation_strings":["Computer Science and Engineering Department, Shanghai Jiao Tong University, Shanghai, China"],"affiliations":[{"raw_affiliation_string":"Computer Science and Engineering Department, Shanghai Jiao Tong University, Shanghai, China","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Jinyu Li","orcid":null},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jinyu Li","raw_affiliation_strings":["Computer Science and Engineering Department, Shanghai Jiao Tong University, Shanghai, China"],"affiliations":[{"raw_affiliation_string":"Computer Science and Engineering Department, Shanghai Jiao Tong University, Shanghai, China","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"last","author":{"id":null,"display_name":"Wayne Xiong","orcid":null},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Wayne Xiong","raw_affiliation_strings":["Computer Science and Engineering Department, Shanghai Jiao Tong University, Shanghai, China"],"affiliations":[{"raw_affiliation_string":"Computer Science and Engineering Department, Shanghai Jiao Tong University, Shanghai, China","institution_ids":["https://openalex.org/I183067930"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":[],"corresponding_institution_ids":["https://openalex.org/I183067930"],"apc_list":null,"apc_paid":null,"fwci":3.9463,"has_fulltext":false,"cited_by_count":35,"citation_normalized_percentile":{"value":0.94944178,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":91,"max":99},"biblio":{"volume":"26","issue":"1","first_page":"184","last_page":"196"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.5968999862670898,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.5968999862670898,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.37040001153945923,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10863","display_name":"Voice and Speech Disorders","score":0.007300000172108412,"subfield":{"id":"https://openalex.org/subfields/2737","display_name":"Physiology"},"field":{"id":"https://openalex.org/fields/27","display_name":"Medicine"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/discriminative-model","display_name":"Discriminative model","score":0.7876999974250793},{"id":"https://openalex.org/keywords/artificial-neural-network","display_name":"Artificial neural network","score":0.527999997138977},{"id":"https://openalex.org/keywords/modular-design","display_name":"Modular design","score":0.48919999599456787},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.42500001192092896},{"id":"https://openalex.org/keywords/transfer-of-learning","display_name":"Transfer of learning","score":0.4018999934196472},{"id":"https://openalex.org/keywords/joint","display_name":"Joint (building)","score":0.38040000200271606},{"id":"https://openalex.org/keywords/speaker-recognition","display_name":"Speaker recognition","score":0.37059998512268066},{"id":"https://openalex.org/keywords/acoustic-model","display_name":"Acoustic model","score":0.3628000020980835},{"id":"https://openalex.org/keywords/permutation","display_name":"Permutation (music)","score":0.35499998927116394}],"concepts":[{"id":"https://openalex.org/C97931131","wikidata":"https://www.wikidata.org/wiki/Q5282087","display_name":"Discriminative model","level":2,"score":0.7876999974250793},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7649000287055969},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6992999911308289},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.527999997138977},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5073999762535095},{"id":"https://openalex.org/C101468663","wikidata":"https://www.wikidata.org/wiki/Q1620158","display_name":"Modular design","level":2,"score":0.48919999599456787},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.42500001192092896},{"id":"https://openalex.org/C150899416","wikidata":"https://www.wikidata.org/wiki/Q1820378","display_name":"Transfer of learning","level":2,"score":0.4018999934196472},{"id":"https://openalex.org/C18555067","wikidata":"https://www.wikidata.org/wiki/Q8375051","display_name":"Joint (building)","level":2,"score":0.38040000200271606},{"id":"https://openalex.org/C133892786","wikidata":"https://www.wikidata.org/wiki/Q1145189","display_name":"Speaker recognition","level":2,"score":0.37059998512268066},{"id":"https://openalex.org/C155635449","wikidata":"https://www.wikidata.org/wiki/Q4674699","display_name":"Acoustic model","level":3,"score":0.3628000020980835},{"id":"https://openalex.org/C21308566","wikidata":"https://www.wikidata.org/wiki/Q7169365","display_name":"Permutation (music)","level":2,"score":0.35499998927116394},{"id":"https://openalex.org/C150856459","wikidata":"https://www.wikidata.org/wiki/Q8034367","display_name":"Word recognition","level":3,"score":0.3368000090122223},{"id":"https://openalex.org/C2781121602","wikidata":"https://www.wikidata.org/wiki/Q3504403","display_name":"Modular neural network","level":4,"score":0.33219999074935913},{"id":"https://openalex.org/C190470478","wikidata":"https://www.wikidata.org/wiki/Q2370229","display_name":"Invariant (physics)","level":2,"score":0.325300008058548},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.32440000772476196},{"id":"https://openalex.org/C90805587","wikidata":"https://www.wikidata.org/wiki/Q10944557","display_name":"Word (group theory)","level":2,"score":0.3239000141620636},{"id":"https://openalex.org/C175202392","wikidata":"https://www.wikidata.org/wiki/Q2434543","display_name":"Time delay neural network","level":3,"score":0.3050999939441681},{"id":"https://openalex.org/C40969351","wikidata":"https://www.wikidata.org/wiki/Q3516228","display_name":"Word error rate","level":2,"score":0.3012999892234802},{"id":"https://openalex.org/C48103436","wikidata":"https://www.wikidata.org/wiki/Q599031","display_name":"State (computer science)","level":2,"score":0.27630001306533813},{"id":"https://openalex.org/C35639132","wikidata":"https://www.wikidata.org/wiki/Q7452468","display_name":"Sequence labeling","level":3,"score":0.27480000257492065},{"id":"https://openalex.org/C2778112365","wikidata":"https://www.wikidata.org/wiki/Q3511065","display_name":"Sequence (biology)","level":2,"score":0.26600000262260437},{"id":"https://openalex.org/C204201278","wikidata":"https://www.wikidata.org/wiki/Q1332614","display_name":"Voice activity detection","level":3,"score":0.26350000500679016},{"id":"https://openalex.org/C8038995","wikidata":"https://www.wikidata.org/wiki/Q1152135","display_name":"Unsupervised learning","level":2,"score":0.25380000472068787},{"id":"https://openalex.org/C2984842247","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep neural networks","level":3,"score":0.250900000333786}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1109/taslp.2017.2765834","is_oa":false,"landing_page_url":"https://doi.org/10.1109/taslp.2017.2765834","pdf_url":null,"source":{"id":"https://openalex.org/S4210169297","display_name":"IEEE/ACM Transactions on Audio Speech and Language Processing","issn_l":"2329-9290","issn":["2329-9290","2329-9304"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE/ACM Transactions on Audio, Speech, and Language Processing","raw_type":"journal-article"},{"id":"pmh:oai:arXiv.org:1707.07048","is_oa":true,"landing_page_url":"http://arxiv.org/abs/1707.07048","pdf_url":"https://arxiv.org/pdf/1707.07048","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:1707.07048","is_oa":true,"landing_page_url":"http://arxiv.org/abs/1707.07048","pdf_url":"https://arxiv.org/pdf/1707.07048","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":42,"referenced_works":["https://openalex.org/W185399533","https://openalex.org/W1600744878","https://openalex.org/W1748744376","https://openalex.org/W1989549063","https://openalex.org/W1991139021","https://openalex.org/W2005708641","https://openalex.org/W2060822897","https://openalex.org/W2062164080","https://openalex.org/W2087368178","https://openalex.org/W2098318492","https://openalex.org/W2117671523","https://openalex.org/W2119203697","https://openalex.org/W2122028591","https://openalex.org/W2125234026","https://openalex.org/W2127141656","https://openalex.org/W2131342762","https://openalex.org/W2144763279","https://openalex.org/W2165698076","https://openalex.org/W2166637769","https://openalex.org/W2221409856","https://openalex.org/W2296073425","https://openalex.org/W2460742184","https://openalex.org/W2512865187","https://openalex.org/W2513383847","https://openalex.org/W2514741789","https://openalex.org/W2515863432","https://openalex.org/W2534495290","https://openalex.org/W2558649592","https://openalex.org/W2610674366","https://openalex.org/W2649558613","https://openalex.org/W2711861986","https://openalex.org/W2962715207","https://openalex.org/W2962894366","https://openalex.org/W2963773971","https://openalex.org/W4232280717","https://openalex.org/W4233392025","https://openalex.org/W6603374476","https://openalex.org/W6679909955","https://openalex.org/W6712847557","https://openalex.org/W6728841359","https://openalex.org/W6739366949","https://openalex.org/W6741874079"],"related_works":[],"abstract_inverted_index":{"Unsupervised":[0],"single-channel":[1],"overlapped":[2,144],"speech":[3,13,93,111,178,182],"recognition":[4,14],"is":[5,20,124],"one":[6],"of":[7,23,48,127,135,157],"the":[8,24,45,49,57,67,82,114,118,136,142,195],"hardest":[9],"problems":[10],"in":[11],"automatic":[12],"(ASR).":[15],"Permutation":[16],"invariant":[17],"training":[18,76,115,122,192],"(PIT)":[19],"a":[21,30,53,61,74,125,163,172],"state":[22,47],"art":[25,50],"model-based":[26],"approach,":[27],"which":[28],"applies":[29],"single":[31],"neural":[32,58],"network":[33],"to":[34,43,101,112],"solve":[35,102],"this":[36],"single-input,":[37],"multiple-output":[38],"modeling":[39],"problem.":[40],"We":[41],"propose":[42],"advance":[44],"current":[46],"by":[51],"imposing":[52],"modular":[54,79],"structure":[55,80],"on":[56,141],"network,":[59],"applying":[60],"progressive":[62],"pretraining":[63,96],"regimen,":[64],"and":[65,73,92,146,171,194],"improving":[66],"objective":[68],"function":[69],"with":[70,180],"transfer":[71],"learning":[72,107],"discriminative":[75,121],"criterion.":[77],"The":[78,95,149,185],"splits":[81],"problem":[83],"into":[84],"three":[85],"subtasks:":[86],"frame-wise":[87],"interpreting,":[88],"utterance-level":[89],"speaker":[90],"tracing,":[91],"recognition.":[94],"regimen":[97],"uses":[98],"these":[99],"modules":[100],"progressively":[103],"harder":[104],"tasks.":[105],"Transfer":[106],"leverages":[108],"parallel":[109],"clean":[110,181],"improve":[113],"targets":[116],"for":[117,169,177],"network.":[119],"Our":[120],"formulation":[123],"modification":[126],"standard":[128],"formulations":[129],"that":[130],"also":[131],"penalizes":[132],"competing":[133],"outputs":[134],"system.":[137],"Experiments":[138],"are":[139],"conducted":[140],"artificial":[143],"switchboard":[145],"hub5e-swb":[147],"dataset.":[148],"proposed":[150],"framework":[151],"achieves":[152],"over":[153,161],"30%":[154],"relative":[155],"improvement":[156,186],"word":[158],"error":[159],"rate":[160],"both":[162],"strong":[164],"jointly":[165],"trained":[166],"system,":[167,175],"PIT":[168,176],"ASR,":[170],"separately":[173],"optimized":[174],"separation":[179],"ASR":[183],"model.":[184],"comes":[187],"from":[188],"better":[189],"model":[190],"generalization,":[191],"efficiency,":[193],"sequence":[196],"level":[197],"linguistic":[198],"knowledge":[199],"integration.":[200]},"counts_by_year":[{"year":2025,"cited_by_count":1},{"year":2024,"cited_by_count":4},{"year":2023,"cited_by_count":4},{"year":2022,"cited_by_count":3},{"year":2021,"cited_by_count":4},{"year":2020,"cited_by_count":5},{"year":2019,"cited_by_count":7},{"year":2018,"cited_by_count":7}],"updated_date":"2026-03-20T23:20:44.827607","created_date":"2017-07-31T00:00:00"}
