{"id":"https://openalex.org/W2221409856","doi":"https://doi.org/10.1109/icassp.2016.7471631","title":"Deep clustering: Discriminative embeddings for segmentation and separation","display_name":"Deep clustering: Discriminative embeddings for segmentation and separation","publication_year":2016,"publication_date":"2016-03-01","ids":{"openalex":"https://openalex.org/W2221409856","doi":"https://doi.org/10.1109/icassp.2016.7471631","mag":"2221409856"},"language":"en","primary_location":{"id":"doi:10.1109/icassp.2016.7471631","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp.2016.7471631","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2016 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"preprint","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5112763337","display_name":"John R. Hershey","orcid":null},"institutions":[{"id":"https://openalex.org/I4210159266","display_name":"Mitsubishi Electric (United States)","ror":"https://ror.org/053jnhe44","country_code":"US","type":"company","lineage":["https://openalex.org/I1306287861","https://openalex.org/I4210133125","https://openalex.org/I4210159266"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"John R. Hershey","raw_affiliation_strings":["Mitsubishi Electric Research Laboratories (MERL), Cambridge, MA, USA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Mitsubishi Electric Research Laboratories (MERL), Cambridge, MA, USA","institution_ids":["https://openalex.org/I4210159266"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100345092","display_name":"Zhuo Chen","orcid":"https://orcid.org/0000-0002-9011-7928"},"institutions":[{"id":"https://openalex.org/I78577930","display_name":"Columbia University","ror":"https://ror.org/00hj8s172","country_code":"US","type":"education","lineage":["https://openalex.org/I78577930"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Zhuo Chen","raw_affiliation_strings":["Columbia University, New York, NY, USA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Columbia University, New York, NY, USA","institution_ids":["https://openalex.org/I78577930"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5076453358","display_name":"Jonathan Le Roux","orcid":"https://orcid.org/0000-0002-3451-171X"},"institutions":[{"id":"https://openalex.org/I4210159266","display_name":"Mitsubishi Electric (United States)","ror":"https://ror.org/053jnhe44","country_code":"US","type":"company","lineage":["https://openalex.org/I1306287861","https://openalex.org/I4210133125","https://openalex.org/I4210159266"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Jonathan Le Roux","raw_affiliation_strings":["Mitsubishi Electric Research Laboratories (MERL), Cambridge, MA, USA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Mitsubishi Electric Research Laboratories (MERL), Cambridge, MA, USA","institution_ids":["https://openalex.org/I4210159266"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5001291873","display_name":"Shinji Watanabe","orcid":"https://orcid.org/0000-0002-5970-8631"},"institutions":[{"id":"https://openalex.org/I4210159266","display_name":"Mitsubishi Electric (United States)","ror":"https://ror.org/053jnhe44","country_code":"US","type":"company","lineage":["https://openalex.org/I1306287861","https://openalex.org/I4210133125","https://openalex.org/I4210159266"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Shinji Watanabe","raw_affiliation_strings":["Mitsubishi Electric Research Laboratories (MERL), Cambridge, MA, USA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Mitsubishi Electric Research Laboratories (MERL), Cambridge, MA, USA","institution_ids":["https://openalex.org/I4210159266"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":93.9345,"has_fulltext":false,"cited_by_count":1427,"citation_normalized_percentile":{"value":0.99994137,"is_in_top_1_percent":true,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":90,"max":100},"biblio":{"volume":null,"issue":null,"first_page":"31","last_page":"35"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9975000023841858,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11447","display_name":"Blind Source Separation Techniques","score":0.9969000220298767,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/spectrogram","display_name":"Spectrogram","score":0.8284831643104553},{"id":"https://openalex.org/keywords/cluster-analysis","display_name":"Cluster analysis","score":0.7521916627883911},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6643806099891663},{"id":"https://openalex.org/keywords/spectral-clustering","display_name":"Spectral clustering","score":0.601268470287323},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.6004782915115356},{"id":"https://openalex.org/keywords/deep-learning","display_name":"Deep learning","score":0.5727255344390869},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.5596200227737427},{"id":"https://openalex.org/keywords/embedding","display_name":"Embedding","score":0.5484212636947632},{"id":"https://openalex.org/keywords/segmentation","display_name":"Segmentation","score":0.5342819690704346},{"id":"https://openalex.org/keywords/channel","display_name":"Channel (broadcasting)","score":0.44290637969970703},{"id":"https://openalex.org/keywords/discriminative-model","display_name":"Discriminative model","score":0.4217853248119354},{"id":"https://openalex.org/keywords/matrix","display_name":"Matrix (chemical analysis)","score":0.4117441177368164},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.36420828104019165}],"concepts":[{"id":"https://openalex.org/C45273575","wikidata":"https://www.wikidata.org/wiki/Q578970","display_name":"Spectrogram","level":2,"score":0.8284831643104553},{"id":"https://openalex.org/C73555534","wikidata":"https://www.wikidata.org/wiki/Q622825","display_name":"Cluster analysis","level":2,"score":0.7521916627883911},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6643806099891663},{"id":"https://openalex.org/C105611402","wikidata":"https://www.wikidata.org/wiki/Q2976589","display_name":"Spectral clustering","level":3,"score":0.601268470287323},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6004782915115356},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.5727255344390869},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.5596200227737427},{"id":"https://openalex.org/C41608201","wikidata":"https://www.wikidata.org/wiki/Q980509","display_name":"Embedding","level":2,"score":0.5484212636947632},{"id":"https://openalex.org/C89600930","wikidata":"https://www.wikidata.org/wiki/Q1423946","display_name":"Segmentation","level":2,"score":0.5342819690704346},{"id":"https://openalex.org/C127162648","wikidata":"https://www.wikidata.org/wiki/Q16858953","display_name":"Channel (broadcasting)","level":2,"score":0.44290637969970703},{"id":"https://openalex.org/C97931131","wikidata":"https://www.wikidata.org/wiki/Q5282087","display_name":"Discriminative model","level":2,"score":0.4217853248119354},{"id":"https://openalex.org/C106487976","wikidata":"https://www.wikidata.org/wiki/Q685816","display_name":"Matrix (chemical analysis)","level":2,"score":0.4117441177368164},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.36420828104019165},{"id":"https://openalex.org/C159985019","wikidata":"https://www.wikidata.org/wiki/Q181790","display_name":"Composite material","level":1,"score":0.0},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.0},{"id":"https://openalex.org/C192562407","wikidata":"https://www.wikidata.org/wiki/Q228736","display_name":"Materials science","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icassp.2016.7471631","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp.2016.7471631","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2016 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"score":0.75,"id":"https://metadata.un.org/sdg/10","display_name":"Reduced inequalities"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":47,"referenced_works":["https://openalex.org/W80444264","https://openalex.org/W160800111","https://openalex.org/W1513606575","https://openalex.org/W1550027367","https://openalex.org/W1748744376","https://openalex.org/W1790748249","https://openalex.org/W1897240248","https://openalex.org/W1964538581","https://openalex.org/W1989364685","https://openalex.org/W1989954041","https://openalex.org/W2022508996","https://openalex.org/W2060822897","https://openalex.org/W2069681747","https://openalex.org/W2075606107","https://openalex.org/W2078528584","https://openalex.org/W2079362249","https://openalex.org/W2088361146","https://openalex.org/W2094461119","https://openalex.org/W2105340328","https://openalex.org/W2116810533","https://openalex.org/W2120303002","https://openalex.org/W2121947440","https://openalex.org/W2124149378","https://openalex.org/W2127851351","https://openalex.org/W2129116669","https://openalex.org/W2147174722","https://openalex.org/W2149657498","https://openalex.org/W2168379380","https://openalex.org/W2187621885","https://openalex.org/W2221409856","https://openalex.org/W2405933695","https://openalex.org/W2950354455","https://openalex.org/W4235169531","https://openalex.org/W4256399001","https://openalex.org/W4285719527","https://openalex.org/W6630776181","https://openalex.org/W6633022694","https://openalex.org/W6637992904","https://openalex.org/W6668037159","https://openalex.org/W6672600125","https://openalex.org/W6676106683","https://openalex.org/W6677945368","https://openalex.org/W6679489070","https://openalex.org/W6681474505","https://openalex.org/W6684924409","https://openalex.org/W6686900938","https://openalex.org/W6688843265"],"related_works":["https://openalex.org/W2530685530","https://openalex.org/W4375868962","https://openalex.org/W2011227383","https://openalex.org/W2088854863","https://openalex.org/W4402568167","https://openalex.org/W3179495260","https://openalex.org/W1976719989","https://openalex.org/W3127543252","https://openalex.org/W2065606036","https://openalex.org/W2972477132"],"abstract_inverted_index":{"We":[0],"address":[1],"the":[2,74,81,85,89,103,113,125,129,133,141,177],"problem":[3],"of":[4,32,73,84,167,173],"\"cocktail-party\"":[5],"source":[6,49],"separation":[7,21],"in":[8,26,76,101,132],"a":[9,29,37,61,94,106,154],"deep":[10,14,17,62,95],"learning":[11],"framework":[12],"called":[13],"clustering.":[15],"Previous":[16],"network":[18,63],"approaches":[19],"to":[20,36,64,69,78,98,140],"have":[22],"shown":[23],"promising":[24],"performance":[25],"scenarios":[27],"with":[28,138,183],"fixed":[30],"number":[31],"sources,":[33],"each":[34,70],"belonging":[35],"distinct":[38],"signal":[39,163],"class,":[40],"such":[41],"as":[42],"speech":[43],"and":[44,51],"noise.":[45],"However,":[46],"for":[47,165],"arbitrary":[48],"classes":[50],"number,":[52],"\"class-based\"":[53],"methods":[54],"are":[55],"not":[56],"suitable.":[57],"Instead,":[58],"we":[59],"train":[60],"assign":[65],"contrastive":[66],"embedding":[67],"vectors":[68],"time-frequency":[71],"region":[72],"spectrogram":[75,87],"order":[77],"implicitly":[79],"predict":[80],"segmentation":[82,130],"labels":[83],"target":[86],"from":[88,149],"input":[90],"mixtures.":[91,185],"This":[92],"yields":[93],"network-based":[96],"analogue":[97],"spectral":[99],"clustering,":[100],"that":[102,111,153],"embeddings":[104,134],"form":[105],"low-rank":[107],"pair-wise":[108],"affinity":[109,115],"matrix":[110],"approximates":[112],"ideal":[114],"matrix,":[116],"while":[117],"enabling":[118],"much":[119],"faster":[120],"performance.":[121],"At":[122],"test":[123],"time,":[124],"clustering":[126],"step":[127],"\"decodes\"":[128],"implicit":[131],"by":[135,170],"optimizing":[136],"K-means":[137],"respect":[139],"unknown":[142],"assignments.":[143],"Preliminary":[144],"experiments":[145],"on":[146,158],"single-channel":[147],"mixtures":[148,160,166],"multiple":[150],"speakers":[151,169],"show":[152],"speaker-independent":[155],"model":[156,179],"trained":[157],"two-speaker":[159],"can":[161],"improve":[162],"quality":[164],"held-out":[168],"an":[171],"average":[172],"6dB.":[174],"More":[175],"dramatically,":[176],"same":[178],"does":[180],"surprisingly":[181],"well":[182],"three-speaker":[184]},"counts_by_year":[{"year":2026,"cited_by_count":26},{"year":2025,"cited_by_count":105},{"year":2024,"cited_by_count":101},{"year":2023,"cited_by_count":148},{"year":2022,"cited_by_count":160},{"year":2021,"cited_by_count":240},{"year":2020,"cited_by_count":279},{"year":2019,"cited_by_count":213},{"year":2018,"cited_by_count":101},{"year":2017,"cited_by_count":47},{"year":2016,"cited_by_count":6},{"year":2012,"cited_by_count":1}],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-10T00:00:00"}
