{"id":"https://openalex.org/W4224918535","doi":"https://doi.org/10.1109/icassp43922.2022.9746439","title":"Attentionpit: Soft Permutation Invariant Training for Audio Source Separation with Attention Mechanism","display_name":"Attentionpit: Soft Permutation Invariant Training for Audio Source Separation with Attention Mechanism","publication_year":2022,"publication_date":"2022-04-27","ids":{"openalex":"https://openalex.org/W4224918535","doi":"https://doi.org/10.1109/icassp43922.2022.9746439"},"language":"en","primary_location":{"id":"doi:10.1109/icassp43922.2022.9746439","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp43922.2022.9746439","pdf_url":null,"source":{"id":"https://openalex.org/S4363607702","display_name":"ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5001243214","display_name":"Hirokazu Kameoka","orcid":"https://orcid.org/0000-0003-3102-0162"},"institutions":[{"id":"https://openalex.org/I4210092597","display_name":"NTT (United States)","ror":"https://ror.org/004cn7092","country_code":"US","type":"company","lineage":["https://openalex.org/I2251713219","https://openalex.org/I4210092597"]},{"id":"https://openalex.org/I2251713219","display_name":"NTT (Japan)","ror":"https://ror.org/00berct97","country_code":"JP","type":"company","lineage":["https://openalex.org/I2251713219"]}],"countries":["JP","US"],"is_corresponding":true,"raw_author_name":"Hirokazu Kameoka","raw_affiliation_strings":["Nippon Telegraph and Telephone Corporation,NTT Communication Science Laboratories","NTT Communication Science Laboratories, Nippon Telegraph and Telephone Corporation"],"affiliations":[{"raw_affiliation_string":"Nippon Telegraph and Telephone Corporation,NTT Communication Science Laboratories","institution_ids":["https://openalex.org/I2251713219","https://openalex.org/I4210092597"]},{"raw_affiliation_string":"NTT Communication Science Laboratories, Nippon Telegraph and Telephone Corporation","institution_ids":["https://openalex.org/I2251713219"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101792863","display_name":"Shogo Seki","orcid":"https://orcid.org/0000-0001-8284-188X"},"institutions":[{"id":"https://openalex.org/I2251713219","display_name":"NTT (Japan)","ror":"https://ror.org/00berct97","country_code":"JP","type":"company","lineage":["https://openalex.org/I2251713219"]},{"id":"https://openalex.org/I4210092597","display_name":"NTT (United States)","ror":"https://ror.org/004cn7092","country_code":"US","type":"company","lineage":["https://openalex.org/I2251713219","https://openalex.org/I4210092597"]}],"countries":["JP","US"],"is_corresponding":false,"raw_author_name":"Shogo Seki","raw_affiliation_strings":["Nippon Telegraph and Telephone Corporation,NTT Communication Science Laboratories","NTT Communication Science Laboratories, Nippon Telegraph and Telephone Corporation"],"affiliations":[{"raw_affiliation_string":"Nippon Telegraph and Telephone Corporation,NTT Communication Science Laboratories","institution_ids":["https://openalex.org/I2251713219","https://openalex.org/I4210092597"]},{"raw_affiliation_string":"NTT Communication Science Laboratories, Nippon Telegraph and Telephone Corporation","institution_ids":["https://openalex.org/I2251713219"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100361230","display_name":"Li Li","orcid":"https://orcid.org/0000-0002-7163-6263"},"institutions":[{"id":"https://openalex.org/I4210092597","display_name":"NTT (United States)","ror":"https://ror.org/004cn7092","country_code":"US","type":"company","lineage":["https://openalex.org/I2251713219","https://openalex.org/I4210092597"]},{"id":"https://openalex.org/I2251713219","display_name":"NTT (Japan)","ror":"https://ror.org/00berct97","country_code":"JP","type":"company","lineage":["https://openalex.org/I2251713219"]}],"countries":["JP","US"],"is_corresponding":false,"raw_author_name":"Li Li","raw_affiliation_strings":["Nippon Telegraph and Telephone Corporation,NTT Communication Science Laboratories","NTT Communication Science Laboratories, Nippon Telegraph and Telephone Corporation"],"affiliations":[{"raw_affiliation_string":"Nippon Telegraph and Telephone Corporation,NTT Communication Science Laboratories","institution_ids":["https://openalex.org/I2251713219","https://openalex.org/I4210092597"]},{"raw_affiliation_string":"NTT Communication Science Laboratories, Nippon Telegraph and Telephone Corporation","institution_ids":["https://openalex.org/I2251713219"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5006890326","display_name":"Chihiro Watanabe","orcid":"https://orcid.org/0000-0002-6676-5685"},"institutions":[{"id":"https://openalex.org/I2251713219","display_name":"NTT (Japan)","ror":"https://ror.org/00berct97","country_code":"JP","type":"company","lineage":["https://openalex.org/I2251713219"]},{"id":"https://openalex.org/I4210092597","display_name":"NTT (United States)","ror":"https://ror.org/004cn7092","country_code":"US","type":"company","lineage":["https://openalex.org/I2251713219","https://openalex.org/I4210092597"]}],"countries":["JP","US"],"is_corresponding":false,"raw_author_name":"Chihiro Watanabe","raw_affiliation_strings":["Nippon Telegraph and Telephone Corporation,NTT Communication Science Laboratories","NTT Communication Science Laboratories, Nippon Telegraph and Telephone Corporation"],"affiliations":[{"raw_affiliation_string":"Nippon Telegraph and Telephone Corporation,NTT Communication Science Laboratories","institution_ids":["https://openalex.org/I2251713219","https://openalex.org/I4210092597"]},{"raw_affiliation_string":"NTT Communication Science Laboratories, Nippon Telegraph and Telephone Corporation","institution_ids":["https://openalex.org/I2251713219"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5001243214"],"corresponding_institution_ids":["https://openalex.org/I2251713219","https://openalex.org/I4210092597"],"apc_list":null,"apc_paid":null,"fwci":0.2455,"has_fulltext":false,"cited_by_count":2,"citation_normalized_percentile":{"value":0.34439179,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":89,"max":95},"biblio":{"volume":null,"issue":null,"first_page":"706","last_page":"710"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9997000098228455,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9988999962806702,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6964504718780518},{"id":"https://openalex.org/keywords/permutation","display_name":"Permutation (music)","score":0.6426193118095398},{"id":"https://openalex.org/keywords/differentiable-function","display_name":"Differentiable function","score":0.5742525458335876},{"id":"https://openalex.org/keywords/algorithm","display_name":"Algorithm","score":0.5419799089431763},{"id":"https://openalex.org/keywords/invariant","display_name":"Invariant (physics)","score":0.5294484496116638},{"id":"https://openalex.org/keywords/backpropagation","display_name":"Backpropagation","score":0.4657619297504425},{"id":"https://openalex.org/keywords/lti-system-theory","display_name":"LTI system theory","score":0.45386362075805664},{"id":"https://openalex.org/keywords/process","display_name":"Process (computing)","score":0.4437454342842102},{"id":"https://openalex.org/keywords/artificial-neural-network","display_name":"Artificial neural network","score":0.4254317879676819},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.37173134088516235},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.32687926292419434},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.1717328131198883},{"id":"https://openalex.org/keywords/linear-system","display_name":"Linear system","score":0.16122913360595703}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6964504718780518},{"id":"https://openalex.org/C21308566","wikidata":"https://www.wikidata.org/wiki/Q7169365","display_name":"Permutation (music)","level":2,"score":0.6426193118095398},{"id":"https://openalex.org/C202615002","wikidata":"https://www.wikidata.org/wiki/Q783507","display_name":"Differentiable function","level":2,"score":0.5742525458335876},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.5419799089431763},{"id":"https://openalex.org/C190470478","wikidata":"https://www.wikidata.org/wiki/Q2370229","display_name":"Invariant (physics)","level":2,"score":0.5294484496116638},{"id":"https://openalex.org/C155032097","wikidata":"https://www.wikidata.org/wiki/Q798503","display_name":"Backpropagation","level":3,"score":0.4657619297504425},{"id":"https://openalex.org/C87698059","wikidata":"https://www.wikidata.org/wiki/Q1808960","display_name":"LTI system theory","level":3,"score":0.45386362075805664},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.4437454342842102},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.4254317879676819},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.37173134088516235},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.32687926292419434},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.1717328131198883},{"id":"https://openalex.org/C6802819","wikidata":"https://www.wikidata.org/wiki/Q1072174","display_name":"Linear system","level":2,"score":0.16122913360595703},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0},{"id":"https://openalex.org/C24890656","wikidata":"https://www.wikidata.org/wiki/Q82811","display_name":"Acoustics","level":1,"score":0.0},{"id":"https://openalex.org/C37914503","wikidata":"https://www.wikidata.org/wiki/Q156495","display_name":"Mathematical physics","level":1,"score":0.0},{"id":"https://openalex.org/C134306372","wikidata":"https://www.wikidata.org/wiki/Q7754","display_name":"Mathematical analysis","level":1,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icassp43922.2022.9746439","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp43922.2022.9746439","pdf_url":null,"source":{"id":"https://openalex.org/S4363607702","display_name":"ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":26,"referenced_works":["https://openalex.org/W1494198834","https://openalex.org/W1522301498","https://openalex.org/W1902237438","https://openalex.org/W2141461755","https://openalex.org/W2163318198","https://openalex.org/W2221409856","https://openalex.org/W2222512263","https://openalex.org/W2460742184","https://openalex.org/W2558649592","https://openalex.org/W2734774145","https://openalex.org/W2952218014","https://openalex.org/W2962905190","https://openalex.org/W2962935966","https://openalex.org/W2972864514","https://openalex.org/W3015199127","https://openalex.org/W3027008958","https://openalex.org/W3042857426","https://openalex.org/W3094380169","https://openalex.org/W3094607766","https://openalex.org/W3095717210","https://openalex.org/W3099330747","https://openalex.org/W3197823486","https://openalex.org/W4385245566","https://openalex.org/W6631190155","https://openalex.org/W6739901393","https://openalex.org/W6777776875"],"related_works":["https://openalex.org/W4285277090","https://openalex.org/W4327738859","https://openalex.org/W4239286941","https://openalex.org/W2088845016","https://openalex.org/W589102260","https://openalex.org/W1966421350","https://openalex.org/W1868434454","https://openalex.org/W4366985237","https://openalex.org/W2952384777","https://openalex.org/W3169827016"],"abstract_inverted_index":{"Permutation":[0],"invariant":[1],"training":[2,176],"(PIT)":[3],"has":[4,90],"recently":[5,165],"attracted":[6],"attention":[7,142],"as":[8,31,102,162,171],"a":[9,23,28,91],"framework":[10],"to":[11,21,54,113,122,144,185,190],"achieve":[12],"end-to-end":[13],"time-domain":[14],"audio":[15],"source":[16,38],"separation.":[17],"Its":[18],"goal":[19],"is":[20,47,53,87,108,111,180,228],"train":[22],"separation":[24,150],"network":[25,65,151,195],"that":[26,73,88,109,206,226],"takes":[27],"mixture":[29],"signal":[30],"input":[32],"and":[33,61,105,153,173,194,223],"produces":[34],"the":[35,41,44,49,57,64,69,106,123,164,202,232],"J":[36,103],"underlying":[37],"signals.":[39],"Since":[40],"order":[42],"of":[43,51,94,178,235],"output":[45],"signals":[46],"arbitrary,":[48],"idea":[50],"PIT":[52,168],"first":[55],"find":[56,145],"best":[58],"output-target":[59,125,147,192],"assignment":[60,74,126,193],"then":[62],"update":[63,197],"parameters":[66],"based":[67],"on":[68,201,212],"error":[70],"given":[71],"by":[72],"at":[75],"each":[76],"iteration.":[77],"However,":[78],"there":[79],"are":[80],"two":[81],"known":[82],"problems":[83,131],"with":[84,163,221],"PIT:":[85],"One":[86],"it":[89,100,110,215],"time":[92,159],"complexity":[93],"$\\mathcal{O}\\left(":[95],"{J!}":[96],"\\right)$,":[97],"which":[98,139],"makes":[99],"infeasible":[101],"increases,":[104],"other":[107],"prone":[112],"getting":[114],"stuck":[115],"in":[116,133,157,160,231],"bad":[117],"local":[118],"optimal":[119],"solutions":[120],"due":[121],"hard":[124],"process.":[127],"To":[128],"overcome":[129],"these":[130],"simultaneously,":[132],"this":[134],"paper,":[135],"we":[136],"propose":[137],"AttentionPIT,":[138],"uses":[140],"an":[141],"mechanism":[143],"soft":[146,191],"assignments":[148],"for":[149],"training,":[152],"can":[154],"be":[155],"run":[156,229],"polynomial":[158],"J,":[161],"proposed":[166],"fast":[167],"variants":[169],"such":[170],"SinkPIT":[172,222],"HungarianPIT.":[174],"The":[175],"loss":[177],"AttentionPIT":[179,208,227],"fully":[181],"differentiable,":[182],"allowing":[183],"us":[184],"simultaneously":[186],"perform":[187],"processes":[188],"corresponding":[189],"parameter":[196],"through":[198],"backpropagation.":[199],"Experiments":[200],"LibriMix":[203],"corpus":[204],"revealed":[205],"while":[207],"works":[209,216],"reasonably":[210],"well":[211],"its":[213],"own,":[214],"even":[217],"better":[218],"when":[219],"combined":[220],"HungarianPIT":[224],"so":[225],"only":[230],"early":[233],"stages":[234],"training.":[236]},"counts_by_year":[{"year":2025,"cited_by_count":1},{"year":2023,"cited_by_count":1}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
