{"id":"https://openalex.org/W4226329369","doi":"https://doi.org/10.21437/interspeech.2022-433","title":"RaDur: A Reference-aware and Duration-robust Network for Target Sound Detection","display_name":"RaDur: A Reference-aware and Duration-robust Network for Target Sound Detection","publication_year":2022,"publication_date":"2022-09-16","ids":{"openalex":"https://openalex.org/W4226329369","doi":"https://doi.org/10.21437/interspeech.2022-433"},"language":"en","primary_location":{"id":"doi:10.21437/interspeech.2022-433","is_oa":false,"landing_page_url":"https://doi.org/10.21437/interspeech.2022-433","pdf_url":null,"source":{"id":"https://openalex.org/S4363604309","display_name":"Interspeech 2022","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Interspeech 2022","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5043555011","display_name":"Dongchao Yang","orcid":"https://orcid.org/0000-0002-8905-224X"},"institutions":[{"id":"https://openalex.org/I20231570","display_name":"Peking University","ror":"https://ror.org/02v51f717","country_code":"CN","type":"education","lineage":["https://openalex.org/I20231570"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Dongchao Yang","raw_affiliation_strings":["ADSPLAB, School of ECE, Peking University, Shenzhen, China"],"affiliations":[{"raw_affiliation_string":"ADSPLAB, School of ECE, Peking University, Shenzhen, China","institution_ids":["https://openalex.org/I20231570"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101664824","display_name":"Helin Wang","orcid":"https://orcid.org/0000-0001-6088-0378"},"institutions":[{"id":"https://openalex.org/I20231570","display_name":"Peking University","ror":"https://ror.org/02v51f717","country_code":"CN","type":"education","lineage":["https://openalex.org/I20231570"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Helin Wang","raw_affiliation_strings":["ADSPLAB, School of ECE, Peking University, Shenzhen, China"],"affiliations":[{"raw_affiliation_string":"ADSPLAB, School of ECE, Peking University, Shenzhen, China","institution_ids":["https://openalex.org/I20231570"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5038853796","display_name":"Zhongjie Ye","orcid":"https://orcid.org/0000-0003-0306-5267"},"institutions":[{"id":"https://openalex.org/I20231570","display_name":"Peking University","ror":"https://ror.org/02v51f717","country_code":"CN","type":"education","lineage":["https://openalex.org/I20231570"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhongjie Ye","raw_affiliation_strings":["ADSPLAB, School of ECE, Peking University, Shenzhen, China"],"affiliations":[{"raw_affiliation_string":"ADSPLAB, School of ECE, Peking University, Shenzhen, China","institution_ids":["https://openalex.org/I20231570"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5034401157","display_name":"Yuexian Zou","orcid":"https://orcid.org/0000-0002-0144-1794"},"institutions":[{"id":"https://openalex.org/I20231570","display_name":"Peking University","ror":"https://ror.org/02v51f717","country_code":"CN","type":"education","lineage":["https://openalex.org/I20231570"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yuexian Zou","raw_affiliation_strings":["ADSPLAB, School of ECE, Peking University, Shenzhen, China"],"affiliations":[{"raw_affiliation_string":"ADSPLAB, School of ECE, Peking University, Shenzhen, China","institution_ids":["https://openalex.org/I20231570"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100676721","display_name":"Wenwu Wang","orcid":"https://orcid.org/0000-0002-8393-5703"},"institutions":[{"id":"https://openalex.org/I28290843","display_name":"University of Surrey","ror":"https://ror.org/00ks66431","country_code":"GB","type":"education","lineage":["https://openalex.org/I28290843"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"WenWu Wang","raw_affiliation_strings":["Center for Vision, Speech and Signal Processing, University of Surrey, UK"],"affiliations":[{"raw_affiliation_string":"Center for Vision, Speech and Signal Processing, University of Surrey, UK","institution_ids":["https://openalex.org/I28290843"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5043555011"],"corresponding_institution_ids":["https://openalex.org/I20231570"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.0217186,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":96,"max":98},"biblio":{"volume":null,"issue":null,"first_page":"1511","last_page":"1515"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9995999932289124,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10283","display_name":"Hearing Loss and Rehabilitation","score":0.989300012588501,"subfield":{"id":"https://openalex.org/subfields/2805","display_name":"Cognitive Neuroscience"},"field":{"id":"https://openalex.org/fields/28","display_name":"Neuroscience"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7658957839012146},{"id":"https://openalex.org/keywords/duration","display_name":"Duration (music)","score":0.7392706274986267},{"id":"https://openalex.org/keywords/pooling","display_name":"Pooling","score":0.682121217250824},{"id":"https://openalex.org/keywords/embedding","display_name":"Embedding","score":0.6216375827789307},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.4879550337791443},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.46401819586753845},{"id":"https://openalex.org/keywords/reference-frame","display_name":"Reference frame","score":0.4184437096118927},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.33076098561286926},{"id":"https://openalex.org/keywords/frame","display_name":"Frame (networking)","score":0.21362152695655823},{"id":"https://openalex.org/keywords/telecommunications","display_name":"Telecommunications","score":0.07169514894485474},{"id":"https://openalex.org/keywords/acoustics","display_name":"Acoustics","score":0.0702936053276062}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7658957839012146},{"id":"https://openalex.org/C112758219","wikidata":"https://www.wikidata.org/wiki/Q16038819","display_name":"Duration (music)","level":2,"score":0.7392706274986267},{"id":"https://openalex.org/C70437156","wikidata":"https://www.wikidata.org/wiki/Q7228652","display_name":"Pooling","level":2,"score":0.682121217250824},{"id":"https://openalex.org/C41608201","wikidata":"https://www.wikidata.org/wiki/Q980509","display_name":"Embedding","level":2,"score":0.6216375827789307},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.4879550337791443},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.46401819586753845},{"id":"https://openalex.org/C172849965","wikidata":"https://www.wikidata.org/wiki/Q3148875","display_name":"Reference frame","level":3,"score":0.4184437096118927},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.33076098561286926},{"id":"https://openalex.org/C126042441","wikidata":"https://www.wikidata.org/wiki/Q1324888","display_name":"Frame (networking)","level":2,"score":0.21362152695655823},{"id":"https://openalex.org/C76155785","wikidata":"https://www.wikidata.org/wiki/Q418","display_name":"Telecommunications","level":1,"score":0.07169514894485474},{"id":"https://openalex.org/C24890656","wikidata":"https://www.wikidata.org/wiki/Q82811","display_name":"Acoustics","level":1,"score":0.0702936053276062},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.21437/interspeech.2022-433","is_oa":false,"landing_page_url":"https://doi.org/10.21437/interspeech.2022-433","pdf_url":null,"source":{"id":"https://openalex.org/S4363604309","display_name":"Interspeech 2022","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Interspeech 2022","raw_type":"proceedings-article"},{"id":"pmh:oai:alma.44SUR_INST:11166009780002346","is_oa":false,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4210197018","display_name":"View","issn_l":"2688-268X","issn":["2688-268X","2688-3988"],"is_oa":false,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310320595","host_organization_name":"Wiley","host_organization_lineage":["https://openalex.org/P4310320595"],"host_organization_lineage_names":["Wiley"],"type":"journal"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":""}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Reduced inequalities","id":"https://metadata.un.org/sdg/10","score":0.75}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":31,"referenced_works":["https://openalex.org/W1522301498","https://openalex.org/W1665214252","https://openalex.org/W2038484192","https://openalex.org/W2408239454","https://openalex.org/W2526050071","https://openalex.org/W2593116425","https://openalex.org/W2884561390","https://openalex.org/W2937959246","https://openalex.org/W2951130829","https://openalex.org/W2963610932","https://openalex.org/W2963723765","https://openalex.org/W2964891022","https://openalex.org/W2973062255","https://openalex.org/W3015205011","https://openalex.org/W3017521796","https://openalex.org/W3049446265","https://openalex.org/W3094550259","https://openalex.org/W3095263845","https://openalex.org/W3095753917","https://openalex.org/W3117290926","https://openalex.org/W3124216180","https://openalex.org/W3162534564","https://openalex.org/W3162999565","https://openalex.org/W3178592608","https://openalex.org/W3197762568","https://openalex.org/W3198575250","https://openalex.org/W3215562486","https://openalex.org/W4226419874","https://openalex.org/W4294620492","https://openalex.org/W4295308317","https://openalex.org/W4309416467"],"related_works":["https://openalex.org/W2953234277","https://openalex.org/W2626256601","https://openalex.org/W147410782","https://openalex.org/W2900413183","https://openalex.org/W4390975304","https://openalex.org/W3022252430","https://openalex.org/W4287804464","https://openalex.org/W3103989898","https://openalex.org/W3211292372","https://openalex.org/W803346624"],"abstract_inverted_index":{"Target":[0],"sound":[1,9,40],"detection":[2],"(TSD)":[3],"aims":[4],"to":[5,23,36,65,95,111,127,149],"detect":[6,37],"the":[7,15,29,38,42,45,97,102,115,120,124,129,137,169],"target":[8,39,132],"from":[10,28,41],"a":[11,20,25,84,143],"mixture":[12,43,116],"audio":[13,117],"given":[14],"reference":[16,30,53,61,103],"information.Previous":[17],"methods":[18],"use":[19,34],"conditional":[21],"network":[22,46,88,98],"extract":[24],"sounddiscriminative":[26],"embedding":[27,108],"audio,":[31],"and":[32,59,63,86,122,135,165],"then":[33],"it":[35],"audio.However,":[44],"performs":[47],"much":[48],"differently":[49],"when":[50],"using":[51],"different":[52],"audios":[54],"(e.g.performs":[55],"poorly":[56],"for":[57,69,90],"noisy":[58,140],"shortduration":[60],"audios),":[62],"tends":[64],"make":[66,96],"wrong":[67],"decisions":[68],"transient":[70],"events":[71],"(i.e.shorter":[72],"than":[73],"1":[74],"second).To":[75],"overcome":[76],"these":[77],"problems,":[78],"in":[79,93],"this":[80],"paper,":[81],"we":[82,105,157],"present":[83],"reference-aware":[85],"duration-robust":[87,144],"(RaDur)":[89],"TSD.More":[91],"specifically,":[92],"order":[94],"more":[99],"aware":[100],"of":[101,131,139,171],"information,":[104],"propose":[106],"an":[107],"enhancement":[109],"module":[110],"take":[112],"into":[113],"account":[114],"while":[118],"generating":[119],"embedding,":[121],"apply":[123],"attention":[125],"pooling":[126],"enhance":[128],"features":[130,138],"sound-related":[133],"frames":[134],"weaken":[136],"frames.In":[141],"addition,":[142],"focal":[145],"loss":[146],"is":[147],"proposed":[148],"help":[150],"model":[151],"different-duration":[152],"events.To":[153],"evaluate":[154],"our":[155,172],"method,":[156],"build":[158],"two":[159],"TSD":[160],"datasets":[161],"based":[162],"on":[163],"UrbanSound":[164],"Audioset.Extensive":[166],"experiments":[167],"show":[168],"effectiveness":[170],"methods.":[173]},"counts_by_year":[{"year":2026,"cited_by_count":1}],"updated_date":"2026-04-23T09:07:50.710637","created_date":"2025-10-10T00:00:00"}
