{"id":"https://openalex.org/W4319780182","doi":"https://doi.org/10.1109/iscslp57327.2022.10037995","title":"Deep Learning Based Audio-Visual Multi-Speaker DOA Estimation Using Permutation-Free Loss Function","display_name":"Deep Learning Based Audio-Visual Multi-Speaker DOA Estimation Using Permutation-Free Loss Function","publication_year":2022,"publication_date":"2022-12-11","ids":{"openalex":"https://openalex.org/W4319780182","doi":"https://doi.org/10.1109/iscslp57327.2022.10037995"},"language":"en","primary_location":{"id":"doi:10.1109/iscslp57327.2022.10037995","is_oa":false,"landing_page_url":"http://dx.doi.org/10.1109/iscslp57327.2022.10037995","pdf_url":null,"source":{"id":"https://openalex.org/S4363607181","display_name":"2022 13th International Symposium on Chinese Spoken Language Processing (ISCSLP)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2022 13th International Symposium on Chinese Spoken Language Processing (ISCSLP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100434900","display_name":"Qing Wang","orcid":"https://orcid.org/0000-0003-3843-3920"},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Qing Wang","raw_affiliation_strings":["University of Science and Technology of China,Hefei,China","University of Science and Technology of China, Hefei, China"],"affiliations":[{"raw_affiliation_string":"University of Science and Technology of China,Hefei,China","institution_ids":["https://openalex.org/I126520041"]},{"raw_affiliation_string":"University of Science and Technology of China, Hefei, China","institution_ids":["https://openalex.org/I126520041"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100442517","display_name":"Hang Chen","orcid":"https://orcid.org/0009-0008-7585-033X"},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Hang Chen","raw_affiliation_strings":["University of Science and Technology of China,Hefei,China","University of Science and Technology of China, Hefei, China"],"affiliations":[{"raw_affiliation_string":"University of Science and Technology of China,Hefei,China","institution_ids":["https://openalex.org/I126520041"]},{"raw_affiliation_string":"University of Science and Technology of China, Hefei, China","institution_ids":["https://openalex.org/I126520041"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5038529708","display_name":"Ya Jiang","orcid":"https://orcid.org/0000-0003-1733-8887"},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Ya Jiang","raw_affiliation_strings":["University of Science and Technology of China,Hefei,China","University of Science and Technology of China, Hefei, China"],"affiliations":[{"raw_affiliation_string":"University of Science and Technology of China,Hefei,China","institution_ids":["https://openalex.org/I126520041"]},{"raw_affiliation_string":"University of Science and Technology of China, Hefei, China","institution_ids":["https://openalex.org/I126520041"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100407584","display_name":"Zhe Wang","orcid":"https://orcid.org/0000-0001-9173-0022"},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhe Wang","raw_affiliation_strings":["University of Science and Technology of China,Hefei,China","University of Science and Technology of China, Hefei, China"],"affiliations":[{"raw_affiliation_string":"University of Science and Technology of China,Hefei,China","institution_ids":["https://openalex.org/I126520041"]},{"raw_affiliation_string":"University of Science and Technology of China, Hefei, China","institution_ids":["https://openalex.org/I126520041"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100409330","display_name":"Yuyang Wang","orcid":"https://orcid.org/0000-0003-0242-8935"},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yuyang Wang","raw_affiliation_strings":["University of Science and Technology of China,Hefei,China","University of Science and Technology of China, Hefei, China"],"affiliations":[{"raw_affiliation_string":"University of Science and Technology of China,Hefei,China","institution_ids":["https://openalex.org/I126520041"]},{"raw_affiliation_string":"University of Science and Technology of China, Hefei, China","institution_ids":["https://openalex.org/I126520041"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5066595711","display_name":"Jun Du","orcid":"https://orcid.org/0000-0002-2387-0389"},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jun Du","raw_affiliation_strings":["University of Science and Technology of China,Hefei,China","University of Science and Technology of China, Hefei, China"],"affiliations":[{"raw_affiliation_string":"University of Science and Technology of China,Hefei,China","institution_ids":["https://openalex.org/I126520041"]},{"raw_affiliation_string":"University of Science and Technology of China, Hefei, China","institution_ids":["https://openalex.org/I126520041"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5066868860","display_name":"Chin\u2010Hui Lee","orcid":"https://orcid.org/0000-0002-1892-2551"},"institutions":[{"id":"https://openalex.org/I130701444","display_name":"Georgia Institute of Technology","ror":"https://ror.org/01zkghx44","country_code":"US","type":"education","lineage":["https://openalex.org/I130701444"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Chin-Hui Lee","raw_affiliation_strings":["Georgia Institute of Technology,Atlanta,GA,USA","Georgia Institute of Technology, Atlanta, GA, USA"],"affiliations":[{"raw_affiliation_string":"Georgia Institute of Technology,Atlanta,GA,USA","institution_ids":["https://openalex.org/I130701444"]},{"raw_affiliation_string":"Georgia Institute of Technology, Atlanta, GA, USA","institution_ids":["https://openalex.org/I130701444"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5100434900"],"corresponding_institution_ids":["https://openalex.org/I126520041"],"apc_list":null,"apc_paid":null,"fwci":0.3677,"has_fulltext":false,"cited_by_count":3,"citation_normalized_percentile":{"value":0.55545024,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":89,"max":96},"biblio":{"volume":null,"issue":null,"first_page":"250","last_page":"254"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9948999881744385,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11447","display_name":"Blind Source Separation Techniques","score":0.9890000224113464,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8286714553833008},{"id":"https://openalex.org/keywords/permutation","display_name":"Permutation (music)","score":0.5956786274909973},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.5388822555541992},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.5293054580688477},{"id":"https://openalex.org/keywords/margin","display_name":"Margin (machine learning)","score":0.5031251311302185},{"id":"https://openalex.org/keywords/speaker-diarisation","display_name":"Speaker diarisation","score":0.4936274290084839},{"id":"https://openalex.org/keywords/transformation","display_name":"Transformation (genetics)","score":0.4405689239501953},{"id":"https://openalex.org/keywords/direction-of-arrival","display_name":"Direction of arrival","score":0.43886107206344604},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.41632890701293945},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.39070218801498413},{"id":"https://openalex.org/keywords/computer-vision","display_name":"Computer vision","score":0.38360071182250977},{"id":"https://openalex.org/keywords/speaker-recognition","display_name":"Speaker recognition","score":0.34327274560928345},{"id":"https://openalex.org/keywords/machine-learning","display_name":"Machine learning","score":0.12130960822105408}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8286714553833008},{"id":"https://openalex.org/C21308566","wikidata":"https://www.wikidata.org/wiki/Q7169365","display_name":"Permutation (music)","level":2,"score":0.5956786274909973},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5388822555541992},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5293054580688477},{"id":"https://openalex.org/C774472","wikidata":"https://www.wikidata.org/wiki/Q6760393","display_name":"Margin (machine learning)","level":2,"score":0.5031251311302185},{"id":"https://openalex.org/C149838564","wikidata":"https://www.wikidata.org/wiki/Q7574248","display_name":"Speaker diarisation","level":3,"score":0.4936274290084839},{"id":"https://openalex.org/C204241405","wikidata":"https://www.wikidata.org/wiki/Q461499","display_name":"Transformation (genetics)","level":3,"score":0.4405689239501953},{"id":"https://openalex.org/C172051844","wikidata":"https://www.wikidata.org/wiki/Q5280438","display_name":"Direction of arrival","level":3,"score":0.43886107206344604},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.41632890701293945},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.39070218801498413},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.38360071182250977},{"id":"https://openalex.org/C133892786","wikidata":"https://www.wikidata.org/wiki/Q1145189","display_name":"Speaker recognition","level":2,"score":0.34327274560928345},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.12130960822105408},{"id":"https://openalex.org/C21822782","wikidata":"https://www.wikidata.org/wiki/Q131214","display_name":"Antenna (radio)","level":2,"score":0.0},{"id":"https://openalex.org/C55493867","wikidata":"https://www.wikidata.org/wiki/Q7094","display_name":"Biochemistry","level":1,"score":0.0},{"id":"https://openalex.org/C24890656","wikidata":"https://www.wikidata.org/wiki/Q82811","display_name":"Acoustics","level":1,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C104317684","wikidata":"https://www.wikidata.org/wiki/Q7187","display_name":"Gene","level":2,"score":0.0},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.0},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.0},{"id":"https://openalex.org/C76155785","wikidata":"https://www.wikidata.org/wiki/Q418","display_name":"Telecommunications","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/iscslp57327.2022.10037995","is_oa":false,"landing_page_url":"http://dx.doi.org/10.1109/iscslp57327.2022.10037995","pdf_url":null,"source":{"id":"https://openalex.org/S4363607181","display_name":"2022 13th International Symposium on Chinese Spoken Language Processing (ISCSLP)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2022 13th International Symposium on Chinese Spoken Language Processing (ISCSLP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":34,"referenced_works":["https://openalex.org/W1635512741","https://openalex.org/W2033819227","https://openalex.org/W2046317813","https://openalex.org/W2085365248","https://openalex.org/W2104422351","https://openalex.org/W2113638573","https://openalex.org/W2128131274","https://openalex.org/W2129821199","https://openalex.org/W2130357996","https://openalex.org/W2132605602","https://openalex.org/W2194775991","https://openalex.org/W2222512263","https://openalex.org/W2466975593","https://openalex.org/W2570465105","https://openalex.org/W2603203130","https://openalex.org/W2763188033","https://openalex.org/W2772736377","https://openalex.org/W2897361856","https://openalex.org/W2917254586","https://openalex.org/W2940285530","https://openalex.org/W2963680395","https://openalex.org/W2964109005","https://openalex.org/W2964342924","https://openalex.org/W3016011581","https://openalex.org/W3105684258","https://openalex.org/W3132182240","https://openalex.org/W3161541317","https://openalex.org/W3162475350","https://openalex.org/W3168662520","https://openalex.org/W3172472082","https://openalex.org/W3177241885","https://openalex.org/W3213716738","https://openalex.org/W4224933780","https://openalex.org/W4324116353"],"related_works":["https://openalex.org/W2206035908","https://openalex.org/W2162158162","https://openalex.org/W4247736853","https://openalex.org/W1493012537","https://openalex.org/W2175373321","https://openalex.org/W2125642021","https://openalex.org/W4310979479","https://openalex.org/W2696990509","https://openalex.org/W1999004162","https://openalex.org/W1521049138"],"abstract_inverted_index":{"In":[0],"this":[1],"paper,":[2],"we":[3,51],"propose":[4,52],"a":[5,28,53,105,155],"deep":[6],"learning":[7],"based":[8],"multi-speaker":[9,98,116],"direction":[10],"of":[11,63,108,125],"arrival":[12],"(DOA)":[13],"estimation":[14,100,147,152],"with":[15,68,95],"audio":[16,39],"and":[17,40,77,138],"visual":[18,41],"signals":[19,42],"by":[20,72,154],"using":[21],"permutation-free":[22],"loss":[23],"function.":[24],"We":[25],"first":[26],"collect":[27],"data":[29,71,137,140],"set":[30],"for":[31,65],"multi-modal":[32],"sound":[33],"source":[34],"localization":[35],"(SSL)":[36],"where":[37],"both":[38,135],"are":[43,128],"recorded":[44],"in":[45,115],"real-life":[46],"home":[47],"TV":[48],"scenarios.":[49],"Then":[50],"novel":[54],"spatial":[55,87],"annotation":[56],"method":[57],"to":[58,81],"produce":[59],"the":[60,69,82,123,143],"ground":[61],"truth":[62],"DOA":[64,99,146,151],"each":[66,126],"speaker":[67,110,127],"video":[70],"transformation":[73],"between":[74],"camera":[75,84],"coordinate":[76,79],"pixel":[78],"according":[80],"pin-hole":[83],"model.":[85],"With":[86],"location":[88],"information":[89],"served":[90],"as":[91,104,130],"another":[92],"input":[93],"along":[94],"acoustic":[96],"feature,":[97],"could":[101],"be":[102,120],"solved":[103],"classification":[106],"task":[107],"active":[109],"detection.":[111],"Label":[112],"permutation":[113],"problem":[114],"related":[117],"tasks":[118],"will":[119],"addressed":[121],"since":[122],"locations":[124],"used":[129],"input.":[131],"Experiments":[132],"conducted":[133],"on":[134],"simulated":[136],"real":[139],"show":[141],"that":[142],"proposed":[144],"audio-visual":[145],"model":[148,153],"outperforms":[149],"audio-only":[150],"large":[156],"margin.":[157]},"counts_by_year":[{"year":2024,"cited_by_count":2},{"year":2023,"cited_by_count":1}],"updated_date":"2025-12-24T23:09:58.560324","created_date":"2025-10-10T00:00:00"}
