{"id":"https://openalex.org/W4400975497","doi":"https://doi.org/10.1109/taslp.2024.3426309","title":"Learning Multi-Dimensional Speaker Localization: Axis Partitioning, Unbiased Label Distribution, and Data Augmentation","display_name":"Learning Multi-Dimensional Speaker Localization: Axis Partitioning, Unbiased Label Distribution, and Data Augmentation","publication_year":2024,"publication_date":"2024-01-01","ids":{"openalex":"https://openalex.org/W4400975497","doi":"https://doi.org/10.1109/taslp.2024.3426309"},"language":"en","primary_location":{"id":"doi:10.1109/taslp.2024.3426309","is_oa":false,"landing_page_url":"https://doi.org/10.1109/taslp.2024.3426309","pdf_url":null,"source":{"id":"https://openalex.org/S4210169297","display_name":"IEEE/ACM Transactions on Audio Speech and Language Processing","issn_l":"2329-9290","issn":["2329-9290","2329-9304"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE/ACM Transactions on Audio, Speech, and Language Processing","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5032679393","display_name":"Linfeng Feng","orcid":"https://orcid.org/0009-0008-7908-779X"},"institutions":[{"id":"https://openalex.org/I17145004","display_name":"Northwestern Polytechnical University","ror":"https://ror.org/01y0j0j86","country_code":"CN","type":"education","lineage":["https://openalex.org/I17145004"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Linfeng Feng","raw_affiliation_strings":["School of Marine Science and Technology, Northwestern Polytechnical University, Xi&#x0027;an, China"],"affiliations":[{"raw_affiliation_string":"School of Marine Science and Technology, Northwestern Polytechnical University, Xi&#x0027;an, China","institution_ids":["https://openalex.org/I17145004"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5000060623","display_name":"Yijun Gong","orcid":"https://orcid.org/0009-0006-3412-971X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yijun Gong","raw_affiliation_strings":["Xi&#x0027;an Research Institute of Navigation Technology (CETC20), Xi&#x0027;an, China"],"affiliations":[{"raw_affiliation_string":"Xi&#x0027;an Research Institute of Navigation Technology (CETC20), Xi&#x0027;an, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100382312","display_name":"Zhi Liu","orcid":"https://orcid.org/0000-0002-8428-1131"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhi Liu","raw_affiliation_strings":["Shenzhen Huangli Techonogies Company Ltd., Shenzhen, China"],"affiliations":[{"raw_affiliation_string":"Shenzhen Huangli Techonogies Company Ltd., Shenzhen, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100450091","display_name":"Xiao-Lei Zhang","orcid":"https://orcid.org/0000-0001-7694-193X"},"institutions":[{"id":"https://openalex.org/I17145004","display_name":"Northwestern Polytechnical University","ror":"https://ror.org/01y0j0j86","country_code":"CN","type":"education","lineage":["https://openalex.org/I17145004"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xiao-Lei Zhang","raw_affiliation_strings":["School of Marine Science and Technology, Northwestern Polytechnical University, Xi&#x0027;an, China"],"affiliations":[{"raw_affiliation_string":"School of Marine Science and Technology, Northwestern Polytechnical University, Xi&#x0027;an, China","institution_ids":["https://openalex.org/I17145004"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5106943753","display_name":"Xuelong Li","orcid":"https://orcid.org/0000-0003-2924-946X"},"institutions":[{"id":"https://openalex.org/I4210136246","display_name":"China Telecom (China)","ror":"https://ror.org/03jgnzt20","country_code":"CN","type":"company","lineage":["https://openalex.org/I4210136246"]},{"id":"https://openalex.org/I4387153335","display_name":"China Telecom","ror":"https://ror.org/05p67dv18","country_code":null,"type":"company","lineage":["https://openalex.org/I4387153335"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xuelong Li","raw_affiliation_strings":["Institute of Artificial Intelligence (TeleAI), China Telecom Corporation Ltd., Beijing, China"],"affiliations":[{"raw_affiliation_string":"Institute of Artificial Intelligence (TeleAI), China Telecom Corporation Ltd., Beijing, China","institution_ids":["https://openalex.org/I4210136246","https://openalex.org/I4387153335"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5032679393"],"corresponding_institution_ids":["https://openalex.org/I17145004"],"apc_list":null,"apc_paid":null,"fwci":0.7009,"has_fulltext":false,"cited_by_count":2,"citation_normalized_percentile":{"value":0.66995597,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":95,"max":96},"biblio":{"volume":"32","issue":null,"first_page":"4013","last_page":"4025"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9994000196456909,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9987999796867371,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.614955723285675},{"id":"https://openalex.org/keywords/distribution","display_name":"Distribution (mathematics)","score":0.5112435817718506},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.42083853483200073},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.4167175889015198},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4020335078239441},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.3047918975353241}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.614955723285675},{"id":"https://openalex.org/C110121322","wikidata":"https://www.wikidata.org/wiki/Q865811","display_name":"Distribution (mathematics)","level":2,"score":0.5112435817718506},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.42083853483200073},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.4167175889015198},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4020335078239441},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.3047918975353241},{"id":"https://openalex.org/C134306372","wikidata":"https://www.wikidata.org/wiki/Q7754","display_name":"Mathematical analysis","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/taslp.2024.3426309","is_oa":false,"landing_page_url":"https://doi.org/10.1109/taslp.2024.3426309","pdf_url":null,"source":{"id":"https://openalex.org/S4210169297","display_name":"IEEE/ACM Transactions on Audio Speech and Language Processing","issn_l":"2329-9290","issn":["2329-9290","2329-9304"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE/ACM Transactions on Audio, Speech, and Language Processing","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G5057566376","display_name":null,"funder_award_id":"62176211","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"}],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":55,"referenced_works":["https://openalex.org/W71465238","https://openalex.org/W126779258","https://openalex.org/W1494198834","https://openalex.org/W2046317813","https://openalex.org/W2095705004","https://openalex.org/W2101658087","https://openalex.org/W2108581205","https://openalex.org/W2113638573","https://openalex.org/W2120350100","https://openalex.org/W2460742184","https://openalex.org/W2551990143","https://openalex.org/W2763188033","https://openalex.org/W2772736377","https://openalex.org/W2776323726","https://openalex.org/W2885219692","https://openalex.org/W2889426390","https://openalex.org/W2896850928","https://openalex.org/W2945339796","https://openalex.org/W2948091552","https://openalex.org/W2972725301","https://openalex.org/W2975836787","https://openalex.org/W3104947433","https://openalex.org/W3105684258","https://openalex.org/W3106603336","https://openalex.org/W3132229649","https://openalex.org/W3132830522","https://openalex.org/W3162358523","https://openalex.org/W3182358018","https://openalex.org/W3196393301","https://openalex.org/W3197097128","https://openalex.org/W3208740877","https://openalex.org/W4224918929","https://openalex.org/W4283642889","https://openalex.org/W4296068430","https://openalex.org/W4307007543","https://openalex.org/W4312096470","https://openalex.org/W4372260491","https://openalex.org/W4375868867","https://openalex.org/W4375869206","https://openalex.org/W4375869479","https://openalex.org/W4378895563","https://openalex.org/W4383468850","https://openalex.org/W4393395473","https://openalex.org/W4395471098","https://openalex.org/W4402083555","https://openalex.org/W4405359008","https://openalex.org/W6605165268","https://openalex.org/W6674330103","https://openalex.org/W6757817989","https://openalex.org/W6792180040","https://openalex.org/W6803179888","https://openalex.org/W6846810392","https://openalex.org/W6854211159","https://openalex.org/W6857466839","https://openalex.org/W6889901531"],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W1979597421","https://openalex.org/W2007980826","https://openalex.org/W2061531152","https://openalex.org/W3002753104","https://openalex.org/W2077600819","https://openalex.org/W2142036596","https://openalex.org/W2072657027","https://openalex.org/W2033914206","https://openalex.org/W2042327336"],"abstract_inverted_index":{"Multi-dimensional":[0],"speaker":[1],"localization":[2,129],"(SL)":[3],"aims":[4],"to":[5,74,85,135,159,182],"estimate":[6],"the":[7,21,34,46,53,76,80,86,92,103,111,123,127,137,141,145,201,205],"two-":[8],"or":[9],"three-dimensional":[10],"locations":[11],"of":[12,44,72,82,88,140,167],"speakers.":[13],"A":[14],"recent":[15],"advancement":[16],"in":[17,144],"multi-dimensional":[18,98],"SL":[19,35,99],"is":[20],"end-to-end":[22],"deep":[23],"neural":[24],"networks":[25],"(DNNs)":[26],"with":[27],"ad-hoc":[28],"microphone":[29],"arrays.":[30],"This":[31],"method":[32],"transforms":[33],"problem":[36,43],"into":[37,115],"a":[38,42,69,133,151,165],"classification":[39,54],"problem,":[40,130],"i.e.":[41],"identifying":[45],"grids":[47,73,83],"where":[48],"speakers":[49],"are":[50,171],"located.":[51],"However,":[52,78],"formulation":[55],"has":[56,102],"two":[57],"closely":[58],"connected":[59],"weaknesses.":[60],"Firstly,":[61],"this":[62],"approach":[63],"introduces":[64],"quantization":[65,162],"error,":[66],"which":[67,101,120],"needs":[68],"large":[70],"number":[71,81],"mitigate":[75],"error.":[77],"increasing":[79],"leads":[84],"curse":[87],"dimensionality.":[89],"To":[90],"address":[91],"problems,":[93],"we":[94,109,131,149],"propose":[95],"an":[96],"efficient":[97],"algorithm,":[100],"following":[104],"three":[105],"novel":[106],"contributions.":[107],"First,":[108],"decouple":[110],"high-dimensional":[112],"grid":[113],"partitioning":[114,143],"<italic":[116,153],"xmlns:mml=\"http://www.w3.org/1998/Math/MathML\"":[117,154],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">axis":[118],"partitioning</i>,":[119],"substantially":[121],"mitigates":[122],"curse-of-dimensionality.":[124],"Particularly,":[125],"for":[126],"multi-speaker":[128],"employ":[132],"separator":[134],"circumvent":[136],"permutation":[138],"ambiguity":[139],"axis":[142],"inference":[146],"stage.":[147],"Second,":[148],"introduce":[150],"comprehensive":[152],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">unbiased":[155],"label":[156],"distribution</i>":[157],"scheme":[158],"further":[160],"eliminate":[161],"errors.":[163],"Finally,":[164],"set":[166],"data":[168],"augmentation":[169],"techniques":[170],"proposed,":[172],"including":[173],"coordinate":[174],"transformation,":[175],"stochastic":[176],"node":[177],"selection,":[178],"and":[179,185,197,200],"mixed":[180],"training,":[181],"alleviate":[183],"overfitting":[184],"sample":[186],"imbalance":[187],"problems.":[188],"The":[189],"proposed":[190],"methods":[191],"were":[192],"evaluated":[193],"on":[194],"both":[195],"simulated":[196],"real-world":[198],"data,":[199],"experimental":[202],"results":[203],"confirm":[204],"effectiveness.":[206]},"counts_by_year":[{"year":2025,"cited_by_count":2}],"updated_date":"2026-03-27T05:58:40.876381","created_date":"2025-10-10T00:00:00"}
