{"id":"https://openalex.org/W6891937798","doi":"https://doi.org/10.48550/arxiv.2508.04333","title":"Binaural Sound Event Localization and Detection Neural Network based on HRTF Localization Cues for Humanoid Robots","display_name":"Binaural Sound Event Localization and Detection Neural Network based on HRTF Localization Cues for Humanoid Robots","publication_year":2025,"publication_date":"2025-08-06","ids":{"openalex":"https://openalex.org/W6891937798","doi":"https://doi.org/10.48550/arxiv.2508.04333"},"language":"en","primary_location":{"id":"doi:10.48550/arxiv.2508.04333","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2508.04333","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2508.04333","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Lee, Gyeong-Tae","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Lee, Gyeong-Tae","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":1,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":true,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.2838999927043915,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.2838999927043915,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12740","display_name":"Gait Recognition and Analysis","score":0.23639999330043793,"subfield":{"id":"https://openalex.org/subfields/2204","display_name":"Biomedical Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.09480000287294388,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/binaural-recording","display_name":"Binaural recording","score":0.8848999738693237},{"id":"https://openalex.org/keywords/sound-localization","display_name":"Sound localization","score":0.593999981880188},{"id":"https://openalex.org/keywords/humanoid-robot","display_name":"Humanoid robot","score":0.5928000211715698},{"id":"https://openalex.org/keywords/interaural-time-difference","display_name":"Interaural time difference","score":0.5784000158309937},{"id":"https://openalex.org/keywords/focus","display_name":"Focus (optics)","score":0.5299000144004822},{"id":"https://openalex.org/keywords/event","display_name":"Event (particle physics)","score":0.5296000242233276},{"id":"https://openalex.org/keywords/noise","display_name":"Noise (video)","score":0.4941999912261963},{"id":"https://openalex.org/keywords/artificial-neural-network","display_name":"Artificial neural network","score":0.46889999508857727},{"id":"https://openalex.org/keywords/visualization","display_name":"Visualization","score":0.4074000120162964}],"concepts":[{"id":"https://openalex.org/C201247586","wikidata":"https://www.wikidata.org/wiki/Q5612967","display_name":"Binaural recording","level":2,"score":0.8848999738693237},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6869000196456909},{"id":"https://openalex.org/C68236139","wikidata":"https://www.wikidata.org/wiki/Q765652","display_name":"Sound localization","level":2,"score":0.593999981880188},{"id":"https://openalex.org/C60692881","wikidata":"https://www.wikidata.org/wiki/Q584529","display_name":"Humanoid robot","level":3,"score":0.5928000211715698},{"id":"https://openalex.org/C103364767","wikidata":"https://www.wikidata.org/wiki/Q1807562","display_name":"Interaural time difference","level":3,"score":0.5784000158309937},{"id":"https://openalex.org/C192209626","wikidata":"https://www.wikidata.org/wiki/Q190909","display_name":"Focus (optics)","level":2,"score":0.5299000144004822},{"id":"https://openalex.org/C2779662365","wikidata":"https://www.wikidata.org/wiki/Q5416694","display_name":"Event (particle physics)","level":2,"score":0.5296000242233276},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4977000057697296},{"id":"https://openalex.org/C99498987","wikidata":"https://www.wikidata.org/wiki/Q2210247","display_name":"Noise (video)","level":3,"score":0.4941999912261963},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.47859999537467957},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.46889999508857727},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.4074000120162964},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.40380001068115234},{"id":"https://openalex.org/C82600853","wikidata":"https://www.wikidata.org/wiki/Q2115271","display_name":"Precedence effect","level":3,"score":0.39899998903274536},{"id":"https://openalex.org/C90509273","wikidata":"https://www.wikidata.org/wiki/Q11012","display_name":"Robot","level":2,"score":0.36570000648498535},{"id":"https://openalex.org/C151382886","wikidata":"https://www.wikidata.org/wiki/Q1432854","display_name":"Head-related transfer function","level":3,"score":0.36469998955726624},{"id":"https://openalex.org/C23224414","wikidata":"https://www.wikidata.org/wiki/Q176769","display_name":"Hidden Markov model","level":2,"score":0.34630000591278076},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.322299987077713},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.3091999888420105},{"id":"https://openalex.org/C81299745","wikidata":"https://www.wikidata.org/wiki/Q334269","display_name":"Transfer function","level":2,"score":0.29600000381469727},{"id":"https://openalex.org/C12267149","wikidata":"https://www.wikidata.org/wiki/Q282453","display_name":"Support vector machine","level":2,"score":0.2809999883174896},{"id":"https://openalex.org/C29265498","wikidata":"https://www.wikidata.org/wiki/Q7047719","display_name":"Noise measurement","level":3,"score":0.27059999108314514},{"id":"https://openalex.org/C37054046","wikidata":"https://www.wikidata.org/wiki/Q641888","display_name":"Elevation (ballistics)","level":2,"score":0.2689000070095062},{"id":"https://openalex.org/C204241405","wikidata":"https://www.wikidata.org/wiki/Q461499","display_name":"Transformation (genetics)","level":3,"score":0.2660999894142151},{"id":"https://openalex.org/C127220857","wikidata":"https://www.wikidata.org/wiki/Q2719318","display_name":"Audio signal processing","level":4,"score":0.2599000036716461},{"id":"https://openalex.org/C24890656","wikidata":"https://www.wikidata.org/wiki/Q82811","display_name":"Acoustics","level":1,"score":0.25949999690055847},{"id":"https://openalex.org/C142433447","wikidata":"https://www.wikidata.org/wiki/Q7806653","display_name":"Time\u2013frequency analysis","level":3,"score":0.25850000977516174},{"id":"https://openalex.org/C203718221","wikidata":"https://www.wikidata.org/wiki/Q491713","display_name":"Sound (geography)","level":2,"score":0.25769999623298645},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.2556000053882599},{"id":"https://openalex.org/C14036430","wikidata":"https://www.wikidata.org/wiki/Q3736076","display_name":"Function (biology)","level":2,"score":0.2513999938964844},{"id":"https://openalex.org/C163294075","wikidata":"https://www.wikidata.org/wiki/Q581861","display_name":"Noise reduction","level":2,"score":0.2506999969482422}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2508.04333","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2508.04333","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2508.04333","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2508.04333","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"score":0.8248078227043152,"display_name":"Sustainable cities and communities","id":"https://metadata.un.org/sdg/11"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Humanoid":[0],"robots":[1],"require":[2],"simultaneous":[3,136],"sound":[4,29,132],"event":[5,30,133],"type":[6],"and":[7,21,32,45,90,109,138],"direction":[8,128],"estimation":[9,20],"for":[10,98,130,159],"situational":[11],"awareness,":[12],"but":[13],"conventional":[14],"two-channel":[15],"input":[16,54],"struggles":[17],"with":[18,87,181],"elevation":[19,160],"front-back":[22,88],"confusion.":[23],"This":[24],"paper":[25],"proposes":[26],"a":[27],"binaural":[28,53,59,182],"localization":[31,50],"detection":[33,137],"(BiSELD)":[34],"neural":[35],"network":[36,149],"to":[37,123,147],"address":[38],"these":[39],"challenges.":[40],"BiSELDnet":[41],"learns":[42],"time-frequency":[43,60],"patterns":[44],"head-related":[46],"transfer":[47],"function":[48],"(HRTF)":[49],"cues":[51],"from":[52],"features.":[55],"A":[56],"novel":[57],"eight-channel":[58],"feature":[61],"(BTFF)":[62],"is":[63],"introduced,":[64],"comprising":[65],"left/right":[66],"mel-spectrograms,":[67],"V-maps,":[68],"an":[69,78],"interaural":[70,79],"time":[71,125],"difference":[72,81],"(ITD)":[73],"map":[74,83,142],"(below":[75],"1.5":[76],"kHz),":[77],"level":[80],"(ILD)":[82],"(above":[84,95],"5":[85,96],"kHz":[86,97],"asymmetry),":[89],"spectral":[91],"cue":[92],"(SC)":[93],"maps":[94],"elevation).":[99],"The":[100],"effectiveness":[101],"of":[102,127],"BTFF":[103],"was":[104,145],"confirmed":[105],"across":[106],"omnidirectional,":[107],"horizontal,":[108],"median":[110],"planes.":[111],"BiSELDnets,":[112],"particularly":[113],"one":[114],"based":[115],"on":[116,154],"the":[117,155,171],"efficient":[118],"Trinity":[119],"module,":[120],"were":[121],"implemented":[122],"output":[124],"series":[126],"vectors":[129],"each":[131],"class,":[134],"enabling":[135],"localization.":[139],"Vector":[140],"activation":[141],"(VAM)":[143],"visualization":[144],"proposed":[146,172],"analyze":[148],"learning,":[150],"confirming":[151],"BiSELDnet's":[152],"focus":[153],"N1":[156],"notch":[157],"frequency":[158],"estimation.":[161],"Comparative":[162],"evaluations":[163],"under":[164],"urban":[165],"background":[166],"noise":[167],"conditions":[168],"demonstrated":[169],"that":[170],"BiSELD":[173],"model":[174],"significantly":[175],"outperforms":[176],"state-of-the-art":[177],"(SOTA)":[178],"SELD":[179],"models":[180],"input.":[183]},"counts_by_year":[],"updated_date":"2025-11-06T06:51:31.235846","created_date":"2025-10-10T00:00:00"}
