{"id":"https://openalex.org/W4392538456","doi":"https://doi.org/10.1145/3648536.3648539","title":"Single-Channel Robot Ego-Speech Filtering during Human-Robot Interaction","display_name":"Single-Channel Robot Ego-Speech Filtering during Human-Robot Interaction","publication_year":2024,"publication_date":"2024-03-09","ids":{"openalex":"https://openalex.org/W4392538456","doi":"https://doi.org/10.1145/3648536.3648539"},"language":"en","primary_location":{"id":"doi:10.1145/3648536.3648539","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3648536.3648539","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3648536.3648539","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2024 International Symposium on Technological Advances in Human-Robot Interaction","raw_type":"proceedings-article"},"type":"preprint","indexed_in":["arxiv","crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://dl.acm.org/doi/pdf/10.1145/3648536.3648539","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100387803","display_name":"Yue Li","orcid":"https://orcid.org/0000-0002-5624-7235"},"institutions":[{"id":"https://openalex.org/I865915315","display_name":"Vrije Universiteit Amsterdam","ror":"https://ror.org/008xxew50","country_code":"NL","type":"education","lineage":["https://openalex.org/I865915315"]}],"countries":["NL"],"is_corresponding":true,"raw_author_name":"Yue Li","raw_affiliation_strings":["Social AI, Vrije Universiteit, Netherlands"],"raw_orcid":"https://orcid.org/0000-0002-5624-7235","affiliations":[{"raw_affiliation_string":"Social AI, Vrije Universiteit, Netherlands","institution_ids":["https://openalex.org/I865915315"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5071518251","display_name":"Koen V. Hindriks","orcid":"https://orcid.org/0000-0002-5707-5236"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Koen Hindriks","raw_affiliation_strings":["VU University, Netherlands"],"raw_orcid":"https://orcid.org/0000-0002-5707-5236","affiliations":[{"raw_affiliation_string":"VU University, Netherlands","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5063456893","display_name":"Florian Kunneman","orcid":"https://orcid.org/0000-0002-1932-3200"},"institutions":[{"id":"https://openalex.org/I865915315","display_name":"Vrije Universiteit Amsterdam","ror":"https://ror.org/008xxew50","country_code":"NL","type":"education","lineage":["https://openalex.org/I865915315"]}],"countries":["NL"],"is_corresponding":false,"raw_author_name":"Florian Kunneman","raw_affiliation_strings":["Department for Computer Science, Vrije Universiteit, Netherlands"],"raw_orcid":"https://orcid.org/0000-0002-1932-3200","affiliations":[{"raw_affiliation_string":"Department for Computer Science, Vrije Universiteit, Netherlands","institution_ids":["https://openalex.org/I865915315"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5100387803"],"corresponding_institution_ids":["https://openalex.org/I865915315"],"apc_list":null,"apc_paid":null,"fwci":1.3153,"has_fulltext":true,"cited_by_count":4,"citation_normalized_percentile":{"value":0.78253724,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":90,"max":97},"biblio":{"volume":null,"issue":null,"first_page":"20","last_page":"28"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9993000030517578,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9959999918937683,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.770932674407959},{"id":"https://openalex.org/keywords/reverberation","display_name":"Reverberation","score":0.6944581270217896},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.6791473031044006},{"id":"https://openalex.org/keywords/robot","display_name":"Robot","score":0.5252437591552734},{"id":"https://openalex.org/keywords/microphone","display_name":"Microphone","score":0.48484838008880615},{"id":"https://openalex.org/keywords/voice-activity-detection","display_name":"Voice activity detection","score":0.4635464549064636},{"id":"https://openalex.org/keywords/speech-enhancement","display_name":"Speech enhancement","score":0.45726144313812256},{"id":"https://openalex.org/keywords/discriminative-model","display_name":"Discriminative model","score":0.43354955315589905},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.39507901668548584},{"id":"https://openalex.org/keywords/speech-processing","display_name":"Speech processing","score":0.36196714639663696},{"id":"https://openalex.org/keywords/noise-reduction","display_name":"Noise reduction","score":0.17880025506019592},{"id":"https://openalex.org/keywords/acoustics","display_name":"Acoustics","score":0.15106400847434998}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.770932674407959},{"id":"https://openalex.org/C95851461","wikidata":"https://www.wikidata.org/wiki/Q468809","display_name":"Reverberation","level":2,"score":0.6944581270217896},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6791473031044006},{"id":"https://openalex.org/C90509273","wikidata":"https://www.wikidata.org/wiki/Q11012","display_name":"Robot","level":2,"score":0.5252437591552734},{"id":"https://openalex.org/C2778263558","wikidata":"https://www.wikidata.org/wiki/Q46384","display_name":"Microphone","level":3,"score":0.48484838008880615},{"id":"https://openalex.org/C204201278","wikidata":"https://www.wikidata.org/wiki/Q1332614","display_name":"Voice activity detection","level":3,"score":0.4635464549064636},{"id":"https://openalex.org/C2776182073","wikidata":"https://www.wikidata.org/wiki/Q7575395","display_name":"Speech enhancement","level":3,"score":0.45726144313812256},{"id":"https://openalex.org/C97931131","wikidata":"https://www.wikidata.org/wiki/Q5282087","display_name":"Discriminative model","level":2,"score":0.43354955315589905},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.39507901668548584},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.36196714639663696},{"id":"https://openalex.org/C163294075","wikidata":"https://www.wikidata.org/wiki/Q581861","display_name":"Noise reduction","level":2,"score":0.17880025506019592},{"id":"https://openalex.org/C24890656","wikidata":"https://www.wikidata.org/wiki/Q82811","display_name":"Acoustics","level":1,"score":0.15106400847434998},{"id":"https://openalex.org/C76155785","wikidata":"https://www.wikidata.org/wiki/Q418","display_name":"Telecommunications","level":1,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C68115822","wikidata":"https://www.wikidata.org/wiki/Q1068172","display_name":"Sound pressure","level":2,"score":0.0}],"mesh":[],"locations_count":5,"locations":[{"id":"doi:10.1145/3648536.3648539","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3648536.3648539","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3648536.3648539","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2024 International Symposium on Technological Advances in Human-Robot Interaction","raw_type":"proceedings-article"},{"id":"pmh:oai:research.vu.nl:openaire/bc5a41aa-dcc5-4702-acd0-36230138d96f","is_oa":true,"landing_page_url":"https://research.vu.nl/en/publications/bc5a41aa-dcc5-4702-acd0-36230138d96f","pdf_url":null,"source":{"id":"https://openalex.org/S4306401107","display_name":"VU Research Portal","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I865915315","host_organization_name":"Vrije Universiteit Amsterdam","host_organization_lineage":["https://openalex.org/I865915315"],"host_organization_lineage_names":[],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Li, Y, Hindriks, K & Kunneman, F 2024, Single-Channel Robot Ego-Speech Filtering during Human-Robot Interaction. in TAHRI '24 : Proceedings of the 2024 International Symposium on Technological Advances in Human-Robot Interaction. Association for Computing Machinery, pp. 20-28, 2024 International Symposium on Technological Advances in Human-Robot Interaction, TAHRI 2024, Boulder, United States, 9/03/24. https://doi.org/10.1145/3648536.3648539","raw_type":"info:eu-repo/semantics/publishedVersion"},{"id":"pmh:oai:arXiv.org:2403.02918","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2403.02918","pdf_url":"https://arxiv.org/pdf/2403.02918","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"pmh:oai:dspace.library.uu.nl:1874/482333","is_oa":true,"landing_page_url":"https://dspace.library.uu.nl/handle/1874/482333","pdf_url":null,"source":{"id":"https://openalex.org/S4306401649","display_name":"Utrecht University Repository (Utrecht University)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I193662353","host_organization_name":"Utrecht University","host_organization_lineage":["https://openalex.org/I193662353"],"host_organization_lineage_names":[],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Part of book"},{"id":"pmh:oai:research.vu.nl:publications/bc5a41aa-dcc5-4702-acd0-36230138d96f","is_oa":true,"landing_page_url":"https://hdl.handle.net/1871.1/bc5a41aa-dcc5-4702-acd0-36230138d96f","pdf_url":null,"source":{"id":"https://openalex.org/S4306401107","display_name":"VU Research Portal","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I865915315","host_organization_name":"Vrije Universiteit Amsterdam","host_organization_lineage":["https://openalex.org/I865915315"],"host_organization_lineage_names":[],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"Li, Y, Hindriks, K & Kunneman, F 2024, Single-Channel Robot Ego-Speech Filtering during Human-Robot Interaction. in TAHRI '24 : Proceedings of the 2024 International Symposium on Technological Advances in Human-Robot Interaction. Association for Computing Machinery, pp. 20-28, 2024 International Symposium on Technological Advances in Human-Robot Interaction, TAHRI 2024, Boulder, United States, 9/03/24. https://doi.org/10.1145/3648536.3648539","raw_type":"info:eu-repo/semantics/publishedVersion"}],"best_oa_location":{"id":"doi:10.1145/3648536.3648539","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3648536.3648539","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3648536.3648539","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2024 International Symposium on Technological Advances in Human-Robot Interaction","raw_type":"proceedings-article"},"sustainable_development_goals":[{"score":0.6700000166893005,"id":"https://metadata.un.org/sdg/16","display_name":"Peace, Justice and strong institutions"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4392538456.pdf"},"referenced_works_count":25,"referenced_works":["https://openalex.org/W317957491","https://openalex.org/W1494198834","https://openalex.org/W1502692652","https://openalex.org/W1975856388","https://openalex.org/W2087368178","https://openalex.org/W2120353785","https://openalex.org/W2128653836","https://openalex.org/W2152321507","https://openalex.org/W2221409856","https://openalex.org/W2291877678","https://openalex.org/W2460742184","https://openalex.org/W2606977969","https://openalex.org/W2951130829","https://openalex.org/W2964058413","https://openalex.org/W2972568703","https://openalex.org/W2973062255","https://openalex.org/W2974141411","https://openalex.org/W3010658001","https://openalex.org/W3011424113","https://openalex.org/W3015191643","https://openalex.org/W3015623828","https://openalex.org/W3019269619","https://openalex.org/W3112188842","https://openalex.org/W4311000453","https://openalex.org/W4367597591"],"related_works":["https://openalex.org/W1562475690","https://openalex.org/W1488529827","https://openalex.org/W2396048001","https://openalex.org/W2905188205","https://openalex.org/W1559044324","https://openalex.org/W2120771489","https://openalex.org/W2122030153","https://openalex.org/W2051376034","https://openalex.org/W2294333436","https://openalex.org/W2955597484"],"abstract_inverted_index":{"In":[0],"this":[1,14],"paper,":[2],"we":[3,104,172,221],"study":[4],"how":[5,89],"well":[6,90],"human":[7,53,131,236,305,327],"speech":[8,84,117,132,194,237,257,288,309,328],"can":[9,37,54,79,92],"automatically":[10],"be":[11,80,93],"filtered":[12],"when":[13,40,289],"overlaps":[15],"with":[16,142,153,196,310],"the":[17,35,41,52,56,61,68,72,76,96,99,128,136,175,182,192,200,209,235,243,246,251,267,274,286,296,304,320,326],"voice":[18,306],"and":[19,130,145,154,157,239,259,325],"fan":[20,122],"noise":[21,123],"of":[22,75,98,112,115,118,187,245],"a":[23,46,109,113,140,149,158,166,223,311,330],"social":[24,101],"robot,":[25],"Pepper.":[26],"We":[27,231],"ultimately":[28],"aim":[29],"for":[30,206,215,284],"an":[31],"HRI":[32],"scenario":[33],"where":[34,51],"microphone":[36],"remain":[38],"open":[39],"robot":[42,62,102,312],"is":[43,125,203,212,262,271,281,291,313,323],"speaking,":[44],"enabling":[45],"more":[47,204,293],"natural":[48],"turn-taking":[49],"scheme":[50],"interrupt":[55],"robot.":[57],"To":[58,87],"respond":[59],"appropriately,":[60],"would":[63],"need":[64],"to":[65,107,127,165,264],"understand":[66],"what":[67],"interlocutor":[69],"said":[70],"in":[71,95,139,185,226,278,307,315],"overlapping":[73,193,308],"part":[74],"speech,":[77],"which":[78],"accomplished":[81,94],"by":[82,135],"target":[83,297],"extraction":[85],"(TSE).":[86],"investigate":[88],"TSE":[91,170,279],"context":[97],"popular":[100],"Pepper,":[103],"set":[105],"out":[106],"manufacture":[108],"datase":[110],"composed":[111],"mixture":[114],"recorded":[116,134],"Pepper":[119,137],"itself,":[120],"its":[121],"(which":[124],"close":[126],"microphones),":[129],"as":[133],"microphone,":[138],"room":[141,321],"low":[143,197,324],"reverberation":[144,322],"high":[146,331,334],"reverberation.":[147,207],"Comparing":[148],"signal":[150,176,252],"processing":[151,177,253],"approach,":[152],"without":[155,179],"post-filtering,":[156],"convolutional":[159],"recurrent":[160],"neural":[161,268],"network":[162,269],"(CRNN)":[163],"approach":[164,178,202],"state-of-the-art":[167],"speaker":[168],"identification-based":[169],"model,":[171],"found":[173],"that":[174,302,319],"post-filtering":[180],"yielded":[181],"best":[183,210],"performance":[184,211,227,244],"terms":[186],"Word":[188],"Error":[189],"Rate":[190],"on":[191,256],"signals":[195],"reverberation,":[198,265],"while":[199,220,266],"CRNN":[201],"robust":[205],"Moreover,":[208],"not":[213,282],"sufficient":[214],"consistent":[216],"comprehension":[217],"after":[218],"filtering,":[219],"see":[222],"large":[224],"diversity":[225],"across":[228],"our":[229],"dataset.":[230],"conclude":[232],"that,":[233],"first,":[234],"volume":[238,332],"pitch":[240],"strongly":[241],"affect":[242],"proposed":[247],"method\u2019s":[248],"results;":[249],"second,":[250],"method":[254,270],"based":[255],"masking":[258],"spectral":[260],"subtraction":[261],"keen":[263],"robust;":[272],"third,":[273],"batch":[275],"normalization":[276],"layer":[277],"models":[280],"useful":[283],"filtering":[285],"interference":[287],"it":[290],"significantly":[292],"powerful":[294],"than":[295],"speech.":[298],"These":[299],"results":[300],"show":[301],"estimating":[303],"possible":[314],"real-life":[316],"application,":[317],"provided":[318],"has":[329],"or":[333],"pitch.":[335]},"counts_by_year":[{"year":2025,"cited_by_count":3},{"year":2024,"cited_by_count":1}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
