{"id":"https://openalex.org/W4372346399","doi":"https://doi.org/10.1109/icassp49357.2023.10095400","title":"Multilingual Query-by-Example Keyword Spotting with Metric Learning and Phoneme-to-Embedding Mapping","display_name":"Multilingual Query-by-Example Keyword Spotting with Metric Learning and Phoneme-to-Embedding Mapping","publication_year":2023,"publication_date":"2023-05-05","ids":{"openalex":"https://openalex.org/W4372346399","doi":"https://doi.org/10.1109/icassp49357.2023.10095400"},"language":"en","primary_location":{"id":"doi:10.1109/icassp49357.2023.10095400","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp49357.2023.10095400","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5039602141","display_name":"Paul M. Reuter","orcid":null},"institutions":[{"id":"https://openalex.org/I4210138578","display_name":"Fraunhofer Institute for Digital Media Technology","ror":"https://ror.org/04gp0yb49","country_code":"DE","type":"facility","lineage":["https://openalex.org/I4210138578","https://openalex.org/I4923324"]},{"id":"https://openalex.org/I4210144375","display_name":"Hearing4all","ror":"https://ror.org/0393vzh87","country_code":"DE","type":"facility","lineage":["https://openalex.org/I4210144375"]}],"countries":["DE"],"is_corresponding":true,"raw_author_name":"Paul M. Reuter","raw_affiliation_strings":["Carl von Ossietzky University,Communication Acoustics and Cluster of Excellence Hearing4all,Oldenburg,Germany","Communication Acoustics and Cluster of Excellence Hearing4all, Carl von Ossietzky University, Oldenburg, Germany","Division Hearing, Speech and Audio Technology, Fraunhofer Institute for Digital Media Technology IDMT, Oldenburg, Germany"],"affiliations":[{"raw_affiliation_string":"Carl von Ossietzky University,Communication Acoustics and Cluster of Excellence Hearing4all,Oldenburg,Germany","institution_ids":["https://openalex.org/I4210144375"]},{"raw_affiliation_string":"Communication Acoustics and Cluster of Excellence Hearing4all, Carl von Ossietzky University, Oldenburg, Germany","institution_ids":["https://openalex.org/I4210144375"]},{"raw_affiliation_string":"Division Hearing, Speech and Audio Technology, Fraunhofer Institute for Digital Media Technology IDMT, Oldenburg, Germany","institution_ids":["https://openalex.org/I4210138578"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5057557331","display_name":"Christian Rollwage","orcid":null},"institutions":[{"id":"https://openalex.org/I4210138578","display_name":"Fraunhofer Institute for Digital Media Technology","ror":"https://ror.org/04gp0yb49","country_code":"DE","type":"facility","lineage":["https://openalex.org/I4210138578","https://openalex.org/I4923324"]}],"countries":["DE"],"is_corresponding":false,"raw_author_name":"Christian Rollwage","raw_affiliation_strings":["Fraunhofer Institute for Digital Media Technology IDMT,Division Hearing, Speech and Audio Technology,Oldenburg,Germany","Division Hearing, Speech and Audio Technology, Fraunhofer Institute for Digital Media Technology IDMT, Oldenburg, Germany"],"affiliations":[{"raw_affiliation_string":"Fraunhofer Institute for Digital Media Technology IDMT,Division Hearing, Speech and Audio Technology,Oldenburg,Germany","institution_ids":["https://openalex.org/I4210138578"]},{"raw_affiliation_string":"Division Hearing, Speech and Audio Technology, Fraunhofer Institute for Digital Media Technology IDMT, Oldenburg, Germany","institution_ids":["https://openalex.org/I4210138578"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5067491941","display_name":"Bernd T. Meyer","orcid":"https://orcid.org/0000-0001-9190-2111"},"institutions":[{"id":"https://openalex.org/I4210144375","display_name":"Hearing4all","ror":"https://ror.org/0393vzh87","country_code":"DE","type":"facility","lineage":["https://openalex.org/I4210144375"]}],"countries":["DE"],"is_corresponding":false,"raw_author_name":"Bernd T. Meyer","raw_affiliation_strings":["Carl von Ossietzky University,Communication Acoustics and Cluster of Excellence Hearing4all,Oldenburg,Germany","Communication Acoustics and Cluster of Excellence Hearing4all, Carl von Ossietzky University, Oldenburg, Germany"],"affiliations":[{"raw_affiliation_string":"Carl von Ossietzky University,Communication Acoustics and Cluster of Excellence Hearing4all,Oldenburg,Germany","institution_ids":["https://openalex.org/I4210144375"]},{"raw_affiliation_string":"Communication Acoustics and Cluster of Excellence Hearing4all, Carl von Ossietzky University, Oldenburg, Germany","institution_ids":["https://openalex.org/I4210144375"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5039602141"],"corresponding_institution_ids":["https://openalex.org/I4210138578","https://openalex.org/I4210144375"],"apc_list":null,"apc_paid":null,"fwci":2.6323,"has_fulltext":false,"cited_by_count":13,"citation_normalized_percentile":{"value":0.90704592,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":97,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"5"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9994999766349792,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9994999766349792,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9994000196456909,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11550","display_name":"Text and Document Classification Technologies","score":0.9979000091552734,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/keyword-spotting","display_name":"Keyword spotting","score":0.9005703926086426},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8706290125846863},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.5961803793907166},{"id":"https://openalex.org/keywords/embedding","display_name":"Embedding","score":0.527673065662384},{"id":"https://openalex.org/keywords/classifier","display_name":"Classifier (UML)","score":0.5265657901763916},{"id":"https://openalex.org/keywords/metric","display_name":"Metric (unit)","score":0.5172513127326965},{"id":"https://openalex.org/keywords/spotting","display_name":"Spotting","score":0.49318569898605347},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.4894510805606842},{"id":"https://openalex.org/keywords/generalization","display_name":"Generalization","score":0.464999794960022},{"id":"https://openalex.org/keywords/residual","display_name":"Residual","score":0.44111761450767517},{"id":"https://openalex.org/keywords/word-error-rate","display_name":"Word error rate","score":0.43808993697166443},{"id":"https://openalex.org/keywords/word","display_name":"Word (group theory)","score":0.41750895977020264},{"id":"https://openalex.org/keywords/word-embedding","display_name":"Word embedding","score":0.41416120529174805},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.3980327844619751}],"concepts":[{"id":"https://openalex.org/C2781213101","wikidata":"https://www.wikidata.org/wiki/Q6398558","display_name":"Keyword spotting","level":2,"score":0.9005703926086426},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8706290125846863},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5961803793907166},{"id":"https://openalex.org/C41608201","wikidata":"https://www.wikidata.org/wiki/Q980509","display_name":"Embedding","level":2,"score":0.527673065662384},{"id":"https://openalex.org/C95623464","wikidata":"https://www.wikidata.org/wiki/Q1096149","display_name":"Classifier (UML)","level":2,"score":0.5265657901763916},{"id":"https://openalex.org/C176217482","wikidata":"https://www.wikidata.org/wiki/Q860554","display_name":"Metric (unit)","level":2,"score":0.5172513127326965},{"id":"https://openalex.org/C2779506182","wikidata":"https://www.wikidata.org/wiki/Q7580141","display_name":"Spotting","level":2,"score":0.49318569898605347},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.4894510805606842},{"id":"https://openalex.org/C177148314","wikidata":"https://www.wikidata.org/wiki/Q170084","display_name":"Generalization","level":2,"score":0.464999794960022},{"id":"https://openalex.org/C155512373","wikidata":"https://www.wikidata.org/wiki/Q287450","display_name":"Residual","level":2,"score":0.44111761450767517},{"id":"https://openalex.org/C40969351","wikidata":"https://www.wikidata.org/wiki/Q3516228","display_name":"Word error rate","level":2,"score":0.43808993697166443},{"id":"https://openalex.org/C90805587","wikidata":"https://www.wikidata.org/wiki/Q10944557","display_name":"Word (group theory)","level":2,"score":0.41750895977020264},{"id":"https://openalex.org/C2777462759","wikidata":"https://www.wikidata.org/wiki/Q18395344","display_name":"Word embedding","level":3,"score":0.41416120529174805},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3980327844619751},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C134306372","wikidata":"https://www.wikidata.org/wiki/Q7754","display_name":"Mathematical analysis","level":1,"score":0.0},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.0},{"id":"https://openalex.org/C21547014","wikidata":"https://www.wikidata.org/wiki/Q1423657","display_name":"Operations management","level":1,"score":0.0},{"id":"https://openalex.org/C162324750","wikidata":"https://www.wikidata.org/wiki/Q8134","display_name":"Economics","level":0,"score":0.0},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.0},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1109/icassp49357.2023.10095400","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp49357.2023.10095400","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},{"id":"pmh:oai:publica.fraunhofer.de:publica/448324","is_oa":false,"landing_page_url":"https://publica.fraunhofer.de/handle/publica/448324","pdf_url":null,"source":{"id":"https://openalex.org/S4306400318","display_name":"Fraunhofer-Publica (Fraunhofer-Gesellschaft)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I4923324","host_organization_name":"Fraunhofer-Gesellschaft","host_organization_lineage":["https://openalex.org/I4923324"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"conference paper"}],"best_oa_location":null,"sustainable_development_goals":[{"score":0.7900000214576721,"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education"}],"awards":[],"funders":[{"id":"https://openalex.org/F4320320879","display_name":"Deutsche Forschungsgemeinschaft","ror":"https://ror.org/018mejw64"}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":36,"referenced_works":["https://openalex.org/W114193738","https://openalex.org/W1496120315","https://openalex.org/W1522301498","https://openalex.org/W2034940213","https://openalex.org/W2056986588","https://openalex.org/W2122797512","https://openalex.org/W2171019095","https://openalex.org/W2194775991","https://openalex.org/W2219249508","https://openalex.org/W2696967604","https://openalex.org/W2747874407","https://openalex.org/W2797583228","https://openalex.org/W2890964092","https://openalex.org/W2963628261","https://openalex.org/W2973226577","https://openalex.org/W3008790380","https://openalex.org/W3013020904","https://openalex.org/W3025581723","https://openalex.org/W3030437843","https://openalex.org/W3034303554","https://openalex.org/W3090254849","https://openalex.org/W3097018422","https://openalex.org/W3119913666","https://openalex.org/W3144247233","https://openalex.org/W3148564648","https://openalex.org/W3162273446","https://openalex.org/W3163582231","https://openalex.org/W3197436729","https://openalex.org/W4206567542","https://openalex.org/W4289243832","https://openalex.org/W6631190155","https://openalex.org/W6688816777","https://openalex.org/W6750665317","https://openalex.org/W6757022092","https://openalex.org/W6771467084","https://openalex.org/W6783527727"],"related_works":["https://openalex.org/W2918559346","https://openalex.org/W3119978414","https://openalex.org/W2114097550","https://openalex.org/W2516975559","https://openalex.org/W2545741539","https://openalex.org/W3206647229","https://openalex.org/W4286904253","https://openalex.org/W2000885660","https://openalex.org/W1969408022","https://openalex.org/W2117995638"],"abstract_inverted_index":{"In":[0],"this":[1],"paper,":[2],"we":[3],"propose":[4],"a":[5,14,23,26,53,70,93,100,126,130],"multilingual":[6,27],"query-by-example":[7],"keyword":[8,28,105,108],"spotting":[9,106],"(KWS)":[10],"system":[11,98],"based":[12],"on":[13,25,110,121],"residual":[15],"neural":[16],"network.":[17],"The":[18],"model":[19,47,83],"is":[20],"trained":[21],"as":[22],"classifier":[24],"dataset":[29,124],"extracted":[30],"from":[31,88],"Common":[32,111],"Voice":[33,112],"sentences":[34],"and":[35,51,63,107],"fine-tuned":[36],"using":[37,92,114],"circle":[38],"loss.":[39],"We":[40,73],"demonstrate":[41],"the":[42,46,76,81,89,122],"generalization":[43],"ability":[44],"of":[45,58,134],"to":[48,69],"new":[49],"languages":[50,67],"report":[52],"mean":[54],"reduction":[55],"in":[56],"EER":[57],"59.2%":[59],"for":[60,65,103],"previously":[61],"seen":[62],"47.9%":[64],"unseen":[66],"compared":[68],"competitive":[71],"baseline.":[72],"show":[74,125],"that":[75],"word":[77],"embeddings":[78],"learned":[79],"by":[80],"KWS":[82],"can":[84],"be":[85],"accurately":[86],"predicted":[87],"phoneme":[90],"sequences":[91],"simple":[94],"LSTM":[95],"model.":[96],"Our":[97],"achieves":[99],"promising":[101],"accuracy":[102],"streaming":[104],"search":[109],"audio":[113],"just":[115],"5":[116],"examples":[117],"per":[118,141],"keyword.":[119],"Experiments":[120],"Hey-Snips":[123],"good":[127],"performance":[128],"with":[129],"false":[131,139],"negative":[132],"rate":[133],"5.4%":[135],"at":[136],"only":[137],"0.1":[138],"alarms":[140],"hour.":[142]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":5},{"year":2024,"cited_by_count":7}],"updated_date":"2026-03-09T08:58:05.943551","created_date":"2025-10-10T00:00:00"}
