{"id":"https://openalex.org/W4402111441","doi":"https://doi.org/10.21437/interspeech.2024-1342","title":"Language-Universal Speech Attributes Modeling for Zero-Shot Multilingual Spoken Keyword Recognition","display_name":"Language-Universal Speech Attributes Modeling for Zero-Shot Multilingual Spoken Keyword Recognition","publication_year":2024,"publication_date":"2024-09-01","ids":{"openalex":"https://openalex.org/W4402111441","doi":"https://doi.org/10.21437/interspeech.2024-1342"},"language":"en","primary_location":{"id":"doi:10.21437/interspeech.2024-1342","is_oa":false,"landing_page_url":"http://dx.doi.org/10.21437/interspeech.2024-1342","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Interspeech 2024","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5000242115","display_name":"Hao Yen","orcid":"https://orcid.org/0000-0001-8897-4368"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Hao Yen","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5058318424","display_name":"Pin-Jui Ku","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Pin-Jui Ku","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5079659476","display_name":"Sabato Marco Siniscalchi","orcid":"https://orcid.org/0000-0002-0770-0507"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sabato Marco Siniscalchi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5066868860","display_name":"Chin\u2010Hui Lee","orcid":"https://orcid.org/0000-0002-1892-2551"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chin-Hui Lee","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5000242115"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.12163965,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"342","last_page":"346"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.8062000274658203,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.8062000274658203,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.8003000020980835,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8042041659355164},{"id":"https://openalex.org/keywords/zero","display_name":"Zero (linguistics)","score":0.6507778763771057},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.6432507038116455},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.5807851552963257},{"id":"https://openalex.org/keywords/spoken-language","display_name":"Spoken language","score":0.452729195356369},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4383021891117096},{"id":"https://openalex.org/keywords/linguistics","display_name":"Linguistics","score":0.20708128809928894}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8042041659355164},{"id":"https://openalex.org/C2780813799","wikidata":"https://www.wikidata.org/wiki/Q3274237","display_name":"Zero (linguistics)","level":2,"score":0.6507778763771057},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6432507038116455},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.5807851552963257},{"id":"https://openalex.org/C2776230583","wikidata":"https://www.wikidata.org/wiki/Q1322198","display_name":"Spoken language","level":2,"score":0.452729195356369},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4383021891117096},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.20708128809928894},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.21437/interspeech.2024-1342","is_oa":false,"landing_page_url":"http://dx.doi.org/10.21437/interspeech.2024-1342","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Interspeech 2024","raw_type":"proceedings-article"},{"id":"pmh:oai:iris.unipa.it:10447/670044","is_oa":false,"landing_page_url":"https://hdl.handle.net/10447/670044","pdf_url":null,"source":{"id":"https://openalex.org/S4306401065","display_name":"Nova Science Publishers (Nova Science Publishers, Inc.)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"info:eu-repo/semantics/bookPart"}],"best_oa_location":null,"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","score":0.5299999713897705,"display_name":"Quality Education"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W3013650182","https://openalex.org/W2989283631","https://openalex.org/W4249605382","https://openalex.org/W4313491656","https://openalex.org/W3279617","https://openalex.org/W1991183963","https://openalex.org/W2053087750","https://openalex.org/W2146390824","https://openalex.org/W2250701745","https://openalex.org/W3204019825"],"abstract_inverted_index":{"We":[0],"propose":[1],"a":[2,16,22,43,63],"novel":[3],"language-universal":[4],"approach":[5],"to":[6,36,47,75],"end-to-end":[7],"automatic":[8],"spoken":[9,60],"keyword":[10],"recognition":[11],"(SKR)":[12],"leveraging":[13],"upon":[14],"(i)":[15],"self-supervised":[17],"pre-trained":[18],"model,":[19],"and":[20,29,100,111,114],"(ii)":[21],"set":[23],"of":[24,31,57,83],"universal":[25],"speech":[26,39],"attributes":[27,58],"(manner":[28],"place":[30],"articulation).Specifically,":[32],"Wav2Vec2.0":[33],"is":[34],"used":[35],"generate":[37],"robust":[38],"representations,":[40],"followed":[41],"by":[42],"linear":[44],"output":[45],"layer":[46],"produce":[48],"attribute":[49],"sequences.A":[50],"non-trainable":[51],"pronunciation":[52],"model":[53],"then":[54],"maps":[55],"sequences":[56],"into":[59],"keywords":[61],"in":[62,79,108,121],"multilingual":[64],"setting.Experiments":[65],"on":[66],"the":[67,89],"Multilingual":[68],"Spoken":[69],"Words":[70],"Corpus":[71],"show":[72],"comparable":[73],"performances":[74],"character-and":[76,94],"phoneme-based":[77,95],"SKR":[78,96],"seen":[80,109],"languages.The":[81],"inclusion":[82],"domain":[84],"adversarial":[85],"training":[86],"(DAT)":[87],"improves":[88],"proposed":[90],"framework,":[91],"outperforming":[92],"both":[93],"approaches":[97],"with":[98],"13.73%":[99],"17.22%":[101],"relative":[102],"word":[103],"error":[104],"rate":[105],"(WER)":[106],"reduction":[107,117],"languages,":[110],"achieves":[112],"32.14%":[113],"19.92%":[115],"WER":[116],"for":[118],"unseen":[119],"languages":[120],"zero-shot":[122],"settings.":[123]},"counts_by_year":[],"updated_date":"2025-12-21T01:58:51.020947","created_date":"2025-10-10T00:00:00"}
