{"id":"https://openalex.org/W4226135414","doi":"https://doi.org/10.1109/asru51503.2021.9687967","title":"Multi-Task Learning with Cross Attention for Keyword Spotting","display_name":"Multi-Task Learning with Cross Attention for Keyword Spotting","publication_year":2021,"publication_date":"2021-12-13","ids":{"openalex":"https://openalex.org/W4226135414","doi":"https://doi.org/10.1109/asru51503.2021.9687967"},"language":"en","primary_location":{"id":"doi:10.1109/asru51503.2021.9687967","is_oa":false,"landing_page_url":"https://doi.org/10.1109/asru51503.2021.9687967","pdf_url":null,"source":{"id":"https://openalex.org/S4363606113","display_name":"2021 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2021 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5005549030","display_name":"Takuya Higuchil","orcid":null},"institutions":[{"id":"https://openalex.org/I4210107260","display_name":"Apple (United Kingdom)","ror":"https://ror.org/01vpeym60","country_code":"GB","type":"company","lineage":["https://openalex.org/I4210107260"]}],"countries":["GB"],"is_corresponding":true,"raw_author_name":"Takuya Higuchil","raw_affiliation_strings":["Apple"],"affiliations":[{"raw_affiliation_string":"Apple","institution_ids":["https://openalex.org/I4210107260"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5014436016","display_name":"Anmol Gupta","orcid":"https://orcid.org/0000-0003-0159-8780"},"institutions":[{"id":"https://openalex.org/I889458895","display_name":"University of Hong Kong","ror":"https://ror.org/02zhqgq86","country_code":"HK","type":"education","lineage":["https://openalex.org/I889458895"]}],"countries":["HK"],"is_corresponding":false,"raw_author_name":"Anmol Gupta","raw_affiliation_strings":["The University of Hong Kong,Department of Computer Science"],"affiliations":[{"raw_affiliation_string":"The University of Hong Kong,Department of Computer Science","institution_ids":["https://openalex.org/I889458895"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5110720542","display_name":"Chandra Dhir","orcid":null},"institutions":[{"id":"https://openalex.org/I4210107260","display_name":"Apple (United Kingdom)","ror":"https://ror.org/01vpeym60","country_code":"GB","type":"company","lineage":["https://openalex.org/I4210107260"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Chandra Dhir","raw_affiliation_strings":["Apple"],"affiliations":[{"raw_affiliation_string":"Apple","institution_ids":["https://openalex.org/I4210107260"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5005549030"],"corresponding_institution_ids":["https://openalex.org/I4210107260"],"apc_list":null,"apc_paid":null,"fwci":0.8796,"has_fulltext":false,"cited_by_count":9,"citation_normalized_percentile":{"value":0.78538044,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":89,"max":97},"biblio":{"volume":null,"issue":null,"first_page":"571","last_page":"578"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9997000098228455,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9990000128746033,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8494805693626404},{"id":"https://openalex.org/keywords/keyword-spotting","display_name":"Keyword spotting","score":0.8417513370513916},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.6581733822822571},{"id":"https://openalex.org/keywords/encoder","display_name":"Encoder","score":0.5478888750076294},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.545341968536377},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.48005566000938416},{"id":"https://openalex.org/keywords/classifier","display_name":"Classifier (UML)","score":0.4655137062072754},{"id":"https://openalex.org/keywords/phrase","display_name":"Phrase","score":0.4516028165817261},{"id":"https://openalex.org/keywords/multi-task-learning","display_name":"Multi-task learning","score":0.43725812435150146},{"id":"https://openalex.org/keywords/machine-learning","display_name":"Machine learning","score":0.3291815519332886}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8494805693626404},{"id":"https://openalex.org/C2781213101","wikidata":"https://www.wikidata.org/wiki/Q6398558","display_name":"Keyword spotting","level":2,"score":0.8417513370513916},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6581733822822571},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.5478888750076294},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.545341968536377},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.48005566000938416},{"id":"https://openalex.org/C95623464","wikidata":"https://www.wikidata.org/wiki/Q1096149","display_name":"Classifier (UML)","level":2,"score":0.4655137062072754},{"id":"https://openalex.org/C2776224158","wikidata":"https://www.wikidata.org/wiki/Q187931","display_name":"Phrase","level":2,"score":0.4516028165817261},{"id":"https://openalex.org/C28006648","wikidata":"https://www.wikidata.org/wiki/Q6934509","display_name":"Multi-task learning","level":3,"score":0.43725812435150146},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3291815519332886},{"id":"https://openalex.org/C162324750","wikidata":"https://www.wikidata.org/wiki/Q8134","display_name":"Economics","level":0,"score":0.0},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0},{"id":"https://openalex.org/C187736073","wikidata":"https://www.wikidata.org/wiki/Q2920921","display_name":"Management","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/asru51503.2021.9687967","is_oa":false,"landing_page_url":"https://doi.org/10.1109/asru51503.2021.9687967","pdf_url":null,"source":{"id":"https://openalex.org/S4363606113","display_name":"2021 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2021 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education","score":0.550000011920929}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":32,"referenced_works":["https://openalex.org/W1485222997","https://openalex.org/W1522301498","https://openalex.org/W1553469512","https://openalex.org/W2034940213","https://openalex.org/W2407023693","https://openalex.org/W2507319753","https://openalex.org/W2507580616","https://openalex.org/W2510945575","https://openalex.org/W2697044473","https://openalex.org/W2748659049","https://openalex.org/W2775572503","https://openalex.org/W2787752687","https://openalex.org/W2797759721","https://openalex.org/W2889511491","https://openalex.org/W2891722048","https://openalex.org/W2953219395","https://openalex.org/W2962707338","https://openalex.org/W2963977978","https://openalex.org/W2972818416","https://openalex.org/W3001493895","https://openalex.org/W3007328579","https://openalex.org/W3095212293","https://openalex.org/W3095694146","https://openalex.org/W3096831963","https://openalex.org/W3097330909","https://openalex.org/W3161425572","https://openalex.org/W4385245566","https://openalex.org/W6631190155","https://openalex.org/W6747003534","https://openalex.org/W6780226713","https://openalex.org/W6784576758","https://openalex.org/W6843142645"],"related_works":["https://openalex.org/W2114097550","https://openalex.org/W4385352507","https://openalex.org/W2918559346","https://openalex.org/W4286904253","https://openalex.org/W84309476","https://openalex.org/W2386245264","https://openalex.org/W2388033618","https://openalex.org/W2378448517","https://openalex.org/W2017737780","https://openalex.org/W4287993417"],"abstract_inverted_index":{"Keyword":[0],"spotting":[1],"(KWS)":[2],"is":[3,43,82],"an":[4,76,79],"important":[5],"technique":[6],"for":[7,28,37,87,92,102,166],"speech":[8,39],"applications,":[9],"which":[10],"enables":[11],"users":[12],"to":[13,63,65,161,191],"activate":[14],"devices":[15],"by":[16,148],"speaking":[17],"a":[18,22,31,44,115,145,157,163,181,200],"keyword":[19,103],"phrase.":[20],"Although":[21],"phoneme":[23,93],"classifier":[24],"can":[25],"be":[26],"used":[27],"KWS,":[29],"exploiting":[30],"large":[32],"amount":[33],"of":[34,78,134],"transcribed":[35],"data":[36,99],"automatic":[38],"recognition":[40],"(ASR),":[41],"there":[42],"mismatch":[45],"between":[46,152],"the":[47,53,88,97,107,120,125,131,135,138,153,167,177,186,192],"training":[48,71],"criterion":[49],"(phoneme":[50],"recognition)":[51],"and":[52,69,100,156,199],"target":[54],"task":[55],"(KWS).":[56],"Recently,":[57],"multi-task":[58,121,127,194],"learning":[59,122,128,195],"has":[60],"been":[61],"applied":[62],"KWS":[64,70,108,168,173],"exploit":[66],"both":[67],"ASR":[68,98],"data.":[72,109],"In":[73,110],"this":[74,111],"approach,":[75],"output":[77,136],"acoustic":[80],"model":[81],"split":[83,133,197],"into":[84],"two":[85,89],"branches":[86,198],"tasks,":[90],"one":[91,101],"transcription":[94],"trained":[95,105],"with":[96,106,130,196],"classification":[104],"paper,":[112],"we":[113],"introduce":[114],"cross":[116,139,150],"attention":[117,140,151],"decoder":[118,141],"in":[119,185],"framework.":[123],"Unlike":[124],"conventional":[126,193],"approach":[129,179],"simple":[132],"layer,":[137],"summarizes":[142],"information":[143],"from":[144],"phonetic":[146],"encoder":[147,154],"performing":[149],"outputs":[155],"trainable":[158],"query":[159],"sequence":[160],"predict":[162],"confidence":[164],"score":[165],"task.":[169],"Experimental":[170],"results":[171],"on":[172],"tasks":[174],"show":[175],"that":[176],"proposed":[178],"achieves":[180],"12%":[182],"relative":[183],"reduction":[184],"false":[187],"reject":[188],"ratios":[189],"compared":[190],"bi-directional":[201],"long":[202],"short-team":[203],"memory":[204],"decoder.":[205]},"counts_by_year":[{"year":2025,"cited_by_count":2},{"year":2024,"cited_by_count":3},{"year":2023,"cited_by_count":3},{"year":2022,"cited_by_count":1}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
