{"id":"https://openalex.org/W7148393372","doi":"https://doi.org/10.1109/asru65441.2025.11434738","title":"WST: Weakly Supervised Transducer for Automatic Speech Recognition","display_name":"WST: Weakly Supervised Transducer for Automatic Speech Recognition","publication_year":2025,"publication_date":"2025-12-06","ids":{"openalex":"https://openalex.org/W7148393372","doi":"https://doi.org/10.1109/asru65441.2025.11434738"},"language":null,"primary_location":{"id":"doi:10.1109/asru65441.2025.11434738","is_oa":false,"landing_page_url":"https://doi.org/10.1109/asru65441.2025.11434738","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5072814956","display_name":"Dongji Gao","orcid":"https://orcid.org/0009-0006-8885-3084"},"institutions":[{"id":"https://openalex.org/I145311948","display_name":"Johns Hopkins University","ror":"https://ror.org/00za53h95","country_code":"US","type":"education","lineage":["https://openalex.org/I145311948"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Dongji Gao","raw_affiliation_strings":["Johns Hopkins University"],"affiliations":[{"raw_affiliation_string":"Johns Hopkins University","institution_ids":["https://openalex.org/I145311948"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5056665779","display_name":"Chenda Liao","orcid":null},"institutions":[{"id":"https://openalex.org/I4210164937","display_name":"Microsoft Research (United Kingdom)","ror":"https://ror.org/05k87vq12","country_code":"GB","type":"company","lineage":["https://openalex.org/I1290206253","https://openalex.org/I4210164937"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Chenda Liao","raw_affiliation_strings":["Microsoft"],"affiliations":[{"raw_affiliation_string":"Microsoft","institution_ids":["https://openalex.org/I4210164937"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5022890229","display_name":"Changliang Liu","orcid":"https://orcid.org/0009-0005-8201-0871"},"institutions":[{"id":"https://openalex.org/I4210164937","display_name":"Microsoft Research (United Kingdom)","ror":"https://ror.org/05k87vq12","country_code":"GB","type":"company","lineage":["https://openalex.org/I1290206253","https://openalex.org/I4210164937"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Changliang Liu","raw_affiliation_strings":["Microsoft"],"affiliations":[{"raw_affiliation_string":"Microsoft","institution_ids":["https://openalex.org/I4210164937"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132804747","display_name":"Matthew Wiesner","orcid":null},"institutions":[{"id":"https://openalex.org/I145311948","display_name":"Johns Hopkins University","ror":"https://ror.org/00za53h95","country_code":"US","type":"education","lineage":["https://openalex.org/I145311948"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Matthew Wiesner","raw_affiliation_strings":["Johns Hopkins University"],"affiliations":[{"raw_affiliation_string":"Johns Hopkins University","institution_ids":["https://openalex.org/I145311948"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126951883","display_name":"Leibny Paola Garcia","orcid":null},"institutions":[{"id":"https://openalex.org/I145311948","display_name":"Johns Hopkins University","ror":"https://ror.org/00za53h95","country_code":"US","type":"education","lineage":["https://openalex.org/I145311948"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Leibny Paola Garcia","raw_affiliation_strings":["Johns Hopkins University"],"affiliations":[{"raw_affiliation_string":"Johns Hopkins University","institution_ids":["https://openalex.org/I145311948"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132826681","display_name":"Daniel Povey","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Daniel Povey","raw_affiliation_strings":["Xiaomi"],"affiliations":[{"raw_affiliation_string":"Xiaomi","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5014580424","display_name":"Sanjeev Khudanpur","orcid":"https://orcid.org/0000-0001-5976-0897"},"institutions":[{"id":"https://openalex.org/I145311948","display_name":"Johns Hopkins University","ror":"https://ror.org/00za53h95","country_code":"US","type":"education","lineage":["https://openalex.org/I145311948"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Sanjeev Khudanpur","raw_affiliation_strings":["Johns Hopkins University"],"affiliations":[{"raw_affiliation_string":"Johns Hopkins University","institution_ids":["https://openalex.org/I145311948"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5103070335","display_name":"Jian Wu","orcid":"https://orcid.org/0000-0003-2606-6422"},"institutions":[{"id":"https://openalex.org/I4210164937","display_name":"Microsoft Research (United Kingdom)","ror":"https://ror.org/05k87vq12","country_code":"GB","type":"company","lineage":["https://openalex.org/I1290206253","https://openalex.org/I4210164937"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Jian Wu","raw_affiliation_strings":["Microsoft"],"affiliations":[{"raw_affiliation_string":"Microsoft","institution_ids":["https://openalex.org/I4210164937"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":8,"corresponding_author_ids":["https://openalex.org/A5072814956"],"corresponding_institution_ids":["https://openalex.org/I145311948"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.87561762,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"7"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9280999898910522,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9280999898910522,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.024000000208616257,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.00570000009611249,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/robustness","display_name":"Robustness (evolution)","score":0.6140999794006348},{"id":"https://openalex.org/keywords/training-set","display_name":"Training set","score":0.4325000047683716},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.4221000075340271},{"id":"https://openalex.org/keywords/connectionism","display_name":"Connectionism","score":0.4205999970436096},{"id":"https://openalex.org/keywords/transcription","display_name":"Transcription (linguistics)","score":0.40959998965263367},{"id":"https://openalex.org/keywords/artificial-neural-network","display_name":"Artificial neural network","score":0.3718000054359436},{"id":"https://openalex.org/keywords/word-error-rate","display_name":"Word error rate","score":0.35850000381469727},{"id":"https://openalex.org/keywords/supervised-learning","display_name":"Supervised learning","score":0.3472999930381775}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7849000096321106},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.6140999794006348},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5953999757766724},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5318999886512756},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.4325000047683716},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.4221000075340271},{"id":"https://openalex.org/C8521452","wikidata":"https://www.wikidata.org/wiki/Q203790","display_name":"Connectionism","level":3,"score":0.4205999970436096},{"id":"https://openalex.org/C179926584","wikidata":"https://www.wikidata.org/wiki/Q207714","display_name":"Transcription (linguistics)","level":2,"score":0.40959998965263367},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.40790000557899475},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.3718000054359436},{"id":"https://openalex.org/C40969351","wikidata":"https://www.wikidata.org/wiki/Q3516228","display_name":"Word error rate","level":2,"score":0.35850000381469727},{"id":"https://openalex.org/C136389625","wikidata":"https://www.wikidata.org/wiki/Q334384","display_name":"Supervised learning","level":3,"score":0.3472999930381775},{"id":"https://openalex.org/C2984842247","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep neural networks","level":3,"score":0.3452000021934509},{"id":"https://openalex.org/C132525143","wikidata":"https://www.wikidata.org/wiki/Q141488","display_name":"Graph","level":2,"score":0.3447999954223633},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.3052000105381012},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.29989999532699585},{"id":"https://openalex.org/C56318395","wikidata":"https://www.wikidata.org/wiki/Q215928","display_name":"Transducer","level":2,"score":0.2992999851703644},{"id":"https://openalex.org/C204201278","wikidata":"https://www.wikidata.org/wiki/Q1332614","display_name":"Voice activity detection","level":3,"score":0.2930999994277954},{"id":"https://openalex.org/C95623464","wikidata":"https://www.wikidata.org/wiki/Q1096149","display_name":"Classifier (UML)","level":2,"score":0.2718999981880188},{"id":"https://openalex.org/C23224414","wikidata":"https://www.wikidata.org/wiki/Q176769","display_name":"Hidden Markov model","level":2,"score":0.2632000148296356},{"id":"https://openalex.org/C2776145971","wikidata":"https://www.wikidata.org/wiki/Q30673951","display_name":"Labeled data","level":2,"score":0.25450000166893005}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/asru65441.2025.11434738","is_oa":false,"landing_page_url":"https://doi.org/10.1109/asru65441.2025.11434738","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"score":0.46513545513153076,"id":"https://metadata.un.org/sdg/16","display_name":"Peace, Justice and strong institutions"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":16,"referenced_works":["https://openalex.org/W1494198834","https://openalex.org/W2127141656","https://openalex.org/W2327501763","https://openalex.org/W2514741789","https://openalex.org/W2766219058","https://openalex.org/W3015686596","https://openalex.org/W3095311338","https://openalex.org/W3097777922","https://openalex.org/W3097973766","https://openalex.org/W3211278025","https://openalex.org/W4372267411","https://openalex.org/W4385822648","https://openalex.org/W4386025763","https://openalex.org/W4391021555","https://openalex.org/W4391021675","https://openalex.org/W4406461721"],"related_works":[],"abstract_inverted_index":{"The":[0,122],"Recurrent":[1],"Neural":[2],"Network-Transducer":[3],"(RNN-T)":[4],"is":[5],"widely":[6],"adopted":[7],"in":[8,54,118],"end-to-end":[9],"(E2E)":[10],"automatic":[11],"speech":[12],"recognition":[13],"(ASR)":[14],"tasks":[15],"but":[16],"depends":[17],"heavily":[18],"on":[19,68],"large-scale,":[20],"high-quality":[21],"annotated":[22],"data,":[23],"which":[24,43],"are":[25],"often":[26],"costly":[27],"and":[28,70,104,114],"difficult":[29],"to":[30,50,86],"obtain.":[31],"To":[32],"mitigate":[33],"this":[34],"reliance,":[35],"we":[36],"propose":[37],"a":[38,45],"Weakly":[39],"Supervised":[40],"Transducer":[41],"(WST),":[42],"integrates":[44],"flexible":[46],"training":[47],"graph":[48],"designed":[49],"robustly":[51],"handle":[52],"errors":[53],"the":[55,111],"transcripts":[56],"without":[57],"requiring":[58],"additional":[59],"confidence":[60],"estimation":[61],"or":[62],"auxiliary":[63],"pre-trained":[64],"models.":[65],"Empirical":[66],"evaluations":[67],"synthetic":[69],"industrial":[71],"datasets":[72],"reveal":[73],"that":[74],"WST":[75,117],"effectively":[76],"maintains":[77],"performance":[78],"even":[79],"with":[80],"transcription":[81],"error":[82],"rates":[83],"of":[84,116],"up":[85],"70%,":[87],"consistently":[88],"outperforming":[89],"existing":[90],"Connectionist":[91],"Temporal":[92,101],"Classification":[93,102,106],"(CTC)-based":[94],"weakly":[95],"supervised":[96],"approaches,":[97],"such":[98],"as":[99],"Bypass":[100],"(BTC)":[103],"Omni-Temporal":[105],"(OTC).":[107],"These":[108],"results":[109],"demonstrate":[110],"practical":[112],"utility":[113],"robustness":[115],"realistic":[119],"ASR":[120],"settings.":[121],"implementation":[123],"will":[124],"be":[125],"publicly":[126],"available.":[127]},"counts_by_year":[],"updated_date":"2026-04-03T16:44:17.987007","created_date":"2026-04-03T00:00:00"}
