{"id":"https://openalex.org/W4406461912","doi":"https://doi.org/10.1109/slt61566.2024.10832212","title":"CTC-GMM: CTC Guided Modality Matching For Fast and Accurate Streaming Speech Translation","display_name":"CTC-GMM: CTC Guided Modality Matching For Fast and Accurate Streaming Speech Translation","publication_year":2024,"publication_date":"2024-12-02","ids":{"openalex":"https://openalex.org/W4406461912","doi":"https://doi.org/10.1109/slt61566.2024.10832212"},"language":"en","primary_location":{"id":"doi:10.1109/slt61566.2024.10832212","is_oa":false,"landing_page_url":"https://doi.org/10.1109/slt61566.2024.10832212","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2024 IEEE Spoken Language Technology Workshop (SLT)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100684043","display_name":"Rui Zhao","orcid":"https://orcid.org/0000-0003-2993-2023"},"institutions":[{"id":"https://openalex.org/I1290206253","display_name":"Microsoft (United States)","ror":"https://ror.org/00d0nc645","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Rui Zhao","raw_affiliation_strings":["Microsoft,USA"],"affiliations":[{"raw_affiliation_string":"Microsoft,USA","institution_ids":["https://openalex.org/I1290206253"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100365056","display_name":"Jinyu Li","orcid":"https://orcid.org/0000-0002-5206-8600"},"institutions":[{"id":"https://openalex.org/I1290206253","display_name":"Microsoft (United States)","ror":"https://ror.org/00d0nc645","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Jinyu Li","raw_affiliation_strings":["Microsoft,USA"],"affiliations":[{"raw_affiliation_string":"Microsoft,USA","institution_ids":["https://openalex.org/I1290206253"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5080637906","display_name":"Ruchao Fan","orcid":"https://orcid.org/0000-0001-5021-2747"},"institutions":[{"id":"https://openalex.org/I1290206253","display_name":"Microsoft (United States)","ror":"https://ror.org/00d0nc645","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Ruchao Fan","raw_affiliation_strings":["Microsoft,USA"],"affiliations":[{"raw_affiliation_string":"Microsoft,USA","institution_ids":["https://openalex.org/I1290206253"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5108266978","display_name":"Matt Post","orcid":null},"institutions":[{"id":"https://openalex.org/I1290206253","display_name":"Microsoft (United States)","ror":"https://ror.org/00d0nc645","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Matt Post","raw_affiliation_strings":["Microsoft,USA"],"affiliations":[{"raw_affiliation_string":"Microsoft,USA","institution_ids":["https://openalex.org/I1290206253"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5100684043"],"corresponding_institution_ids":["https://openalex.org/I1290206253"],"apc_list":null,"apc_paid":null,"fwci":0.3626,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.7083691,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":91,"max":95},"biblio":{"volume":null,"issue":null,"first_page":"1068","last_page":"1075"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9977999925613403,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7652243375778198},{"id":"https://openalex.org/keywords/modality","display_name":"Modality (human\u2013computer interaction)","score":0.5954771637916565},{"id":"https://openalex.org/keywords/translation","display_name":"Translation (biology)","score":0.5500317215919495},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.5393879413604736},{"id":"https://openalex.org/keywords/matching","display_name":"Matching (statistics)","score":0.44784560799598694},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.42095568776130676},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.36331769824028015},{"id":"https://openalex.org/keywords/medicine","display_name":"Medicine","score":0.07566189765930176}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7652243375778198},{"id":"https://openalex.org/C2780226545","wikidata":"https://www.wikidata.org/wiki/Q6888030","display_name":"Modality (human\u2013computer interaction)","level":2,"score":0.5954771637916565},{"id":"https://openalex.org/C149364088","wikidata":"https://www.wikidata.org/wiki/Q185917","display_name":"Translation (biology)","level":4,"score":0.5500317215919495},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5393879413604736},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.44784560799598694},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.42095568776130676},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.36331769824028015},{"id":"https://openalex.org/C71924100","wikidata":"https://www.wikidata.org/wiki/Q11190","display_name":"Medicine","level":0,"score":0.07566189765930176},{"id":"https://openalex.org/C55493867","wikidata":"https://www.wikidata.org/wiki/Q7094","display_name":"Biochemistry","level":1,"score":0.0},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.0},{"id":"https://openalex.org/C104317684","wikidata":"https://www.wikidata.org/wiki/Q7187","display_name":"Gene","level":2,"score":0.0},{"id":"https://openalex.org/C142724271","wikidata":"https://www.wikidata.org/wiki/Q7208","display_name":"Pathology","level":1,"score":0.0},{"id":"https://openalex.org/C105580179","wikidata":"https://www.wikidata.org/wiki/Q188928","display_name":"Messenger RNA","level":3,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/slt61566.2024.10832212","is_oa":false,"landing_page_url":"https://doi.org/10.1109/slt61566.2024.10832212","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2024 IEEE Spoken Language Technology Workshop (SLT)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":48,"referenced_works":["https://openalex.org/W128638292","https://openalex.org/W2013598660","https://openalex.org/W2127141656","https://openalex.org/W2133564696","https://openalex.org/W2157331557","https://openalex.org/W2605131327","https://openalex.org/W2901607128","https://openalex.org/W2952992734","https://openalex.org/W2964161387","https://openalex.org/W3015194534","https://openalex.org/W3034625919","https://openalex.org/W3037542581","https://openalex.org/W3094667432","https://openalex.org/W3097777922","https://openalex.org/W3105214104","https://openalex.org/W3153583341","https://openalex.org/W3161873870","https://openalex.org/W3162000275","https://openalex.org/W3196509775","https://openalex.org/W3197813307","https://openalex.org/W3205644108","https://openalex.org/W3211278025","https://openalex.org/W4210811812","https://openalex.org/W4224137820","https://openalex.org/W4226120743","https://openalex.org/W4252812408","https://openalex.org/W4283834483","https://openalex.org/W4297727296","https://openalex.org/W4313679638","https://openalex.org/W4319862635","https://openalex.org/W4361990931","https://openalex.org/W4377010126","https://openalex.org/W4385822651","https://openalex.org/W4388017359","https://openalex.org/W4389600306","https://openalex.org/W4391021781","https://openalex.org/W4392904087","https://openalex.org/W4392979802","https://openalex.org/W6679434410","https://openalex.org/W6732953234","https://openalex.org/W6747158283","https://openalex.org/W6768570733","https://openalex.org/W6838929754","https://openalex.org/W6847363464","https://openalex.org/W6848735303","https://openalex.org/W6853188576","https://openalex.org/W6862144568","https://openalex.org/W6898505805"],"related_works":["https://openalex.org/W2385859805","https://openalex.org/W2530972254","https://openalex.org/W2374013449","https://openalex.org/W73545470","https://openalex.org/W2364381299","https://openalex.org/W2374430585","https://openalex.org/W3144423903","https://openalex.org/W2377397762","https://openalex.org/W2793967660","https://openalex.org/W2361654993"],"abstract_inverted_index":{"Models":[0],"for":[1,37],"streaming":[2,73,120],"speech":[3,91],"translation":[4,80,137],"(ST)":[5],"can":[6,135],"achieve":[7],"high":[8],"accuracy":[9,138],"and":[10,26,128,142],"low":[11],"latency":[12],"if":[13],"they\u2019re":[14],"developed":[15],"with":[16,126],"vast":[17],"amounts":[18],"of":[19,50],"paired":[20],"audio":[21],"in":[22,29],"the":[23,30,38,47,72,90,100,114,119,132],"source":[24],"language":[25,40,110],"written":[27],"text":[28,35,82,102,111],"target":[31,39],"language.":[32],"Yet,":[33],"these":[34],"labels":[36,44],"are":[41],"often":[42],"pseudo":[43],"due":[45],"to":[46,88,106,117],"prohibitive":[48],"cost":[49],"manual":[51],"ST":[52,74,121],"data":[53],"labeling.":[54],"In":[55],"this":[56],"paper,":[57],"we":[58],"introduce":[59],"a":[60,94],"methodology":[61],"named":[62],"Connectionist":[63],"Temporal":[64],"Classification":[65],"guided":[66],"modality":[67],"matching":[68],"(CTC-GMM)":[69],"that":[70,98,131],"enhances":[71],"model":[75,122],"by":[76,140,150],"leveraging":[77],"extensive":[78],"machine":[79],"(MT)":[81],"data.":[83],"This":[84],"technique":[85],"employs":[86],"CTC":[87],"compress":[89],"sequence":[92,97],"into":[93],"compact":[95],"embedding":[96],"matches":[99],"corresponding":[101],"sequence,":[103],"allowing":[104],"us":[105],"utilize":[107],"matched":[108],"source-target":[109],"pairs":[112],"from":[113],"MT":[115],"corpora":[116],"refine":[118],"further.":[123],"Our":[124],"evaluations":[125],"FLEURS":[127],"CoVoST2":[129],"show":[130],"CTC-GMM":[133],"approach":[134],"increase":[136],"relatively":[139],"13.9%":[141],"6.4%":[143],"respectively,":[144],"while":[145],"also":[146],"boosting":[147],"decoding":[148],"speed":[149],"59.7%":[151],"on":[152],"GPU.":[153]},"counts_by_year":[{"year":2025,"cited_by_count":1}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
