{"id":"https://openalex.org/W7148421105","doi":"https://doi.org/10.1109/asru65441.2025.11434643","title":"Long-Form Fuzzy Speech-to-Text Alignment for 1000+ Languages","display_name":"Long-Form Fuzzy Speech-to-Text Alignment for 1000+ Languages","publication_year":2025,"publication_date":"2025-12-06","ids":{"openalex":"https://openalex.org/W7148421105","doi":"https://doi.org/10.1109/asru65441.2025.11434643"},"language":null,"primary_location":{"id":"doi:10.1109/asru65441.2025.11434643","is_oa":false,"landing_page_url":"https://doi.org/10.1109/asru65441.2025.11434643","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5007094260","display_name":"Ruizhe Huang","orcid":"https://orcid.org/0009-0005-6031-3775"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Ruizhe Huang","raw_affiliation_strings":["Meta,USA"],"affiliations":[{"raw_affiliation_string":"Meta,USA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132811032","display_name":"Xiaohui Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xiaohui Zhang","raw_affiliation_strings":["Meta,USA"],"affiliations":[{"raw_affiliation_string":"Meta,USA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132816482","display_name":"Zhaoheng Ni","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhaoheng Ni","raw_affiliation_strings":["Meta,USA"],"affiliations":[{"raw_affiliation_string":"Meta,USA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5025983396","display_name":"Moto Hira","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Moto Hira","raw_affiliation_strings":["Meta,USA"],"affiliations":[{"raw_affiliation_string":"Meta,USA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5021004964","display_name":"Jeff Hwang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jeff Hwang","raw_affiliation_strings":["Meta,USA"],"affiliations":[{"raw_affiliation_string":"Meta,USA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5127805542","display_name":"Vineel Pratap","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Vineel Pratap","raw_affiliation_strings":["Meta,USA"],"affiliations":[{"raw_affiliation_string":"Meta,USA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132794021","display_name":"Ju Lin","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ju Lin","raw_affiliation_strings":["Meta,USA"],"affiliations":[{"raw_affiliation_string":"Meta,USA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132804150","display_name":"Ming Sun","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ming Sun","raw_affiliation_strings":["Meta,USA"],"affiliations":[{"raw_affiliation_string":"Meta,USA","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5132810008","display_name":"Florian Metze","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Florian Metze","raw_affiliation_strings":["Meta,USA"],"affiliations":[{"raw_affiliation_string":"Meta,USA","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":9,"corresponding_author_ids":["https://openalex.org/A5007094260"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.87564034,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"3"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.5034000277519226,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.5034000277519226,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.21070000529289246,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12090","display_name":"Language and cultural evolution","score":0.030300000682473183,"subfield":{"id":"https://openalex.org/subfields/3316","display_name":"Cultural Studies"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/fuzzy-logic","display_name":"Fuzzy logic","score":0.5008000135421753},{"id":"https://openalex.org/keywords/fuzzy-control-system","display_name":"Fuzzy control system","score":0.32710000872612},{"id":"https://openalex.org/keywords/natural-language","display_name":"Natural language","score":0.319599986076355},{"id":"https://openalex.org/keywords/fuzzy-set","display_name":"Fuzzy set","score":0.2883000075817108},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.27000001072883606}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6140999794006348},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5098000168800354},{"id":"https://openalex.org/C58166","wikidata":"https://www.wikidata.org/wiki/Q224821","display_name":"Fuzzy logic","level":2,"score":0.5008000135421753},{"id":"https://openalex.org/C195975749","wikidata":"https://www.wikidata.org/wiki/Q1475705","display_name":"Fuzzy control system","level":3,"score":0.32710000872612},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.319599986076355},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3052999973297119},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.2946000099182129},{"id":"https://openalex.org/C42011625","wikidata":"https://www.wikidata.org/wiki/Q1055058","display_name":"Fuzzy set","level":3,"score":0.2883000075817108},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.27219998836517334},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.27070000767707825},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.27000001072883606},{"id":"https://openalex.org/C149364088","wikidata":"https://www.wikidata.org/wiki/Q185917","display_name":"Translation (biology)","level":4,"score":0.26260000467300415},{"id":"https://openalex.org/C148671577","wikidata":"https://www.wikidata.org/wiki/Q5511133","display_name":"Fuzzy set operations","level":4,"score":0.250900000333786},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.25040000677108765}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/asru65441.2025.11434643","is_oa":false,"landing_page_url":"https://doi.org/10.1109/asru65441.2025.11434643","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":12,"referenced_works":["https://openalex.org/W1526908311","https://openalex.org/W2069893028","https://openalex.org/W2130200371","https://openalex.org/W2145410271","https://openalex.org/W2747874407","https://openalex.org/W4226491018","https://openalex.org/W4385822293","https://openalex.org/W4386566728","https://openalex.org/W4391021755","https://openalex.org/W4392902865","https://openalex.org/W4404781824","https://openalex.org/W7131801114"],"related_works":[],"abstract_inverted_index":{"Conventional":[0],"speech-to-text":[1,73],"forced":[2],"alignment":[3],"typically":[4],"operates":[5],"at":[6],"the":[7,30,49,63,68,94,117],"utterance":[8],"level.":[9],"In":[10],"practice,":[11],"however,":[12],"we":[13],"do":[14],"not":[15],"usually":[16],"have":[17],"short":[18],"segments":[19],"(e.g.,":[20,29,43],"10":[21],"seconds)":[22],"of":[23],"audio":[24,38,85],"with":[25,62,108],"exact,":[26],"verbatim":[27],"transcriptions":[28],"LibriSpeech":[31],"corpus)":[32],"as":[33],"in":[34,41],"lab":[35],"conditions.":[36],"Instead,":[37],"often":[39],"comes":[40],"long-form":[42,71,96],"an":[44],"hour-long":[45],"lecture":[46],"recording),":[47],"and":[48,122],"available":[50],"transcription":[51],"may":[52],"be":[53,106],"non-verbatim":[54],"or":[55],"include":[56],"unspoken":[57],"annotations,":[58],"making":[59],"it":[60,104],"misaligned":[61],"actual":[64],"speech.":[65],"This":[66],"motivates":[67],"need":[69],"for":[70,80,87],"fuzzy":[72],"alignment,":[74],"which":[75,98],"has":[76],"practical":[77],"applications":[78],"-":[79],"example,":[81],"preparing":[82],"segmented":[83],"supervised":[84],"data":[86],"training":[88],"machine":[89],"learning":[90],"models.":[91],"We":[92],"demonstrate":[93],"Torchaudio":[95],"aligner,":[97],"supports":[99],"such":[100],"use":[101],"cases.":[102],"Moreover,":[103],"can":[105],"equipped":[107],"any":[109],"CTC":[110],"model":[111,118],"that":[112],"predicts":[113],"frame-wise":[114],"labels,":[115],"turning":[116],"into":[119],"a":[120],"robust":[121],"powerful":[123],"aligner.":[124]},"counts_by_year":[],"updated_date":"2026-04-03T16:44:17.987007","created_date":"2026-04-03T00:00:00"}
