{"id":"https://openalex.org/W7148324329","doi":"https://doi.org/10.1109/asru65441.2025.11434635","title":"Whisper Has an Internal Word Aligner","display_name":"Whisper Has an Internal Word Aligner","publication_year":2025,"publication_date":"2025-12-06","ids":{"openalex":"https://openalex.org/W7148324329","doi":"https://doi.org/10.1109/asru65441.2025.11434635"},"language":null,"primary_location":{"id":"doi:10.1109/asru65441.2025.11434635","is_oa":false,"landing_page_url":"https://doi.org/10.1109/asru65441.2025.11434635","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5132793474","display_name":"Sung-Lin Yeh","orcid":null},"institutions":[{"id":"https://openalex.org/I98677209","display_name":"University of Edinburgh","ror":"https://ror.org/01nrxwf90","country_code":"GB","type":"education","lineage":["https://openalex.org/I98677209"]}],"countries":["GB"],"is_corresponding":true,"raw_author_name":"Sung-Lin Yeh","raw_affiliation_strings":["University of Edinburgh,Centre for Speech Technology Research,UK"],"affiliations":[{"raw_affiliation_string":"University of Edinburgh,Centre for Speech Technology Research,UK","institution_ids":["https://openalex.org/I98677209"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132811583","display_name":"Yen Meng","orcid":null},"institutions":[{"id":"https://openalex.org/I98677209","display_name":"University of Edinburgh","ror":"https://ror.org/01nrxwf90","country_code":"GB","type":"education","lineage":["https://openalex.org/I98677209"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Yen Meng","raw_affiliation_strings":["University of Edinburgh,Centre for Speech Technology Research,UK"],"affiliations":[{"raw_affiliation_string":"University of Edinburgh,Centre for Speech Technology Research,UK","institution_ids":["https://openalex.org/I98677209"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5132789106","display_name":"Hao Tang","orcid":null},"institutions":[{"id":"https://openalex.org/I98677209","display_name":"University of Edinburgh","ror":"https://ror.org/01nrxwf90","country_code":"GB","type":"education","lineage":["https://openalex.org/I98677209"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Hao Tang","raw_affiliation_strings":["University of Edinburgh,Centre for Speech Technology Research,UK"],"affiliations":[{"raw_affiliation_string":"University of Edinburgh,Centre for Speech Technology Research,UK","institution_ids":["https://openalex.org/I98677209"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5132793474"],"corresponding_institution_ids":["https://openalex.org/I98677209"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.87553125,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"7"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.3391999900341034,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.3391999900341034,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.23389999568462372,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.1485999971628189,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/word","display_name":"Word (group theory)","score":0.8019999861717224},{"id":"https://openalex.org/keywords/timestamp","display_name":"Timestamp","score":0.5472000241279602},{"id":"https://openalex.org/keywords/code","display_name":"Code (set theory)","score":0.4925999939441681},{"id":"https://openalex.org/keywords/forcing","display_name":"Forcing (mathematics)","score":0.41510000824928284},{"id":"https://openalex.org/keywords/training-set","display_name":"Training set","score":0.34940001368522644}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.840399980545044},{"id":"https://openalex.org/C90805587","wikidata":"https://www.wikidata.org/wiki/Q10944557","display_name":"Word (group theory)","level":2,"score":0.8019999861717224},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.5999000072479248},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5644999742507935},{"id":"https://openalex.org/C113954288","wikidata":"https://www.wikidata.org/wiki/Q186885","display_name":"Timestamp","level":2,"score":0.5472000241279602},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.4925999939441681},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.42329999804496765},{"id":"https://openalex.org/C197115733","wikidata":"https://www.wikidata.org/wiki/Q1003136","display_name":"Forcing (mathematics)","level":2,"score":0.41510000824928284},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.34940001368522644},{"id":"https://openalex.org/C2779662365","wikidata":"https://www.wikidata.org/wiki/Q5416694","display_name":"Event (particle physics)","level":2,"score":0.3206000030040741},{"id":"https://openalex.org/C43126263","wikidata":"https://www.wikidata.org/wiki/Q128751","display_name":"Source code","level":2,"score":0.27480000257492065},{"id":"https://openalex.org/C2983335612","wikidata":"https://www.wikidata.org/wiki/Q54277","display_name":"Word processing","level":2,"score":0.26339998841285706}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/asru65441.2025.11434635","is_oa":false,"landing_page_url":"https://doi.org/10.1109/asru65441.2025.11434635","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education","score":0.7415545582771301}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":32,"referenced_works":["https://openalex.org/W1494198834","https://openalex.org/W1635512741","https://openalex.org/W2153773386","https://openalex.org/W2294901625","https://openalex.org/W2327501763","https://openalex.org/W2577366047","https://openalex.org/W2747874407","https://openalex.org/W2748816379","https://openalex.org/W2808682925","https://openalex.org/W2955019563","https://openalex.org/W2962824709","https://openalex.org/W2963979492","https://openalex.org/W2970726176","https://openalex.org/W3016185664","https://openalex.org/W3043783436","https://openalex.org/W3096017728","https://openalex.org/W3142589538","https://openalex.org/W3161049737","https://openalex.org/W3181776258","https://openalex.org/W4206075291","https://openalex.org/W4385571111","https://openalex.org/W4385822293","https://openalex.org/W4385823471","https://openalex.org/W4392902865","https://openalex.org/W4402111558","https://openalex.org/W4402112406","https://openalex.org/W4402115913","https://openalex.org/W4402115972","https://openalex.org/W4403674676","https://openalex.org/W4406461319","https://openalex.org/W4408353883","https://openalex.org/W4411113454"],"related_works":[],"abstract_inverted_index":{"There":[0],"is":[1,34,145],"an":[2,91],"increasing":[3],"interest":[4],"in":[5,15,31,54],"obtaining":[6],"accurate":[7,58,80,124],"word-level":[8],"timestamps":[9],"from":[10,65],"strong":[11],"automatic":[12],"speech":[13],"recognizers,":[14],"particular":[16],"Whisper.":[17],"Existing":[18],"approaches":[19],"either":[20],"require":[21,114],"additional":[22],"training":[23,115],"or":[24],"are":[25,62,122],"simply":[26],"not":[27,110,113],"competitive.":[28],"The":[29],"evaluation":[30],"prior":[32,126],"work":[33,127],"also":[35,117],"relatively":[36],"loose,":[37],"typically":[38],"using":[39,74,83],"a":[40,129],"tolerance":[41,131],"of":[42],"more":[43,79,123],"than":[44,82,125],"200":[45],"ms.":[46],"In":[47],"this":[48],"work,":[49],"we":[50,71,89],"discover":[51],"attention":[52,100],"heads":[53,101],"Whisper":[55,105],"that":[56,67,73,121],"capture":[57],"word":[59,96,119],"alignments":[60,81,97,120],"and":[61,78,135],"distinctively":[63],"different":[64],"those":[66],"do":[68],"not.":[69],"Moreover,":[70],"find":[72],"characters":[75],"produces":[76,118],"finer":[77],"wordpieces.":[84],"Based":[85],"on":[86],"these":[87],"findings,":[88],"propose":[90],"unsupervised":[92],"approach":[93,109],"to":[94],"extracting":[95],"by":[98],"filtering":[99],"while":[102],"teacher":[103],"forcing":[104],"with":[106],"characters.":[107],"Our":[108],"only":[111],"does":[112],"but":[116],"under":[128],"stricter":[130],"between":[132],"20":[133],"ms":[134],"$100":[136],"\\mathrm{~ms}$.":[137],"<sup":[138],"xmlns:mml=\"http://www.w3.org/1998/Math/MathML\"":[139,141],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">1</sup><sup":[140],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">1</sup>The":[142],"source":[143],"code":[144],"available":[146],"at":[147],"https://github.com/30stomercury/whisper-char-alignment":[148]},"counts_by_year":[],"updated_date":"2026-04-03T16:44:17.987007","created_date":"2026-04-03T00:00:00"}
