{"id":"https://openalex.org/W4408345661","doi":"https://doi.org/10.1109/icassp49660.2025.10888640","title":"ChunkFormer: Masked Chunking Conformer For Long-Form Speech Transcription","display_name":"ChunkFormer: Masked Chunking Conformer For Long-Form Speech Transcription","publication_year":2025,"publication_date":"2025-03-12","ids":{"openalex":"https://openalex.org/W4408345661","doi":"https://doi.org/10.1109/icassp49660.2025.10888640"},"language":"en","primary_location":{"id":"doi:10.1109/icassp49660.2025.10888640","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp49660.2025.10888640","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2025 - 2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5075095753","display_name":"Khanh Le","orcid":"https://orcid.org/0000-0001-6921-5999"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Khanh Le","raw_affiliation_strings":["ZaloAI,Vietnam"],"affiliations":[{"raw_affiliation_string":"ZaloAI,Vietnam","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5083304186","display_name":"Tuan Vu Ho","orcid":"https://orcid.org/0000-0001-6819-0443"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tuan Vu Ho","raw_affiliation_strings":["ZaloAI,Vietnam"],"affiliations":[{"raw_affiliation_string":"ZaloAI,Vietnam","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101507065","display_name":"Dung Tran","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Dung Tran","raw_affiliation_strings":["Independent Researcher"],"affiliations":[{"raw_affiliation_string":"Independent Researcher","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5051044805","display_name":"Duc Thanh Chau","orcid":null},"institutions":[{"id":"https://openalex.org/I23582244","display_name":"Ho Chi Minh City University of Science","ror":"https://ror.org/05jfbgm49","country_code":"VN","type":"education","lineage":["https://openalex.org/I123565023","https://openalex.org/I23582244"]}],"countries":["VN"],"is_corresponding":false,"raw_author_name":"Duc Thanh Chau","raw_affiliation_strings":["Ho Chi Minh City University of Science"],"affiliations":[{"raw_affiliation_string":"Ho Chi Minh City University of Science","institution_ids":["https://openalex.org/I23582244"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5075095753"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":2.8414,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.89762241,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":91,"max":95},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"5"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9743000268936157,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9743000268936157,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9103000164031982,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/chunking","display_name":"Chunking (psychology)","score":0.7909765839576721},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6687970161437988},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.5909108519554138},{"id":"https://openalex.org/keywords/transcription","display_name":"Transcription (linguistics)","score":0.5469749569892883},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.43243715167045593},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4157545566558838},{"id":"https://openalex.org/keywords/linguistics","display_name":"Linguistics","score":0.10081356763839722}],"concepts":[{"id":"https://openalex.org/C203357204","wikidata":"https://www.wikidata.org/wiki/Q1089605","display_name":"Chunking (psychology)","level":2,"score":0.7909765839576721},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6687970161437988},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5909108519554138},{"id":"https://openalex.org/C179926584","wikidata":"https://www.wikidata.org/wiki/Q207714","display_name":"Transcription (linguistics)","level":2,"score":0.5469749569892883},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.43243715167045593},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4157545566558838},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.10081356763839722},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icassp49660.2025.10888640","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp49660.2025.10888640","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2025 - 2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":17,"referenced_works":["https://openalex.org/W2964110616","https://openalex.org/W3095173472","https://openalex.org/W3097777922","https://openalex.org/W3161873870","https://openalex.org/W3197478142","https://openalex.org/W3197813307","https://openalex.org/W4221167707","https://openalex.org/W4372346432","https://openalex.org/W4375869390","https://openalex.org/W4385245566","https://openalex.org/W4385822293","https://openalex.org/W4391021542","https://openalex.org/W4392903330","https://openalex.org/W4402112207","https://openalex.org/W6776048684","https://openalex.org/W6781533629","https://openalex.org/W6838276489"],"related_works":["https://openalex.org/W2384729545","https://openalex.org/W2198395236","https://openalex.org/W2800417007","https://openalex.org/W4245487161","https://openalex.org/W2090755435","https://openalex.org/W2039036070","https://openalex.org/W2153813398","https://openalex.org/W3204019825","https://openalex.org/W2028097510","https://openalex.org/W2505877856"],"abstract_inverted_index":{"Deploying":[0],"ASR":[1,71,166],"models":[2,173],"at":[3],"an":[4,41,69,97],"industrial":[5],"scale":[6],"poses":[7],"significant":[8],"challenges":[9],"in":[10,138,155,175],"hardware":[11],"resource":[12,58],"management,":[13],"especially":[14],"for":[15,23,136,161,172],"long-form":[16,110],"transcription":[17,111],"tasks":[18,128],"where":[19],"audio":[20,39,83,95],"may":[21],"last":[22],"hours.":[24],"Large":[25],"Conformer":[26],"models,":[27],"despite":[28],"their":[29],"capabilities,":[30],"are":[31],"limited":[32],"to":[33,54,91,115,130],"processing":[34,76],"only":[35],"15":[36],"minutes":[37],"of":[38,94,165],"on":[40,85,96,119,126],"80GB":[42,98],"GPU.":[43],"Furthermore,":[44],"variable":[45],"input":[46],"lengths":[47],"worsen":[48],"inefficiencies,":[49],"as":[50],"standard":[51,139],"batching":[52,143],"leads":[53],"excessive":[55],"padding,":[56],"increasing":[57],"consumption":[59],"and":[60,123,148],"execution":[61,146],"time.":[62],"To":[63],"address":[64],"this,":[65],"we":[66],"introduce":[67],"ChunkFormer,":[68],"efficient":[70],"model":[72],"that":[73],"uses":[74],"chunk-wise":[75],"with":[77,113],"relative":[78],"right":[79],"context,":[80],"enabling":[81],"long":[82],"transcriptions":[84],"low-memory":[86],"GPUs.":[87],"ChunkFormer":[88],"handles":[89],"up":[90,114],"16":[92],"hours":[93],"GPU,":[99],"1.5x":[100],"longer":[101],"than":[102,153],"the":[103,134],"current":[104],"state-of-the-art":[105],"FastConformer,":[106],"while":[107],"also":[108],"boosting":[109],"performance":[112],"7.7%":[116],"absolute":[117],"reduction":[118],"word":[120],"error":[121],"rate":[122],"maintaining":[124],"accuracy":[125],"shorter":[127],"compared":[129],"Conformer.":[131],"By":[132],"eliminating":[133],"need":[135],"padding":[137],"batching,":[140],"ChunkFormer\u2019s":[141],"masked":[142],"technique":[144],"reduces":[145],"time":[147],"memory":[149],"usage":[150],"by":[151],"more":[152],"3x":[154],"batch":[156],"processing,":[157],"substantially":[158],"reducing":[159],"costs":[160],"a":[162],"wide":[163],"range":[164],"systems,":[167],"particularly":[168],"regarding":[169],"GPU":[170],"resources":[171],"serving":[174],"real-world":[176],"applications.":[177]},"counts_by_year":[{"year":2025,"cited_by_count":1}],"updated_date":"2025-12-23T23:11:35.936235","created_date":"2025-10-10T00:00:00"}
