{"id":"https://openalex.org/W4408353525","doi":"https://doi.org/10.1109/icassp49660.2025.10888940","title":"CJST: CTC Compressor based Joint Speech and Text Training for Decoder-Only ASR","display_name":"CJST: CTC Compressor based Joint Speech and Text Training for Decoder-Only ASR","publication_year":2025,"publication_date":"2025-03-12","ids":{"openalex":"https://openalex.org/W4408353525","doi":"https://doi.org/10.1109/icassp49660.2025.10888940"},"language":"en","primary_location":{"id":"doi:10.1109/icassp49660.2025.10888940","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp49660.2025.10888940","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2025 - 2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5026526428","display_name":"Wei Zhou","orcid":"https://orcid.org/0000-0003-3622-3970"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Wei Zhou","raw_affiliation_strings":["Meta AI"],"affiliations":[{"raw_affiliation_string":"Meta AI","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5113970008","display_name":"Junteng Jia","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Junteng Jia","raw_affiliation_strings":["Meta AI"],"affiliations":[{"raw_affiliation_string":"Meta AI","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5003052411","display_name":"Leda Sar\u0131","orcid":"https://orcid.org/0000-0002-3754-1156"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Leda Sari","raw_affiliation_strings":["Meta AI"],"affiliations":[{"raw_affiliation_string":"Meta AI","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5074237839","display_name":"Jay Mahadeokar","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jay Mahadeokar","raw_affiliation_strings":["Meta AI"],"affiliations":[{"raw_affiliation_string":"Meta AI","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5066166549","display_name":"Ozlem Kalinli","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ozlem Kalinli","raw_affiliation_strings":["Meta AI"],"affiliations":[{"raw_affiliation_string":"Meta AI","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5026526428"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":2.8599,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.89834698,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":91,"max":95},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"5"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9742000102996826,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9742000102996826,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7887424230575562},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.7303995490074158},{"id":"https://openalex.org/keywords/joint","display_name":"Joint (building)","score":0.6502040028572083},{"id":"https://openalex.org/keywords/decoding-methods","display_name":"Decoding methods","score":0.4242337942123413},{"id":"https://openalex.org/keywords/training","display_name":"Training (meteorology)","score":0.4213681221008301},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.3942517638206482},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.3602217435836792},{"id":"https://openalex.org/keywords/engineering","display_name":"Engineering","score":0.087455153465271},{"id":"https://openalex.org/keywords/telecommunications","display_name":"Telecommunications","score":0.05322301387786865}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7887424230575562},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.7303995490074158},{"id":"https://openalex.org/C18555067","wikidata":"https://www.wikidata.org/wiki/Q8375051","display_name":"Joint (building)","level":2,"score":0.6502040028572083},{"id":"https://openalex.org/C57273362","wikidata":"https://www.wikidata.org/wiki/Q576722","display_name":"Decoding methods","level":2,"score":0.4242337942123413},{"id":"https://openalex.org/C2777211547","wikidata":"https://www.wikidata.org/wiki/Q17141490","display_name":"Training (meteorology)","level":2,"score":0.4213681221008301},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3942517638206482},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3602217435836792},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.087455153465271},{"id":"https://openalex.org/C76155785","wikidata":"https://www.wikidata.org/wiki/Q418","display_name":"Telecommunications","level":1,"score":0.05322301387786865},{"id":"https://openalex.org/C153294291","wikidata":"https://www.wikidata.org/wiki/Q25261","display_name":"Meteorology","level":1,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C170154142","wikidata":"https://www.wikidata.org/wiki/Q150737","display_name":"Architectural engineering","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icassp49660.2025.10888940","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp49660.2025.10888940","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2025 - 2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":29,"referenced_works":["https://openalex.org/W1494198834","https://openalex.org/W2127141656","https://openalex.org/W2144499799","https://openalex.org/W2327501763","https://openalex.org/W2407080277","https://openalex.org/W2962826786","https://openalex.org/W2963250244","https://openalex.org/W3016234571","https://openalex.org/W3097777922","https://openalex.org/W3153583341","https://openalex.org/W3163560333","https://openalex.org/W4225308107","https://openalex.org/W4226120743","https://openalex.org/W4319862474","https://openalex.org/W4384918448","https://openalex.org/W4388979610","https://openalex.org/W4389524500","https://openalex.org/W4390041933","https://openalex.org/W4391021666","https://openalex.org/W4392903956","https://openalex.org/W4392910583","https://openalex.org/W4392979802","https://openalex.org/W4401042284","https://openalex.org/W4402111955","https://openalex.org/W6637373629","https://openalex.org/W6691770337","https://openalex.org/W6853998256","https://openalex.org/W6861618871","https://openalex.org/W6861626678"],"related_works":["https://openalex.org/W230091440","https://openalex.org/W2233261550","https://openalex.org/W2810751659","https://openalex.org/W258997015","https://openalex.org/W2997094352","https://openalex.org/W3216976533","https://openalex.org/W100620283","https://openalex.org/W2161474341","https://openalex.org/W2495260952","https://openalex.org/W4366179611"],"abstract_inverted_index":{"CTC":[0,30,63,73,118,144],"compressor":[1,31,145],"can":[2],"be":[3],"an":[4,90],"effective":[5,91],"approach":[6],"to":[7,11,101,142],"integrate":[8],"audio":[9],"encoders":[10],"decoder-only":[12,41,147],"models,":[13],"which":[14,136],"has":[15],"gained":[16],"growing":[17],"interest":[18],"for":[19,40,105,146],"different":[20],"speech":[21,34,45],"applications.":[22],"In":[23],"this":[24],"work,":[25],"we":[26],"propose":[27],"a":[28,54,114],"novel":[29],"based":[32],"joint":[33],"and":[35,46,58,72,81,108,127,132],"text":[36,47,92],"training":[37],"(CJST)":[38],"framework":[39],"ASR.":[42],"CJST":[43,88],"matches":[44],"modalities":[48],"from":[49],"both":[50,106,130],"directions":[51],"by":[52],"exploring":[53],"simple":[55],"modality":[56],"adaptor":[57],"several":[59],"features":[60],"of":[61,97],"the":[62,79,86,95,102,138],"compressor,":[64,119],"including":[65],"sequence":[66],"compression,":[67],"on-the-fly":[68],"forced":[69],"peaky":[70],"alignment":[71],"class":[74],"embeddings.":[75],"Experimental":[76],"results":[77],"on":[78,117],"Librispeech":[80],"TED-LIUM2":[82],"corpora":[83],"show":[84],"that":[85],"proposed":[87],"achieves":[89],"injection":[93],"without":[94],"need":[96],"duration":[98],"handling,":[99],"leading":[100],"best":[103],"performance":[104],"in-domain":[107],"cross-domain":[109],"scenarios.":[110],"We":[111],"also":[112],"provide":[113],"comprehensive":[115],"study":[116],"covering":[120],"various":[121],"compression":[122],"modes,":[123],"edge":[124],"case":[125],"handling":[126],"behavior":[128],"under":[129],"clean":[131],"noisy":[133],"data":[134],"conditions,":[135],"reveals":[137],"most":[139],"robust":[140],"setting":[141],"use":[143],"models.":[148]},"counts_by_year":[{"year":2025,"cited_by_count":1}],"updated_date":"2025-12-22T23:10:17.713674","created_date":"2025-10-10T00:00:00"}
