{"id":"https://openalex.org/W4392903583","doi":"https://doi.org/10.1109/icassp48485.2024.10446893","title":"Generative Context-Aware Fine-Tuning of Self-Supervised Speech Models","display_name":"Generative Context-Aware Fine-Tuning of Self-Supervised Speech Models","publication_year":2024,"publication_date":"2024-03-18","ids":{"openalex":"https://openalex.org/W4392903583","doi":"https://doi.org/10.1109/icassp48485.2024.10446893"},"language":"en","primary_location":{"id":"doi:10.1109/icassp48485.2024.10446893","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp48485.2024.10446893","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5020263683","display_name":"Suwon Shon","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Suwon Shon","raw_affiliation_strings":["ASAPP"],"affiliations":[{"raw_affiliation_string":"ASAPP","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5052377938","display_name":"Kwangyoun Kim","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kwangyoun Kim","raw_affiliation_strings":["ASAPP"],"affiliations":[{"raw_affiliation_string":"ASAPP","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5006917911","display_name":"Prashant Sridhar","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Prashant Sridhar","raw_affiliation_strings":["ASAPP"],"affiliations":[{"raw_affiliation_string":"ASAPP","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5111487406","display_name":"Yi\u2010Te Hsu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yi-Te Hsu","raw_affiliation_strings":["ASAPP"],"affiliations":[{"raw_affiliation_string":"ASAPP","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5001291873","display_name":"Shinji Watanabe","orcid":"https://orcid.org/0000-0002-5970-8631"},"institutions":[{"id":"https://openalex.org/I74973139","display_name":"Carnegie Mellon University","ror":"https://ror.org/05x2bcf33","country_code":"US","type":"education","lineage":["https://openalex.org/I74973139"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Shinji Watanabe","raw_affiliation_strings":["Carnegie Mellon University"],"affiliations":[{"raw_affiliation_string":"Carnegie Mellon University","institution_ids":["https://openalex.org/I74973139"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5015602781","display_name":"Karen Livescu","orcid":"https://orcid.org/0000-0003-4962-946X"},"institutions":[{"id":"https://openalex.org/I160992636","display_name":"Toyota Technological Institute at Chicago","ror":"https://ror.org/02sn5gb64","country_code":"US","type":"education","lineage":["https://openalex.org/I160992636"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Karen Livescu","raw_affiliation_strings":["Toyota Technological Institute at Chicago"],"affiliations":[{"raw_affiliation_string":"Toyota Technological Institute at Chicago","institution_ids":["https://openalex.org/I160992636"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5020263683"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.7274,"has_fulltext":false,"cited_by_count":2,"citation_normalized_percentile":{"value":0.73084357,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":95,"max":96},"biblio":{"volume":null,"issue":null,"first_page":"11156","last_page":"11160"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9997000098228455,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9979000091552734,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8668519258499146},{"id":"https://openalex.org/keywords/utterance","display_name":"Utterance","score":0.7221976518630981},{"id":"https://openalex.org/keywords/generative-grammar","display_name":"Generative grammar","score":0.6591101288795471},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.6577177047729492},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.6382107734680176},{"id":"https://openalex.org/keywords/context","display_name":"Context (archaeology)","score":0.6309934258460999},{"id":"https://openalex.org/keywords/sentence","display_name":"Sentence","score":0.6140182018280029},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.6033987998962402},{"id":"https://openalex.org/keywords/generative-model","display_name":"Generative model","score":0.5459851026535034},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.5353591442108154},{"id":"https://openalex.org/keywords/fine-tuning","display_name":"Fine-tuning","score":0.5122896432876587},{"id":"https://openalex.org/keywords/context-model","display_name":"Context model","score":0.46080777049064636},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.4514561593532562},{"id":"https://openalex.org/keywords/machine-learning","display_name":"Machine learning","score":0.3220406770706177}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8668519258499146},{"id":"https://openalex.org/C2775852435","wikidata":"https://www.wikidata.org/wiki/Q258403","display_name":"Utterance","level":2,"score":0.7221976518630981},{"id":"https://openalex.org/C39890363","wikidata":"https://www.wikidata.org/wiki/Q36108","display_name":"Generative grammar","level":2,"score":0.6591101288795471},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.6577177047729492},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.6382107734680176},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.6309934258460999},{"id":"https://openalex.org/C2777530160","wikidata":"https://www.wikidata.org/wiki/Q41796","display_name":"Sentence","level":2,"score":0.6140182018280029},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6033987998962402},{"id":"https://openalex.org/C167966045","wikidata":"https://www.wikidata.org/wiki/Q5532625","display_name":"Generative model","level":3,"score":0.5459851026535034},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.5353591442108154},{"id":"https://openalex.org/C157524613","wikidata":"https://www.wikidata.org/wiki/Q2828883","display_name":"Fine-tuning","level":2,"score":0.5122896432876587},{"id":"https://openalex.org/C183322885","wikidata":"https://www.wikidata.org/wiki/Q17007702","display_name":"Context model","level":3,"score":0.46080777049064636},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.4514561593532562},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3220406770706177},{"id":"https://openalex.org/C86803240","wikidata":"https://www.wikidata.org/wiki/Q420","display_name":"Biology","level":0,"score":0.0},{"id":"https://openalex.org/C151730666","wikidata":"https://www.wikidata.org/wiki/Q7205","display_name":"Paleontology","level":1,"score":0.0},{"id":"https://openalex.org/C2781238097","wikidata":"https://www.wikidata.org/wiki/Q175026","display_name":"Object (grammar)","level":2,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icassp48485.2024.10446893","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp48485.2024.10446893","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":24,"referenced_works":["https://openalex.org/W1494198834","https://openalex.org/W2933138175","https://openalex.org/W2946764509","https://openalex.org/W2972430654","https://openalex.org/W2995181338","https://openalex.org/W3005910077","https://openalex.org/W3034999214","https://openalex.org/W3036601975","https://openalex.org/W3197898596","https://openalex.org/W3211686893","https://openalex.org/W4221144554","https://openalex.org/W4226103796","https://openalex.org/W4283835659","https://openalex.org/W4288089799","https://openalex.org/W4322718191","https://openalex.org/W4372260611","https://openalex.org/W4372349774","https://openalex.org/W4384918448","https://openalex.org/W4385822425","https://openalex.org/W6769627184","https://openalex.org/W6780218876","https://openalex.org/W6803557570","https://openalex.org/W6850625674","https://openalex.org/W6854866820"],"related_works":["https://openalex.org/W2529301793","https://openalex.org/W2384121599","https://openalex.org/W3023285645","https://openalex.org/W3037551068","https://openalex.org/W3023594376","https://openalex.org/W4287802662","https://openalex.org/W3096664139","https://openalex.org/W4385890381","https://openalex.org/W3165571652","https://openalex.org/W4309877123"],"abstract_inverted_index":{"When":[0],"performing":[1],"tasks":[2],"like":[3,67],"automatic":[4,153],"speech":[5,95,154],"recognition":[6],"or":[7,19,64,69,122],"spoken":[8],"language":[9,35],"understanding":[10],"for":[11,149],"a":[12,58,132,170,185],"given":[13],"utterance,":[14],"access":[15,116],"to":[16,86,100,111,117,123],"preceding":[17,50],"text":[18,66],"audio":[20],"provides":[21],"contextual":[22],"information":[23,47,81,90],"that":[24,40,165,175,191],"can":[25],"improve":[26],"performance.":[27],"Considering":[28],"the":[29,49,61,76,88,108,118,124,140,144,177,193],"recent":[30],"advances":[31],"in":[32],"generative":[33,102,166,186],"large":[34],"models":[36],"(LLM),":[37],"we":[38,74,98],"hypothesize":[39],"an":[41,84],"LLM":[42,55,125,194],"could":[43,56],"generate":[44,57],"useful":[45],"context":[46,80,136,171,187],"using":[48,143],"text.":[51],"With":[52],"appropriate":[53],"prompts,":[54],"prediction":[59],"of":[60,78,93],"next":[62],"sentence":[63],"abstractive":[65],"titles":[68],"topics.":[70],"In":[71],"this":[72],"paper,":[73],"study":[75],"use":[77],"LLM-generated":[79],"and":[82,146,159,181],"propose":[83],"approach":[85,106,142,174,190],"distill":[87],"generated":[89],"during":[91],"fine-tuning":[92,168,173,189],"self-supervised":[94],"models,":[96],"which":[97],"refer":[99],"as":[101],"context-aware":[103,167],"fine-tuning.":[104],"This":[105],"allows":[107],"fine-tuned":[109],"model":[110],"make":[112],"improved":[113],"predictions":[114],"without":[115],"true":[119],"surrounding":[120],"segments":[121],"at":[126,195],"inference":[127,196],"time,":[128],"while":[129],"requiring":[130],"only":[131],"very":[133],"small":[134],"additional":[135],"module.":[137],"We":[138],"evaluate":[139],"proposed":[141],"SLUE":[145],"Libri-light":[147],"benchmarks":[148],"several":[150],"downstream":[151],"tasks:":[152],"recognition,":[155,158],"named":[156],"entity":[157],"sentiment":[160],"analysis.":[161],"The":[162],"results":[163],"show":[164],"outperforms":[169],"injection":[172,188],"accesses":[176],"ground-truth":[178],"previous":[179],"text,":[180],"is":[182],"competitive":[183],"with":[184],"requires":[192],"time.":[197]},"counts_by_year":[{"year":2025,"cited_by_count":2}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
