{"id":"https://openalex.org/W4401610816","doi":"https://doi.org/10.1109/icasspw62465.2024.10625802","title":"Integrating Self-Supervised Speech Model with Pseudo Word-Level Targets from Visually-Grounded Speech Model","display_name":"Integrating Self-Supervised Speech Model with Pseudo Word-Level Targets from Visually-Grounded Speech Model","publication_year":2024,"publication_date":"2024-04-14","ids":{"openalex":"https://openalex.org/W4401610816","doi":"https://doi.org/10.1109/icasspw62465.2024.10625802"},"language":"en","primary_location":{"id":"doi:10.1109/icasspw62465.2024.10625802","is_oa":false,"landing_page_url":"http://dx.doi.org/10.1109/icasspw62465.2024.10625802","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2024 IEEE International Conference on Acoustics, Speech, and Signal Processing Workshops (ICASSPW)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5030828636","display_name":"Hung-Chieh Fang","orcid":null},"institutions":[{"id":"https://openalex.org/I16733864","display_name":"National Taiwan University","ror":"https://ror.org/05bqach95","country_code":"TW","type":"education","lineage":["https://openalex.org/I16733864"]}],"countries":["TW"],"is_corresponding":false,"raw_author_name":"Hung-Chieh Fang","raw_affiliation_strings":["National Taiwan University,Taiwan"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"National Taiwan University,Taiwan","institution_ids":["https://openalex.org/I16733864"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5073229385","display_name":"Nai-Xuan Ye","orcid":null},"institutions":[{"id":"https://openalex.org/I16733864","display_name":"National Taiwan University","ror":"https://ror.org/05bqach95","country_code":"TW","type":"education","lineage":["https://openalex.org/I16733864"]}],"countries":["TW"],"is_corresponding":false,"raw_author_name":"Nai-Xuan Ye","raw_affiliation_strings":["National Taiwan University,Taiwan"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"National Taiwan University,Taiwan","institution_ids":["https://openalex.org/I16733864"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5053466746","display_name":"Yi-Jen Shih","orcid":"https://orcid.org/0000-0003-3481-3117"},"institutions":[{"id":"https://openalex.org/I16733864","display_name":"National Taiwan University","ror":"https://ror.org/05bqach95","country_code":"TW","type":"education","lineage":["https://openalex.org/I16733864"]}],"countries":["TW"],"is_corresponding":false,"raw_author_name":"Yi-Jen Shih","raw_affiliation_strings":["National Taiwan University,Taiwan"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"National Taiwan University,Taiwan","institution_ids":["https://openalex.org/I16733864"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5075735963","display_name":"Puyuan Peng","orcid":null},"institutions":[{"id":"https://openalex.org/I86519309","display_name":"The University of Texas at Austin","ror":"https://ror.org/00hj54h04","country_code":"US","type":"education","lineage":["https://openalex.org/I86519309"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Puyuan Peng","raw_affiliation_strings":["The University of Texas at Austin,USA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"The University of Texas at Austin,USA","institution_ids":["https://openalex.org/I86519309"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5110941853","display_name":"Hsuan-Fu Wang","orcid":null},"institutions":[{"id":"https://openalex.org/I16733864","display_name":"National Taiwan University","ror":"https://ror.org/05bqach95","country_code":"TW","type":"education","lineage":["https://openalex.org/I16733864"]}],"countries":["TW"],"is_corresponding":false,"raw_author_name":"Hsuan-Fu Wang","raw_affiliation_strings":["National Taiwan University,Taiwan"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"National Taiwan University,Taiwan","institution_ids":["https://openalex.org/I16733864"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5084236961","display_name":"Layne Berry","orcid":null},"institutions":[{"id":"https://openalex.org/I86519309","display_name":"The University of Texas at Austin","ror":"https://ror.org/00hj54h04","country_code":"US","type":"education","lineage":["https://openalex.org/I86519309"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Layne Berry","raw_affiliation_strings":["The University of Texas at Austin,USA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"The University of Texas at Austin,USA","institution_ids":["https://openalex.org/I86519309"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5040508737","display_name":"Hung-yi Lee","orcid":"https://orcid.org/0000-0002-9654-5747"},"institutions":[{"id":"https://openalex.org/I16733864","display_name":"National Taiwan University","ror":"https://ror.org/05bqach95","country_code":"TW","type":"education","lineage":["https://openalex.org/I16733864"]}],"countries":["TW"],"is_corresponding":false,"raw_author_name":"Hung-Yi Lee","raw_affiliation_strings":["National Taiwan University,Taiwan"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"National Taiwan University,Taiwan","institution_ids":["https://openalex.org/I16733864"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5004717608","display_name":"David Harwath","orcid":"https://orcid.org/0000-0003-0206-0253"},"institutions":[{"id":"https://openalex.org/I86519309","display_name":"The University of Texas at Austin","ror":"https://ror.org/00hj54h04","country_code":"US","type":"education","lineage":["https://openalex.org/I86519309"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"David Harwath","raw_affiliation_strings":["The University of Texas at Austin,USA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"The University of Texas at Austin,USA","institution_ids":["https://openalex.org/I86519309"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":8,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.3055,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.63639279,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":90,"max":94},"biblio":{"volume":"33","issue":null,"first_page":"645","last_page":"649"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11550","display_name":"Text and Document Classification Technologies","score":0.289000004529953,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11550","display_name":"Text and Document Classification Technologies","score":0.289000004529953,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12031","display_name":"Speech and dialogue systems","score":0.2870999872684479,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.2727000117301941,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7346309423446655},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.6115516424179077},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.5401178598403931},{"id":"https://openalex.org/keywords/word","display_name":"Word (group theory)","score":0.5230401754379272},{"id":"https://openalex.org/keywords/speech-synthesis","display_name":"Speech synthesis","score":0.4247225522994995},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4231085777282715},{"id":"https://openalex.org/keywords/linguistics","display_name":"Linguistics","score":0.23118510842323303}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7346309423446655},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6115516424179077},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.5401178598403931},{"id":"https://openalex.org/C90805587","wikidata":"https://www.wikidata.org/wiki/Q10944557","display_name":"Word (group theory)","level":2,"score":0.5230401754379272},{"id":"https://openalex.org/C14999030","wikidata":"https://www.wikidata.org/wiki/Q16346","display_name":"Speech synthesis","level":2,"score":0.4247225522994995},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4231085777282715},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.23118510842323303},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icasspw62465.2024.10625802","is_oa":false,"landing_page_url":"http://dx.doi.org/10.1109/icasspw62465.2024.10625802","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2024 IEEE International Conference on Acoustics, Speech, and Signal Processing Workshops (ICASSPW)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":29,"referenced_works":["https://openalex.org/W569478347","https://openalex.org/W2927673779","https://openalex.org/W2972892814","https://openalex.org/W3036601975","https://openalex.org/W3157861865","https://openalex.org/W3174311593","https://openalex.org/W3196698946","https://openalex.org/W3197580070","https://openalex.org/W3200287550","https://openalex.org/W3209059054","https://openalex.org/W4224875474","https://openalex.org/W4226103796","https://openalex.org/W4226380987","https://openalex.org/W4281492411","https://openalex.org/W4287591426","https://openalex.org/W4297683418","https://openalex.org/W4303649106","https://openalex.org/W4319862278","https://openalex.org/W4319862477","https://openalex.org/W4372267276","https://openalex.org/W4385571440","https://openalex.org/W4385823059","https://openalex.org/W4385823277","https://openalex.org/W4385823492","https://openalex.org/W6751425476","https://openalex.org/W6780218876","https://openalex.org/W6786696081","https://openalex.org/W6810168380","https://openalex.org/W6845951457"],"related_works":["https://openalex.org/W1569283511","https://openalex.org/W2216757598","https://openalex.org/W4387496629","https://openalex.org/W4236193183","https://openalex.org/W3013209356","https://openalex.org/W2053866214","https://openalex.org/W2607505004","https://openalex.org/W3204019825","https://openalex.org/W2296205523","https://openalex.org/W127416991"],"abstract_inverted_index":{"Recent":[0],"advances":[1],"in":[2,10,27,50,108],"self-supervised":[3],"speech":[4,82],"models":[5,16],"have":[6],"shown":[7],"significant":[8],"improvement":[9],"many":[11],"downstream":[12],"tasks.":[13],"However,":[14],"these":[15],"predominantly":[17],"centered":[18],"on":[19,40,95],"frame-level":[20],"training":[21,72],"objectives,":[22],"which":[23,47],"can":[24],"fall":[25],"short":[26],"spoken":[28,97],"language":[29,98],"understanding":[30,99],"tasks":[31],"that":[32,65],"require":[33],"semantic":[34,110],"comprehension.":[35],"Existing":[36],"works":[37],"often":[38],"rely":[39],"additional":[41],"speech-text":[42,89],"data":[43],"as":[44],"intermediate":[45],"targets,":[46],"is":[48],"costly":[49],"the":[51,71,75,86,103],"real-world":[52],"setting.":[53],"To":[54],"address":[55],"this":[56],"challenge,":[57],"we":[58],"propose":[59],"Pseudo-Word":[60],"HuBERT":[61],"(PW-HuBERT),":[62],"a":[63,80],"framework":[64],"integrates":[66],"pseudo":[67],"word-level":[68],"targets":[69,76],"into":[70],"process,":[73],"where":[74],"are":[77],"derived":[78],"from":[79],"visually-ground":[81],"model,":[83],"notably":[84],"eliminating":[85],"need":[87],"for":[88],"paired":[90],"data.":[91],"Our":[92],"experimental":[93],"results":[94],"four":[96],"(SLU)":[100],"benchmarks":[101],"suggest":[102],"superiority":[104],"of":[105],"our":[106],"model":[107],"capturing":[109],"information.":[111]},"counts_by_year":[{"year":2024,"cited_by_count":1}],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-10T00:00:00"}
