{"id":"https://openalex.org/W4401597893","doi":"https://doi.org/10.1109/icasspw62465.2024.10625827","title":"SpeechCLIP+: Self-Supervised Multi-Task Representation Learning for Speech Via Clip and Speech-Image Data","display_name":"SpeechCLIP+: Self-Supervised Multi-Task Representation Learning for Speech Via Clip and Speech-Image Data","publication_year":2024,"publication_date":"2024-04-14","ids":{"openalex":"https://openalex.org/W4401597893","doi":"https://doi.org/10.1109/icasspw62465.2024.10625827"},"language":"en","primary_location":{"id":"doi:10.1109/icasspw62465.2024.10625827","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icasspw62465.2024.10625827","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2024 IEEE International Conference on Acoustics, Speech, and Signal Processing Workshops (ICASSPW)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5110941853","display_name":"Hsuan-Fu Wang","orcid":null},"institutions":[{"id":"https://openalex.org/I16733864","display_name":"National Taiwan University","ror":"https://ror.org/05bqach95","country_code":"TW","type":"education","lineage":["https://openalex.org/I16733864"]},{"id":"https://openalex.org/I4210098366","display_name":"Institute of Information Science, Academia Sinica","ror":"https://ror.org/00z83z196","country_code":"TW","type":"facility","lineage":["https://openalex.org/I4210098366","https://openalex.org/I84653119"]}],"countries":["TW"],"is_corresponding":true,"raw_author_name":"Hsuan-Fu Wang","raw_affiliation_strings":["Academia Sinica,Institute of Information Science,Taiwan","National Taiwan University,Taiwan"],"affiliations":[{"raw_affiliation_string":"Academia Sinica,Institute of Information Science,Taiwan","institution_ids":["https://openalex.org/I4210098366"]},{"raw_affiliation_string":"National Taiwan University,Taiwan","institution_ids":["https://openalex.org/I16733864"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5053466746","display_name":"Yi-Jen Shih","orcid":"https://orcid.org/0000-0003-3481-3117"},"institutions":[{"id":"https://openalex.org/I86519309","display_name":"The University of Texas at Austin","ror":"https://ror.org/00hj54h04","country_code":"US","type":"education","lineage":["https://openalex.org/I86519309"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Yi-Jen Shih","raw_affiliation_strings":["The University of Texas at Austin,USA"],"affiliations":[{"raw_affiliation_string":"The University of Texas at Austin,USA","institution_ids":["https://openalex.org/I86519309"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5078976109","display_name":"Heng-Jui Chang","orcid":"https://orcid.org/0000-0002-1690-2610"},"institutions":[{"id":"https://openalex.org/I63966007","display_name":"Massachusetts Institute of Technology","ror":"https://ror.org/042nb2s44","country_code":"US","type":"education","lineage":["https://openalex.org/I63966007"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Heng-Jui Chang","raw_affiliation_strings":["Massachusetts Institute of Technology,USA"],"affiliations":[{"raw_affiliation_string":"Massachusetts Institute of Technology,USA","institution_ids":["https://openalex.org/I63966007"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5084236961","display_name":"Layne Berry","orcid":null},"institutions":[{"id":"https://openalex.org/I86519309","display_name":"The University of Texas at Austin","ror":"https://ror.org/00hj54h04","country_code":"US","type":"education","lineage":["https://openalex.org/I86519309"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Layne Berry","raw_affiliation_strings":["The University of Texas at Austin,USA"],"affiliations":[{"raw_affiliation_string":"The University of Texas at Austin,USA","institution_ids":["https://openalex.org/I86519309"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5075735963","display_name":"Puyuan Peng","orcid":null},"institutions":[{"id":"https://openalex.org/I86519309","display_name":"The University of Texas at Austin","ror":"https://ror.org/00hj54h04","country_code":"US","type":"education","lineage":["https://openalex.org/I86519309"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Puyuan Peng","raw_affiliation_strings":["The University of Texas at Austin,USA"],"affiliations":[{"raw_affiliation_string":"The University of Texas at Austin,USA","institution_ids":["https://openalex.org/I86519309"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5040508737","display_name":"Hung-yi Lee","orcid":"https://orcid.org/0000-0002-9654-5747"},"institutions":[{"id":"https://openalex.org/I16733864","display_name":"National Taiwan University","ror":"https://ror.org/05bqach95","country_code":"TW","type":"education","lineage":["https://openalex.org/I16733864"]}],"countries":["TW"],"is_corresponding":false,"raw_author_name":"Hung-Yi Lee","raw_affiliation_strings":["National Taiwan University,Taiwan"],"affiliations":[{"raw_affiliation_string":"National Taiwan University,Taiwan","institution_ids":["https://openalex.org/I16733864"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5071214181","display_name":"Hsin\u2010Min Wang","orcid":"https://orcid.org/0000-0003-3599-5071"},"institutions":[{"id":"https://openalex.org/I16733864","display_name":"National Taiwan University","ror":"https://ror.org/05bqach95","country_code":"TW","type":"education","lineage":["https://openalex.org/I16733864"]},{"id":"https://openalex.org/I4210098366","display_name":"Institute of Information Science, Academia Sinica","ror":"https://ror.org/00z83z196","country_code":"TW","type":"facility","lineage":["https://openalex.org/I4210098366","https://openalex.org/I84653119"]}],"countries":["TW"],"is_corresponding":false,"raw_author_name":"Hsin-Min Wang","raw_affiliation_strings":["Academia Sinica,Institute of Information Science,Taiwan","National Taiwan University,Taiwan"],"affiliations":[{"raw_affiliation_string":"Academia Sinica,Institute of Information Science,Taiwan","institution_ids":["https://openalex.org/I4210098366"]},{"raw_affiliation_string":"National Taiwan University,Taiwan","institution_ids":["https://openalex.org/I16733864"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5004717608","display_name":"David Harwath","orcid":"https://orcid.org/0000-0003-0206-0253"},"institutions":[{"id":"https://openalex.org/I86519309","display_name":"The University of Texas at Austin","ror":"https://ror.org/00hj54h04","country_code":"US","type":"education","lineage":["https://openalex.org/I86519309"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"David Harwath","raw_affiliation_strings":["The University of Texas at Austin,USA"],"affiliations":[{"raw_affiliation_string":"The University of Texas at Austin,USA","institution_ids":["https://openalex.org/I86519309"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":8,"corresponding_author_ids":["https://openalex.org/A5110941853"],"corresponding_institution_ids":["https://openalex.org/I16733864","https://openalex.org/I4210098366"],"apc_list":null,"apc_paid":null,"fwci":1.0878,"has_fulltext":false,"cited_by_count":3,"citation_normalized_percentile":{"value":0.80961443,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":96,"max":97},"biblio":{"volume":null,"issue":null,"first_page":"465","last_page":"469"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12031","display_name":"Speech and dialogue systems","score":0.9807000160217285,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12031","display_name":"Speech and dialogue systems","score":0.9807000160217285,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9761999845504761,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9053999781608582,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8240524530410767},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.6597967743873596},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.6422885060310364},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.6038458347320557},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.517026960849762},{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.46796613931655884}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8240524530410767},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.6597967743873596},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6422885060310364},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6038458347320557},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.517026960849762},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.46796613931655884},{"id":"https://openalex.org/C199539241","wikidata":"https://www.wikidata.org/wiki/Q7748","display_name":"Law","level":1,"score":0.0},{"id":"https://openalex.org/C162324750","wikidata":"https://www.wikidata.org/wiki/Q8134","display_name":"Economics","level":0,"score":0.0},{"id":"https://openalex.org/C187736073","wikidata":"https://www.wikidata.org/wiki/Q2920921","display_name":"Management","level":1,"score":0.0},{"id":"https://openalex.org/C17744445","wikidata":"https://www.wikidata.org/wiki/Q36442","display_name":"Political science","level":0,"score":0.0},{"id":"https://openalex.org/C94625758","wikidata":"https://www.wikidata.org/wiki/Q7163","display_name":"Politics","level":2,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icasspw62465.2024.10625827","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icasspw62465.2024.10625827","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2024 IEEE International Conference on Acoustics, Speech, and Signal Processing Workshops (ICASSPW)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":31,"referenced_works":["https://openalex.org/W1905882502","https://openalex.org/W2586850765","https://openalex.org/W2752796333","https://openalex.org/W2963799213","https://openalex.org/W2964001192","https://openalex.org/W2972943112","https://openalex.org/W2979476256","https://openalex.org/W3016167541","https://openalex.org/W3036601975","https://openalex.org/W3166396011","https://openalex.org/W3197580070","https://openalex.org/W3200287550","https://openalex.org/W3203140070","https://openalex.org/W3209059054","https://openalex.org/W3209984917","https://openalex.org/W4221145109","https://openalex.org/W4221161768","https://openalex.org/W4224875474","https://openalex.org/W4281492411","https://openalex.org/W4283712703","https://openalex.org/W4319862278","https://openalex.org/W4319862404","https://openalex.org/W4319862477","https://openalex.org/W4372267276","https://openalex.org/W6677994088","https://openalex.org/W6769196770","https://openalex.org/W6780218876","https://openalex.org/W6791353385","https://openalex.org/W6809593508","https://openalex.org/W6810007534","https://openalex.org/W6811013733"],"related_works":["https://openalex.org/W2062195135","https://openalex.org/W2795079307","https://openalex.org/W2793058541","https://openalex.org/W1983629434","https://openalex.org/W2055929693","https://openalex.org/W4324271173","https://openalex.org/W1967645776","https://openalex.org/W2352227742","https://openalex.org/W4390679071","https://openalex.org/W3204019825"],"abstract_inverted_index":{"The":[0,88],"recently":[1],"proposed":[2],"visually":[3],"grounded":[4],"speech":[5,14,94],"model":[6,102,108],"SpeechCLIP":[7,71,101,107],"is":[8,80],"an":[9],"innovative":[10],"framework":[11],"that":[12,63,91],"bridges":[13],"and":[15,67,85],"text":[16,24],"through":[17,117],"images":[18],"via":[19],"CLIP":[20],"without":[21],"relying":[22],"on":[23,82],"transcription.":[25],"On":[26],"this":[27,29],"basis,":[28],"paper":[30],"introduces":[31],"two":[32],"extensions":[33],"to":[34,44],"SpeechCLIP.":[35],"First,":[36],"we":[37,57],"apply":[38],"the":[39,53,65,83,93,98,104,125,128],"Continuous":[40],"Integrate-and-Fire":[41],"(CIF)":[42],"module":[43],"replace":[45],"a":[46,59,73,110],"fixed":[47,111],"number":[48,112],"of":[49,70,113,127],"CLS":[50,114],"tokens":[51],"in":[52,92,131],"cascaded":[54,66,100,106,121],"architecture.":[55],"Second,":[56],"propose":[58],"new":[60],"hybrid":[61,119],"architecture":[62],"merges":[64],"parallel":[68,129],"architectures":[69],"into":[72],"multi-task":[74],"learning":[75,123],"framework.":[76],"Our":[77],"experimental":[78],"evaluation":[79],"performed":[81],"Flickr8k":[84],"SpokenCOCO":[86],"datasets.":[87],"results":[89],"show":[90],"keyword":[95],"extraction":[96],"task,":[97],"CIF-based":[99],"outperforms":[103],"previous":[105],"using":[109],"tokens.":[115],"Furthermore,":[116],"our":[118],"architecture,":[120],"task":[122],"boosts":[124],"performance":[126],"branch":[130],"image-speech":[132],"retrieval":[133],"tasks.":[134]},"counts_by_year":[{"year":2025,"cited_by_count":3}],"updated_date":"2025-12-26T23:08:49.675405","created_date":"2025-10-10T00:00:00"}
