{"id":"https://openalex.org/W4392909624","doi":"https://doi.org/10.1109/icassp48485.2024.10447797","title":"Learning Speech Representation from Contrastive Token-Acoustic Pretraining","display_name":"Learning Speech Representation from Contrastive Token-Acoustic Pretraining","publication_year":2024,"publication_date":"2024-03-18","ids":{"openalex":"https://openalex.org/W4392909624","doi":"https://doi.org/10.1109/icassp48485.2024.10447797"},"language":"en","primary_location":{"id":"doi:10.1109/icassp48485.2024.10447797","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp48485.2024.10447797","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5028353824","display_name":"Chunyu Qiang","orcid":"https://orcid.org/0009-0007-2290-3074"},"institutions":[{"id":"https://openalex.org/I162868743","display_name":"Tianjin University","ror":"https://ror.org/012tb2g32","country_code":"CN","type":"education","lineage":["https://openalex.org/I162868743"]},{"id":"https://openalex.org/I4401726859","display_name":"Kuaishou (China)","ror":"https://ror.org/0258as409","country_code":null,"type":"company","lineage":["https://openalex.org/I4401726859"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Chunyu Qiang","raw_affiliation_strings":["Tianjin University,School of New Media and Communication,Tianjin,China","Tianjin Key Laboratory of Cognitive Computing and Application, College of Intelligence and Computing, Tianjin University, Tianjin, China","Kuaishou Technology Co., Ltd, Beijing, China","School of New Media and Communication, Tianjin University, Tianjin, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Tianjin University,School of New Media and Communication,Tianjin,China","institution_ids":["https://openalex.org/I162868743"]},{"raw_affiliation_string":"Tianjin Key Laboratory of Cognitive Computing and Application, College of Intelligence and Computing, Tianjin University, Tianjin, China","institution_ids":["https://openalex.org/I162868743"]},{"raw_affiliation_string":"Kuaishou Technology Co., Ltd, Beijing, China","institution_ids":["https://openalex.org/I4401726859"]},{"raw_affiliation_string":"School of New Media and Communication, Tianjin University, Tianjin, China","institution_ids":["https://openalex.org/I162868743"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100348721","display_name":"Hao Li","orcid":"https://orcid.org/0009-0005-4319-9026"},"institutions":[{"id":"https://openalex.org/I4401726859","display_name":"Kuaishou (China)","ror":"https://ror.org/0258as409","country_code":null,"type":"company","lineage":["https://openalex.org/I4401726859"]},{"id":"https://openalex.org/I4210155967","display_name":"OriginWater (China)","ror":"https://ror.org/04h7gmn81","country_code":"CN","type":"company","lineage":["https://openalex.org/I4210155967"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Hao Li","raw_affiliation_strings":["Kuaishou Technology Co., Ltd,Beijing,China","Kuaishou Technology Co., Ltd, Beijing, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Kuaishou Technology Co., Ltd,Beijing,China","institution_ids":["https://openalex.org/I4210155967","https://openalex.org/I4401726859"]},{"raw_affiliation_string":"Kuaishou Technology Co., Ltd, Beijing, China","institution_ids":["https://openalex.org/I4401726859"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100528371","display_name":"Yixin Tian","orcid":null},"institutions":[{"id":"https://openalex.org/I4210155967","display_name":"OriginWater (China)","ror":"https://ror.org/04h7gmn81","country_code":"CN","type":"company","lineage":["https://openalex.org/I4210155967"]},{"id":"https://openalex.org/I4401726859","display_name":"Kuaishou (China)","ror":"https://ror.org/0258as409","country_code":null,"type":"company","lineage":["https://openalex.org/I4401726859"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yixin Tian","raw_affiliation_strings":["Kuaishou Technology Co., Ltd,Beijing,China","Kuaishou Technology Co., Ltd, Beijing, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Kuaishou Technology Co., Ltd,Beijing,China","institution_ids":["https://openalex.org/I4210155967","https://openalex.org/I4401726859"]},{"raw_affiliation_string":"Kuaishou Technology Co., Ltd, Beijing, China","institution_ids":["https://openalex.org/I4401726859"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5073918837","display_name":"Ruibo Fu","orcid":"https://orcid.org/0000-0001-9598-1881"},"institutions":[{"id":"https://openalex.org/I4210112150","display_name":"Institute of Automation","ror":"https://ror.org/022c3hy66","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210112150"]},{"id":"https://openalex.org/I4210094879","display_name":"Shandong Institute of Automation","ror":"https://ror.org/00qdtba35","country_code":"CN","type":"facility","lineage":["https://openalex.org/I4210094879","https://openalex.org/I4210142748"]},{"id":"https://openalex.org/I19820366","display_name":"Chinese Academy of Sciences","ror":"https://ror.org/034t30j35","country_code":"CN","type":"funder","lineage":["https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Ruibo Fu","raw_affiliation_strings":["Chinese Academy of Sciences,Institute of Automation,Beijing,China","Institute of Automation, Chinese Academy of Sciences, Beijing, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Chinese Academy of Sciences,Institute of Automation,Beijing,China","institution_ids":["https://openalex.org/I4210112150","https://openalex.org/I19820366"]},{"raw_affiliation_string":"Institute of Automation, Chinese Academy of Sciences, Beijing, China","institution_ids":["https://openalex.org/I4210094879","https://openalex.org/I19820366"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100453420","display_name":"Tao Wang","orcid":"https://orcid.org/0000-0001-8099-9704"},"institutions":[{"id":"https://openalex.org/I4210112150","display_name":"Institute of Automation","ror":"https://ror.org/022c3hy66","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210112150"]},{"id":"https://openalex.org/I4210094879","display_name":"Shandong Institute of Automation","ror":"https://ror.org/00qdtba35","country_code":"CN","type":"facility","lineage":["https://openalex.org/I4210094879","https://openalex.org/I4210142748"]},{"id":"https://openalex.org/I19820366","display_name":"Chinese Academy of Sciences","ror":"https://ror.org/034t30j35","country_code":"CN","type":"funder","lineage":["https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Tao Wang","raw_affiliation_strings":["Chinese Academy of Sciences,Institute of Automation,Beijing,China","Institute of Automation, Chinese Academy of Sciences, Beijing, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Chinese Academy of Sciences,Institute of Automation,Beijing,China","institution_ids":["https://openalex.org/I4210112150","https://openalex.org/I19820366"]},{"raw_affiliation_string":"Institute of Automation, Chinese Academy of Sciences, Beijing, China","institution_ids":["https://openalex.org/I4210094879","https://openalex.org/I19820366"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5050763764","display_name":"Longbiao Wang","orcid":"https://orcid.org/0000-0002-4005-5036"},"institutions":[{"id":"https://openalex.org/I162868743","display_name":"Tianjin University","ror":"https://ror.org/012tb2g32","country_code":"CN","type":"education","lineage":["https://openalex.org/I162868743"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Longbiao Wang","raw_affiliation_strings":["Tianjin University,School of New Media and Communication,Tianjin,China","School of New Media and Communication, Tianjin University, Tianjin, China","Tianjin Key Laboratory of Cognitive Computing and Application, College of Intelligence and Computing, Tianjin University, Tianjin, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Tianjin University,School of New Media and Communication,Tianjin,China","institution_ids":["https://openalex.org/I162868743"]},{"raw_affiliation_string":"School of New Media and Communication, Tianjin University, Tianjin, China","institution_ids":["https://openalex.org/I162868743"]},{"raw_affiliation_string":"Tianjin Key Laboratory of Cognitive Computing and Application, College of Intelligence and Computing, Tianjin University, Tianjin, China","institution_ids":["https://openalex.org/I162868743"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5017251198","display_name":"Jianwu Dang","orcid":"https://orcid.org/0000-0002-9237-4821"},"institutions":[{"id":"https://openalex.org/I162868743","display_name":"Tianjin University","ror":"https://ror.org/012tb2g32","country_code":"CN","type":"education","lineage":["https://openalex.org/I162868743"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jianwu Dang","raw_affiliation_strings":["Tianjin University,Tianjin Key Laboratory of Cognitive Computing and Application, College of Intelligence and Computing,Tianjin,China","Tianjin Key Laboratory of Cognitive Computing and Application, College of Intelligence and Computing, Tianjin University, Tianjin, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Tianjin University,Tianjin Key Laboratory of Cognitive Computing and Application, College of Intelligence and Computing,Tianjin,China","institution_ids":["https://openalex.org/I162868743"]},{"raw_affiliation_string":"Tianjin Key Laboratory of Cognitive Computing and Application, College of Intelligence and Computing, Tianjin University, Tianjin, China","institution_ids":["https://openalex.org/I162868743"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5028353824"],"corresponding_institution_ids":["https://openalex.org/I162868743","https://openalex.org/I4401726859"],"apc_list":null,"apc_paid":null,"fwci":3.1992,"has_fulltext":false,"cited_by_count":10,"citation_normalized_percentile":{"value":0.92431532,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":96,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"10196","last_page":"10200"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9801999926567078,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9801999926567078,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9660000205039978,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12031","display_name":"Speech and dialogue systems","score":0.957099974155426,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/security-token","display_name":"Security token","score":0.7812455296516418},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7538328170776367},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.610699474811554},{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.49179700016975403},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.3920813202857971},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.3419577479362488}],"concepts":[{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.7812455296516418},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7538328170776367},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.610699474811554},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.49179700016975403},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3920813202857971},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3419577479362488},{"id":"https://openalex.org/C17744445","wikidata":"https://www.wikidata.org/wiki/Q36442","display_name":"Political science","level":0,"score":0.0},{"id":"https://openalex.org/C38652104","wikidata":"https://www.wikidata.org/wiki/Q3510521","display_name":"Computer security","level":1,"score":0.0},{"id":"https://openalex.org/C199539241","wikidata":"https://www.wikidata.org/wiki/Q7748","display_name":"Law","level":1,"score":0.0},{"id":"https://openalex.org/C94625758","wikidata":"https://www.wikidata.org/wiki/Q7163","display_name":"Politics","level":2,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icassp48485.2024.10447797","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp48485.2024.10447797","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"score":0.46000000834465027,"display_name":"Reduced inequalities","id":"https://metadata.un.org/sdg/10"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":25,"referenced_works":["https://openalex.org/W1522301498","https://openalex.org/W2518172956","https://openalex.org/W2752782242","https://openalex.org/W2979476256","https://openalex.org/W3036601975","https://openalex.org/W3176445421","https://openalex.org/W3197411683","https://openalex.org/W3198533616","https://openalex.org/W3209059054","https://openalex.org/W3215626407","https://openalex.org/W4226033575","https://openalex.org/W4281492411","https://openalex.org/W4284898017","https://openalex.org/W4307323391","https://openalex.org/W4372260402","https://openalex.org/W4372266552","https://openalex.org/W4390075359","https://openalex.org/W4392903524","https://openalex.org/W6631190155","https://openalex.org/W6769196770","https://openalex.org/W6780218876","https://openalex.org/W6790019176","https://openalex.org/W6791353385","https://openalex.org/W6803872405","https://openalex.org/W6847363464"],"related_works":["https://openalex.org/W4388335561","https://openalex.org/W2970530566","https://openalex.org/W4288261899","https://openalex.org/W4307309205","https://openalex.org/W2967478618","https://openalex.org/W4385009901","https://openalex.org/W4385572700","https://openalex.org/W2997152889","https://openalex.org/W4387768015","https://openalex.org/W3204019825"],"abstract_inverted_index":{"For":[0],"fine-grained":[1,64,184],"generation":[2,185],"and":[3,14,32,53,75,117,140,152,166,173,186],"recognition":[4,17,187],"tasks":[5,189],"such":[6,49],"as":[7,27,50],"minimally-supervised":[8,170],"text-to-speech":[9],"(TTS),":[10],"voice":[11],"conversion":[12],"(VC),":[13],"automatic":[15],"speech":[16,24,68,141,153,165,191],"(ASR),":[18],"the":[19,46,97,155],"intermediate":[20,65,86],"representations":[21,66,87],"extracted":[22],"from":[23,37,67,70,88],"should":[25,56],"serve":[26],"a":[28,81,126,143,180,195],"\"bridge\"":[29],"between":[30],"text":[31],"acoustic":[33,54],"information,":[34],"containing":[35],"information":[36,48,105],"both":[38],"modalities.":[39,90],"The":[40,158,175],"semantic":[41],"content":[42],"is":[43,80,161],"emphasized,":[44],"while":[45],"paralinguistic":[47],"speaker":[51],"identity":[52],"details":[55],"be":[57],"de-emphasized.":[58],"However,":[59,91],"existing":[60,92],"methods":[61,95],"for":[62,84,106,114,183],"extracting":[63,102],"suffer":[69],"issues":[71],"of":[72],"excessive":[73],"redundancy":[74],"dimension":[76],"explosion.":[77],"Contrastive":[78],"learning":[79,94,147],"good":[82],"method":[83,127,178],"modeling":[85],"two":[89,135],"contrastive":[93],"in":[96,190],"audio":[98,108,198],"field":[99],"focus":[100],"on":[101,163],"global":[103],"descriptive":[104],"downstream":[107,188],"classification":[109],"tasks,":[110],"making":[111],"them":[112],"unsuitable":[113],"TTS,":[115,171],"VC,":[116,172],"ASR":[118],"tasks.":[119],"To":[120],"address":[121],"these":[122],"issues,":[123],"we":[124],"propose":[125],"named":[128],"\"Contrastive":[129],"Token-Acoustic":[130],"Pretraining":[131],"(CTAP)\",":[132],"which":[133],"uses":[134],"encoders":[136],"to":[137,149],"bring":[138],"phoneme":[139,151,167],"into":[142],"joint":[144],"multimodal":[145],"space,":[146],"how":[148],"connect":[150],"at":[154],"frame":[156],"level.":[157],"CTAP":[159,177],"model":[160],"trained":[162],"210k":[164],"pairs,":[168],"achieving":[169],"ASR.":[174],"proposed":[176],"offers":[179],"promising":[181],"solution":[182],"processing.":[192],"We":[193],"provide":[194],"website":[196],"with":[197],"samples.":[199],"<sup":[200],"xmlns:mml=\"http://www.w3.org/1998/Math/MathML\"":[201],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">1</sup>":[202]},"counts_by_year":[{"year":2026,"cited_by_count":2},{"year":2025,"cited_by_count":5},{"year":2024,"cited_by_count":3}],"updated_date":"2026-05-14T08:36:36.166977","created_date":"2025-10-10T00:00:00"}
