{"id":"https://openalex.org/W4392903389","doi":"https://doi.org/10.1109/icassp48485.2024.10447523","title":"FunCodec: A Fundamental, Reproducible and Integrable Open-Source Toolkit for Neural Speech Codec","display_name":"FunCodec: A Fundamental, Reproducible and Integrable Open-Source Toolkit for Neural Speech Codec","publication_year":2024,"publication_date":"2024-03-18","ids":{"openalex":"https://openalex.org/W4392903389","doi":"https://doi.org/10.1109/icassp48485.2024.10447523"},"language":"en","primary_location":{"id":"doi:10.1109/icassp48485.2024.10447523","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp48485.2024.10447523","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5001133136","display_name":"Zhihao Du","orcid":"https://orcid.org/0000-0003-3509-9322"},"institutions":[{"id":"https://openalex.org/I45928872","display_name":"Alibaba Group (China)","ror":"https://ror.org/00k642b80","country_code":"CN","type":"company","lineage":["https://openalex.org/I45928872"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Zhihao Du","raw_affiliation_strings":["Alibaba Group,Speech Lab of DAMO Academy,China","Speech Lab of DAMO Academy, Alibaba Group, China"],"affiliations":[{"raw_affiliation_string":"Alibaba Group,Speech Lab of DAMO Academy,China","institution_ids":["https://openalex.org/I45928872"]},{"raw_affiliation_string":"Speech Lab of DAMO Academy, Alibaba Group, China","institution_ids":["https://openalex.org/I45928872"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101777591","display_name":"Shiliang Zhang","orcid":"https://orcid.org/0000-0002-9524-1602"},"institutions":[{"id":"https://openalex.org/I45928872","display_name":"Alibaba Group (China)","ror":"https://ror.org/00k642b80","country_code":"CN","type":"company","lineage":["https://openalex.org/I45928872"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Shiliang Zhang","raw_affiliation_strings":["Alibaba Group,Speech Lab of DAMO Academy,China","Speech Lab of DAMO Academy, Alibaba Group, China"],"affiliations":[{"raw_affiliation_string":"Alibaba Group,Speech Lab of DAMO Academy,China","institution_ids":["https://openalex.org/I45928872"]},{"raw_affiliation_string":"Speech Lab of DAMO Academy, Alibaba Group, China","institution_ids":["https://openalex.org/I45928872"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5048675276","display_name":"Kai Hu","orcid":"https://orcid.org/0000-0001-7181-9935"},"institutions":[{"id":"https://openalex.org/I45928872","display_name":"Alibaba Group (China)","ror":"https://ror.org/00k642b80","country_code":"CN","type":"company","lineage":["https://openalex.org/I45928872"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Kai Hu","raw_affiliation_strings":["Alibaba Group,Speech Lab of DAMO Academy,China","Speech Lab of DAMO Academy, Alibaba Group, China"],"affiliations":[{"raw_affiliation_string":"Alibaba Group,Speech Lab of DAMO Academy,China","institution_ids":["https://openalex.org/I45928872"]},{"raw_affiliation_string":"Speech Lab of DAMO Academy, Alibaba Group, China","institution_ids":["https://openalex.org/I45928872"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5101700479","display_name":"Siqi Zheng","orcid":"https://orcid.org/0000-0002-5481-7900"},"institutions":[{"id":"https://openalex.org/I45928872","display_name":"Alibaba Group (China)","ror":"https://ror.org/00k642b80","country_code":"CN","type":"company","lineage":["https://openalex.org/I45928872"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Siqi Zheng","raw_affiliation_strings":["Alibaba Group,Speech Lab of DAMO Academy,China","Speech Lab of DAMO Academy, Alibaba Group, China"],"affiliations":[{"raw_affiliation_string":"Alibaba Group,Speech Lab of DAMO Academy,China","institution_ids":["https://openalex.org/I45928872"]},{"raw_affiliation_string":"Speech Lab of DAMO Academy, Alibaba Group, China","institution_ids":["https://openalex.org/I45928872"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5001133136"],"corresponding_institution_ids":["https://openalex.org/I45928872"],"apc_list":null,"apc_paid":null,"fwci":10.6805,"has_fulltext":false,"cited_by_count":31,"citation_normalized_percentile":{"value":0.98609929,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":99,"max":100},"biblio":{"volume":null,"issue":null,"first_page":"591","last_page":"595"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9962999820709229,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9952999949455261,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8502349853515625},{"id":"https://openalex.org/keywords/codec","display_name":"Codec","score":0.8035213947296143},{"id":"https://openalex.org/keywords/speech-coding","display_name":"Speech coding","score":0.6154366135597229},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.5765058994293213},{"id":"https://openalex.org/keywords/codec2","display_name":"Codec2","score":0.5340579748153687},{"id":"https://openalex.org/keywords/scripting-language","display_name":"Scripting language","score":0.5272970199584961},{"id":"https://openalex.org/keywords/speech-processing","display_name":"Speech processing","score":0.45177122950553894},{"id":"https://openalex.org/keywords/voice-activity-detection","display_name":"Voice activity detection","score":0.43955421447753906},{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.43443530797958374},{"id":"https://openalex.org/keywords/speech-synthesis","display_name":"Speech synthesis","score":0.4297619163990021},{"id":"https://openalex.org/keywords/adaptive-multi-rate-audio-codec","display_name":"Adaptive Multi-Rate audio codec","score":0.424007385969162},{"id":"https://openalex.org/keywords/computation","display_name":"Computation","score":0.42385342717170715},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.42342543601989746},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.37510785460472107},{"id":"https://openalex.org/keywords/programming-language","display_name":"Programming language","score":0.1153901219367981},{"id":"https://openalex.org/keywords/computer-hardware","display_name":"Computer hardware","score":0.11462140083312988},{"id":"https://openalex.org/keywords/database","display_name":"Database","score":0.10808587074279785}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8502349853515625},{"id":"https://openalex.org/C161765866","wikidata":"https://www.wikidata.org/wiki/Q184748","display_name":"Codec","level":2,"score":0.8035213947296143},{"id":"https://openalex.org/C13895895","wikidata":"https://www.wikidata.org/wiki/Q3270773","display_name":"Speech coding","level":2,"score":0.6154366135597229},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5765058994293213},{"id":"https://openalex.org/C75217168","wikidata":"https://www.wikidata.org/wiki/Q1105653","display_name":"Codec2","level":4,"score":0.5340579748153687},{"id":"https://openalex.org/C61423126","wikidata":"https://www.wikidata.org/wiki/Q187432","display_name":"Scripting language","level":2,"score":0.5272970199584961},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.45177122950553894},{"id":"https://openalex.org/C204201278","wikidata":"https://www.wikidata.org/wiki/Q1332614","display_name":"Voice activity detection","level":3,"score":0.43955421447753906},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.43443530797958374},{"id":"https://openalex.org/C14999030","wikidata":"https://www.wikidata.org/wiki/Q16346","display_name":"Speech synthesis","level":2,"score":0.4297619163990021},{"id":"https://openalex.org/C177067256","wikidata":"https://www.wikidata.org/wiki/Q4676210","display_name":"Adaptive Multi-Rate audio codec","level":4,"score":0.424007385969162},{"id":"https://openalex.org/C45374587","wikidata":"https://www.wikidata.org/wiki/Q12525525","display_name":"Computation","level":2,"score":0.42385342717170715},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.42342543601989746},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.37510785460472107},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.1153901219367981},{"id":"https://openalex.org/C9390403","wikidata":"https://www.wikidata.org/wiki/Q3966","display_name":"Computer hardware","level":1,"score":0.11462140083312988},{"id":"https://openalex.org/C77088390","wikidata":"https://www.wikidata.org/wiki/Q8513","display_name":"Database","level":1,"score":0.10808587074279785}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icassp48485.2024.10447523","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp48485.2024.10447523","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Gender equality","id":"https://metadata.un.org/sdg/5","score":0.6899999976158142}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":42,"referenced_works":["https://openalex.org/W1481955708","https://openalex.org/W1494198834","https://openalex.org/W1524333225","https://openalex.org/W1728888090","https://openalex.org/W2064675550","https://openalex.org/W2752796333","https://openalex.org/W2798405286","https://openalex.org/W2889048668","https://openalex.org/W2963242190","https://openalex.org/W2963799213","https://openalex.org/W2970006822","https://openalex.org/W2972359262","https://openalex.org/W3092028330","https://openalex.org/W3095095816","https://openalex.org/W3163243746","https://openalex.org/W3198694222","https://openalex.org/W3203407300","https://openalex.org/W3209059054","https://openalex.org/W3215615641","https://openalex.org/W4205788663","https://openalex.org/W4221159457","https://openalex.org/W4307323391","https://openalex.org/W4313679638","https://openalex.org/W4372190822","https://openalex.org/W4372260101","https://openalex.org/W4372270198","https://openalex.org/W4372348514","https://openalex.org/W4375869380","https://openalex.org/W4375869436","https://openalex.org/W4378501656","https://openalex.org/W4380551955","https://openalex.org/W4381827575","https://openalex.org/W4385245566","https://openalex.org/W6631362777","https://openalex.org/W6639363673","https://openalex.org/W6754473786","https://openalex.org/W6767111847","https://openalex.org/W6783867762","https://openalex.org/W6848735303","https://openalex.org/W6853515095","https://openalex.org/W6853611000","https://openalex.org/W6853998256"],"related_works":["https://openalex.org/W137020845","https://openalex.org/W1542588102","https://openalex.org/W2496295964","https://openalex.org/W281432198","https://openalex.org/W2106874932","https://openalex.org/W2151333624","https://openalex.org/W2289505355","https://openalex.org/W1911859126","https://openalex.org/W2336887028","https://openalex.org/W1578591928"],"abstract_inverted_index":{"This":[0,144],"paper":[1],"presents":[2],"FunCodec,":[3,62],"a":[4],"fundamental":[5],"neural":[6,32],"speech":[7,17,33,58,93,138],"codec":[8,34,86],"toolkit,":[9,80],"which":[10,68,89],"is":[11,146],"an":[12],"extension":[13],"of":[14],"the":[15,30,43,79,84,107,128],"open-source":[16],"processing":[18],"toolkit":[19,145],"FunASR.":[20],"FunCodec":[21,48,111],"provides":[22],"reproducible":[23],"training":[24],"recipes":[25],"and":[26,39,99,121,140],"inference":[27],"scripts":[28],"for":[29,72,133],"latest":[31],"models,":[35,87],"such":[36,56],"as":[37,57],"SoundStream":[38],"Encodec.":[40],"Thanks":[41],"to":[42],"unified":[44],"design":[45],"with":[46,61,95,118],"FunASR,":[47],"can":[49,69,90,112],"be":[50,70],"easily":[51],"integrated":[52],"into":[53],"downstream":[54,134],"tasks,":[55,135],"recognition.":[59],"Along":[60],"pretrained":[63],"models":[64,130],"are":[65,131],"also":[66,125],"provided,":[67],"used":[71],"academic":[73],"or":[74],"generalized":[75],"purposes.":[76],"Based":[77],"on":[78],"we":[81],"further":[82],"propose":[83],"frequency-domain":[85],"FreqCodec,":[88],"achieve":[91,113],"comparable":[92],"quality":[94,116],"much":[96],"lower":[97],"computation":[98],"parameter":[100],"complexity.":[101],"Experimental":[102],"results":[103],"show":[104],"that,":[105],"under":[106],"same":[108],"compression":[109],"ratio,":[110],"better":[114],"reconstruction":[115],"compared":[117],"other":[119],"toolkits":[120],"released":[122],"models.":[123],"We":[124],"demonstrate":[126],"that":[127],"pre-trained":[129],"suitable":[132],"including":[136],"automatic":[137],"recognition":[139],"personalized":[141],"text-to-speech":[142],"synthesis.":[143],"publicly":[147],"available":[148],"at":[149],"https://github.com/alibaba-damo-academy/FunCodec.":[150]},"counts_by_year":[{"year":2025,"cited_by_count":21},{"year":2024,"cited_by_count":10}],"updated_date":"2026-04-03T22:45:19.894376","created_date":"2025-10-10T00:00:00"}
