{"id":"https://openalex.org/W4402703639","doi":"https://doi.org/10.1145/3664647.3681674","title":"SpeechCraft: A Fine-Grained Expressive Speech Dataset with Natural Language Description","display_name":"SpeechCraft: A Fine-Grained Expressive Speech Dataset with Natural Language Description","publication_year":2024,"publication_date":"2024-10-26","ids":{"openalex":"https://openalex.org/W4402703639","doi":"https://doi.org/10.1145/3664647.3681674"},"language":"en","primary_location":{"id":"doi:10.1145/3664647.3681674","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3664647.3681674","pdf_url":null,"source":null,"license":"public-domain","license_id":"https://openalex.org/licenses/public-domain","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 32nd ACM International Conference on Multimedia","raw_type":"proceedings-article"},"type":"preprint","indexed_in":["arxiv","crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://doi.org/10.1145/3664647.3681674","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5033137691","display_name":"Zeyu Jin","orcid":"https://orcid.org/0000-0001-8465-8878"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Zeyu Jin","raw_affiliation_strings":["Department of Computer Science and Technology, Tsinghua University, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Department of Computer Science and Technology, Tsinghua University, Beijing, China","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5039477812","display_name":"Jia Jia","orcid":"https://orcid.org/0009-0005-8449-278X"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jia Jia","raw_affiliation_strings":["BNRist, Tsinghua University &amp; Key Laboratory of Pervasive Computing, Ministry of Education, Beijing, China"],"affiliations":[{"raw_affiliation_string":"BNRist, Tsinghua University &amp; Key Laboratory of Pervasive Computing, Ministry of Education, Beijing, China","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101918812","display_name":"Qixin Wang","orcid":"https://orcid.org/0009-0009-5832-8192"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Qixin Wang","raw_affiliation_strings":["Department of Computer Science and Technology, Tsinghua University, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Department of Computer Science and Technology, Tsinghua University, Beijing, China","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5000107414","display_name":"Kehan Li","orcid":"https://orcid.org/0009-0001-2487-9107"},"institutions":[{"id":"https://openalex.org/I3131625388","display_name":"University Town of Shenzhen","ror":"https://ror.org/05f5j6225","country_code":"CN","type":"education","lineage":["https://openalex.org/I3131625388"]},{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Kehan Li","raw_affiliation_strings":["Shenzhen International Graduate School, Tsinghua University, Shenzhen, China"],"affiliations":[{"raw_affiliation_string":"Shenzhen International Graduate School, Tsinghua University, Shenzhen, China","institution_ids":["https://openalex.org/I3131625388","https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5111291993","display_name":"Shuoyi Zhou","orcid":null},"institutions":[{"id":"https://openalex.org/I3131625388","display_name":"University Town of Shenzhen","ror":"https://ror.org/05f5j6225","country_code":"CN","type":"education","lineage":["https://openalex.org/I3131625388"]},{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Shuoyi Zhou","raw_affiliation_strings":["Department of Computer Science and Technology, Tsinghua University, Beijing, China","Shenzhen International Graduate School, Tsinghua University, Shenzhen, China"],"affiliations":[{"raw_affiliation_string":"Department of Computer Science and Technology, Tsinghua University, Beijing, China","institution_ids":["https://openalex.org/I99065089"]},{"raw_affiliation_string":"Shenzhen International Graduate School, Tsinghua University, Shenzhen, China","institution_ids":["https://openalex.org/I3131625388","https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5026277472","display_name":"Songtao Zhou","orcid":"https://orcid.org/0009-0008-5972-3955"},"institutions":[{"id":"https://openalex.org/I3131625388","display_name":"University Town of Shenzhen","ror":"https://ror.org/05f5j6225","country_code":"CN","type":"education","lineage":["https://openalex.org/I3131625388"]},{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Songtao Zhou","raw_affiliation_strings":["Department of Computer Science and Technology, Tsinghua University, Beijing, China","Shenzhen International Graduate School, Tsinghua University, Shenzhen, China"],"affiliations":[{"raw_affiliation_string":"Department of Computer Science and Technology, Tsinghua University, Beijing, China","institution_ids":["https://openalex.org/I99065089"]},{"raw_affiliation_string":"Shenzhen International Graduate School, Tsinghua University, Shenzhen, China","institution_ids":["https://openalex.org/I3131625388","https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5018252335","display_name":"Xiaoyu Qin","orcid":"https://orcid.org/0000-0002-9720-3220"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xiaoyu Qin","raw_affiliation_strings":["Department of Computer Science and Technology, Tsinghua University, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Department of Computer Science and Technology, Tsinghua University, Beijing, China","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5102869280","display_name":"Zhiyong Wu","orcid":"https://orcid.org/0000-0001-8533-0524"},"institutions":[{"id":"https://openalex.org/I3131625388","display_name":"University Town of Shenzhen","ror":"https://ror.org/05f5j6225","country_code":"CN","type":"education","lineage":["https://openalex.org/I3131625388"]},{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhiyong Wu","raw_affiliation_strings":["Shenzhen International Graduate School, Tsinghua University, Shenzhen, China"],"affiliations":[{"raw_affiliation_string":"Shenzhen International Graduate School, Tsinghua University, Shenzhen, China","institution_ids":["https://openalex.org/I3131625388","https://openalex.org/I99065089"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":8,"corresponding_author_ids":["https://openalex.org/A5033137691"],"corresponding_institution_ids":["https://openalex.org/I99065089"],"apc_list":null,"apc_paid":null,"fwci":1.6963,"has_fulltext":true,"cited_by_count":5,"citation_normalized_percentile":{"value":0.86836134,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":90,"max":98},"biblio":{"volume":null,"issue":null,"first_page":"1255","last_page":"1264"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9990000128746033,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.84576416015625},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.6465218663215637},{"id":"https://openalex.org/keywords/closed-captioning","display_name":"Closed captioning","score":0.6270647048950195},{"id":"https://openalex.org/keywords/annotation","display_name":"Annotation","score":0.6124626994132996},{"id":"https://openalex.org/keywords/temporal-annotation","display_name":"Temporal annotation","score":0.553530752658844},{"id":"https://openalex.org/keywords/speech-analytics","display_name":"Speech analytics","score":0.5297585129737854},{"id":"https://openalex.org/keywords/natural-language","display_name":"Natural language","score":0.5054078102111816},{"id":"https://openalex.org/keywords/clips","display_name":"CLIPS","score":0.501255989074707},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.48033228516578674},{"id":"https://openalex.org/keywords/natural-language-understanding","display_name":"Natural language understanding","score":0.4599546194076538},{"id":"https://openalex.org/keywords/speech-corpus","display_name":"Speech corpus","score":0.44347143173217773},{"id":"https://openalex.org/keywords/speech-synthesis","display_name":"Speech synthesis","score":0.41602885723114014},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.4036242365837097},{"id":"https://openalex.org/keywords/language-technology","display_name":"Language technology","score":0.2853754758834839},{"id":"https://openalex.org/keywords/comprehension-approach","display_name":"Comprehension approach","score":0.10021984577178955}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.84576416015625},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.6465218663215637},{"id":"https://openalex.org/C157657479","wikidata":"https://www.wikidata.org/wiki/Q2367247","display_name":"Closed captioning","level":3,"score":0.6270647048950195},{"id":"https://openalex.org/C2776321320","wikidata":"https://www.wikidata.org/wiki/Q857525","display_name":"Annotation","level":2,"score":0.6124626994132996},{"id":"https://openalex.org/C7044111","wikidata":"https://www.wikidata.org/wiki/Q15844891","display_name":"Temporal annotation","level":5,"score":0.553530752658844},{"id":"https://openalex.org/C54953205","wikidata":"https://www.wikidata.org/wiki/Q4142201","display_name":"Speech analytics","level":4,"score":0.5297585129737854},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.5054078102111816},{"id":"https://openalex.org/C2778739407","wikidata":"https://www.wikidata.org/wiki/Q165372","display_name":"CLIPS","level":2,"score":0.501255989074707},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.48033228516578674},{"id":"https://openalex.org/C2779439875","wikidata":"https://www.wikidata.org/wiki/Q1078276","display_name":"Natural language understanding","level":3,"score":0.4599546194076538},{"id":"https://openalex.org/C91863865","wikidata":"https://www.wikidata.org/wiki/Q4349497","display_name":"Speech corpus","level":3,"score":0.44347143173217773},{"id":"https://openalex.org/C14999030","wikidata":"https://www.wikidata.org/wiki/Q16346","display_name":"Speech synthesis","level":2,"score":0.41602885723114014},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.4036242365837097},{"id":"https://openalex.org/C14919245","wikidata":"https://www.wikidata.org/wiki/Q1976109","display_name":"Language technology","level":4,"score":0.2853754758834839},{"id":"https://openalex.org/C129353971","wikidata":"https://www.wikidata.org/wiki/Q5156949","display_name":"Comprehension approach","level":3,"score":0.10021984577178955},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1145/3664647.3681674","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3664647.3681674","pdf_url":null,"source":null,"license":"public-domain","license_id":"https://openalex.org/licenses/public-domain","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 32nd ACM International Conference on Multimedia","raw_type":"proceedings-article"},{"id":"pmh:oai:arXiv.org:2408.13608","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2408.13608","pdf_url":"https://arxiv.org/pdf/2408.13608","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"public-domain","license_id":"https://openalex.org/licenses/public-domain","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"}],"best_oa_location":{"id":"doi:10.1145/3664647.3681674","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3664647.3681674","pdf_url":null,"source":null,"license":"public-domain","license_id":"https://openalex.org/licenses/public-domain","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 32nd ACM International Conference on Multimedia","raw_type":"proceedings-article"},"sustainable_development_goals":[{"score":0.8100000023841858,"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4"}],"awards":[{"id":"https://openalex.org/G1121271761","display_name":null,"funder_award_id":"Program","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G1231421488","display_name":null,"funder_award_id":"under","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G1888327543","display_name":null,"funder_award_id":"15001","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G2087396116","display_name":null,"funder_award_id":"China","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G2603945996","display_name":null,"funder_award_id":"62076144","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G2981938667","display_name":null,"funder_award_id":"Shenzhen","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G3317480652","display_name":null,"funder_award_id":"Science","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G5249178904","display_name":null,"funder_award_id":"Grant No. 6","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G5898128861","display_name":null,"funder_award_id":"61405150","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G5939423041","display_name":null,"funder_award_id":"Technology","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G5994120800","display_name":null,"funder_award_id":"Natural","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G7726157001","display_name":null,"funder_award_id":"Grant No.","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"}],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":17,"referenced_works":["https://openalex.org/W2354870669","https://openalex.org/W2576683119","https://openalex.org/W2964243274","https://openalex.org/W2999905431","https://openalex.org/W3015591594","https://openalex.org/W3033907960","https://openalex.org/W3103314642","https://openalex.org/W3176445421","https://openalex.org/W4210913346","https://openalex.org/W4311000453","https://openalex.org/W4372260310","https://openalex.org/W4375869257","https://openalex.org/W4376226279","https://openalex.org/W4392904245","https://openalex.org/W4393147046","https://openalex.org/W4393152865","https://openalex.org/W6600741150"],"related_works":["https://openalex.org/W2032286903","https://openalex.org/W2036933852","https://openalex.org/W2337605147","https://openalex.org/W2355709873","https://openalex.org/W4388404911","https://openalex.org/W3011457902","https://openalex.org/W2253659301","https://openalex.org/W3011834788","https://openalex.org/W2000167159","https://openalex.org/W3089121518"],"abstract_inverted_index":{"Speech-language":[0],"multi-modal":[1],"learning":[2],"presents":[3,44],"a":[4,18,45,87,102,149],"significant":[5],"challenge":[6],"due":[7],"to":[8,30,95],"the":[9,183],"fine":[10],"nuanced":[11],"information":[12,116],"inherent":[13],"in":[14,191],"speech":[15,25,35,63,72,82,98,125,153,177,193,196],"styles.":[16],"Therefore,":[17],"large-scale":[19,49],"dataset":[20,185],"providing":[21],"elaborate":[22],"comprehension":[23],"of":[24,89,124,169],"style":[26,126,163,197],"is":[27,156],"urgently":[28],"needed":[29],"facilitate":[31],"insightful":[32],"interplay":[33],"between":[34,48],"audio":[36,170],"and":[37,52,76,92,117,135,172,195],"natural":[38,129,161],"language.":[39],"However,":[40],"constructing":[41],"such":[42],"datasets":[43],"major":[46],"trade-off":[47],"data":[50,137,171],"collection":[51],"high-quality":[53],"annotation.":[54],"To":[55],"tackle":[56],"this":[57,144],"challenge,":[58],"we":[59,146],"propose":[60],"an":[61],"automatic":[62],"annotation":[64,107,112],"system":[65,120],"for":[66,105,139],"expressiveness":[67],"interpretation":[68],"that":[69,182],"annotates":[70],"in-the-wild":[71],"clips":[73],"with":[74,114],"expressive":[75,152],"vivid":[77],"human":[78],"language":[79,130,162],"descriptions.":[80],"Initially,":[81],"audios":[83],"are":[84],"processed":[85],"by":[86,101,158],"series":[88],"expert":[90],"classifiers":[91],"captioning":[93],"models":[94],"capture":[96],"diverse":[97],"characteristics,":[99],"followed":[100],"fine-tuned":[103],"LLaMA":[104],"customized":[106],"generation.":[108],"Unlike":[109],"previous":[110],"tag/templet-based":[111],"frameworks":[113],"limited":[115],"diversity,":[118],"our":[119],"provides":[121],"in-depth":[122],"understandings":[123],"through":[127],"tailored":[128],"descriptions,":[131],"thereby":[132],"enabling":[133],"accurate":[134],"voluminous":[136],"generation":[138],"large":[140],"model":[141],"training.":[142],"With":[143],"system,":[145],"create":[147],"SpeechCraft,":[148],"fine-grained":[150],"bilingual":[151],"dataset.":[154],"It":[155],"distinguished":[157],"highly":[159],"descriptive":[160],"prompts,":[164],"containing":[165],"approximately":[166],"2,000":[167],"hours":[168],"encompassing":[173],"over":[174],"two":[175],"million":[176],"clips.":[178],"Extensive":[179],"experiments":[180],"demonstrate":[181],"proposed":[184],"significantly":[186],"boosts":[187],"speech-language":[188],"task":[189],"performance":[190],"stylist":[192],"synthesis":[194],"understanding.":[198]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":3},{"year":2024,"cited_by_count":1}],"updated_date":"2026-04-13T07:58:08.660418","created_date":"2024-09-21T00:00:00"}
