{"id":"https://openalex.org/W4377001520","doi":"https://doi.org/10.1109/tvcg.2023.3276973","title":"Audio2Gestures: Generating Diverse Gestures From Audio","display_name":"Audio2Gestures: Generating Diverse Gestures From Audio","publication_year":2023,"publication_date":"2023-05-17","ids":{"openalex":"https://openalex.org/W4377001520","doi":"https://doi.org/10.1109/tvcg.2023.3276973","pmid":"https://pubmed.ncbi.nlm.nih.gov/37195841"},"language":"en","primary_location":{"id":"doi:10.1109/tvcg.2023.3276973","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tvcg.2023.3276973","pdf_url":null,"source":{"id":"https://openalex.org/S84775595","display_name":"IEEE Transactions on Visualization and Computer Graphics","issn_l":"1077-2626","issn":["1077-2626","1941-0506","2160-9306"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Visualization and Computer Graphics","raw_type":"journal-article"},"type":"article","indexed_in":["crossref","pubmed"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5091968714","display_name":"Jing Li","orcid":"https://orcid.org/0000-0002-2162-1004"},"institutions":[{"id":"https://openalex.org/I204983213","display_name":"Harbin Institute of Technology","ror":"https://ror.org/01yqg2h08","country_code":"CN","type":"education","lineage":["https://openalex.org/I204983213"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Jing Li","raw_affiliation_strings":["Harbin Institute of Technology, Shenzhen, China"],"affiliations":[{"raw_affiliation_string":"Harbin Institute of Technology, Shenzhen, China","institution_ids":["https://openalex.org/I204983213"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5007545410","display_name":"Di Kang","orcid":"https://orcid.org/0000-0002-8996-0897"},"institutions":[{"id":"https://openalex.org/I2250653659","display_name":"Tencent (China)","ror":"https://ror.org/00hhjss72","country_code":"CN","type":"company","lineage":["https://openalex.org/I2250653659"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Di Kang","raw_affiliation_strings":["Tencent AI Lab, Shenzhen, China"],"affiliations":[{"raw_affiliation_string":"Tencent AI Lab, Shenzhen, China","institution_ids":["https://openalex.org/I2250653659"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5078487642","display_name":"Wenjie Pei","orcid":"https://orcid.org/0000-0001-8117-2696"},"institutions":[{"id":"https://openalex.org/I204983213","display_name":"Harbin Institute of Technology","ror":"https://ror.org/01yqg2h08","country_code":"CN","type":"education","lineage":["https://openalex.org/I204983213"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Wenjie Pei","raw_affiliation_strings":["Harbin Institute of Technology, Shenzhen, China"],"affiliations":[{"raw_affiliation_string":"Harbin Institute of Technology, Shenzhen, China","institution_ids":["https://openalex.org/I204983213"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5044236581","display_name":"Xuefei Zhe","orcid":"https://orcid.org/0000-0002-5005-7166"},"institutions":[{"id":"https://openalex.org/I2250653659","display_name":"Tencent (China)","ror":"https://ror.org/00hhjss72","country_code":"CN","type":"company","lineage":["https://openalex.org/I2250653659"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xuefei Zhe","raw_affiliation_strings":["Tencent AI Lab, Shenzhen, China"],"affiliations":[{"raw_affiliation_string":"Tencent AI Lab, Shenzhen, China","institution_ids":["https://openalex.org/I2250653659"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100386145","display_name":"Ying Zhang","orcid":"https://orcid.org/0000-0002-6005-4989"},"institutions":[{"id":"https://openalex.org/I2250653659","display_name":"Tencent (China)","ror":"https://ror.org/00hhjss72","country_code":"CN","type":"company","lineage":["https://openalex.org/I2250653659"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Ying Zhang","raw_affiliation_strings":["Tencent AI Lab, Shenzhen, China"],"affiliations":[{"raw_affiliation_string":"Tencent AI Lab, Shenzhen, China","institution_ids":["https://openalex.org/I2250653659"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5079937813","display_name":"Linchao Bao","orcid":"https://orcid.org/0000-0001-9543-3754"},"institutions":[{"id":"https://openalex.org/I2250653659","display_name":"Tencent (China)","ror":"https://ror.org/00hhjss72","country_code":"CN","type":"company","lineage":["https://openalex.org/I2250653659"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Linchao Bao","raw_affiliation_strings":["Tencent AI Lab, Shenzhen, China"],"affiliations":[{"raw_affiliation_string":"Tencent AI Lab, Shenzhen, China","institution_ids":["https://openalex.org/I2250653659"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100740564","display_name":"Zhenyu He","orcid":"https://orcid.org/0000-0002-2546-8721"},"institutions":[{"id":"https://openalex.org/I204983213","display_name":"Harbin Institute of Technology","ror":"https://ror.org/01yqg2h08","country_code":"CN","type":"education","lineage":["https://openalex.org/I204983213"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhenyu He","raw_affiliation_strings":["Harbin Institute of Technology, Shenzhen, China"],"affiliations":[{"raw_affiliation_string":"Harbin Institute of Technology, Shenzhen, China","institution_ids":["https://openalex.org/I204983213"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5091968714"],"corresponding_institution_ids":["https://openalex.org/I204983213"],"apc_list":null,"apc_paid":null,"fwci":1.6065,"has_fulltext":false,"cited_by_count":13,"citation_normalized_percentile":{"value":0.85464531,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":97,"max":98},"biblio":{"volume":"30","issue":"8","first_page":"4752","last_page":"4766"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":0.9990000128746033,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":0.9990000128746033,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12290","display_name":"Human Motion and Animation","score":0.9980999827384949,"subfield":{"id":"https://openalex.org/subfields/2207","display_name":"Control and Systems Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9901999831199646,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8152918219566345},{"id":"https://openalex.org/keywords/gesture","display_name":"Gesture","score":0.6365461945533752},{"id":"https://openalex.org/keywords/motion-capture","display_name":"Motion capture","score":0.5604897141456604},{"id":"https://openalex.org/keywords/motion","display_name":"Motion (physics)","score":0.5301156044006348},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.5225505232810974},{"id":"https://openalex.org/keywords/code","display_name":"Code (set theory)","score":0.5097336173057556},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.5013854503631592},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.4849400818347931},{"id":"https://openalex.org/keywords/computer-vision","display_name":"Computer vision","score":0.3926602005958557},{"id":"https://openalex.org/keywords/programming-language","display_name":"Programming language","score":0.09206908941268921}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8152918219566345},{"id":"https://openalex.org/C207347870","wikidata":"https://www.wikidata.org/wiki/Q371174","display_name":"Gesture","level":2,"score":0.6365461945533752},{"id":"https://openalex.org/C48007421","wikidata":"https://www.wikidata.org/wiki/Q676252","display_name":"Motion capture","level":3,"score":0.5604897141456604},{"id":"https://openalex.org/C104114177","wikidata":"https://www.wikidata.org/wiki/Q79782","display_name":"Motion (physics)","level":2,"score":0.5301156044006348},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.5225505232810974},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.5097336173057556},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5013854503631592},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.4849400818347931},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.3926602005958557},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.09206908941268921},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.0}],"mesh":[],"locations_count":3,"locations":[{"id":"doi:10.1109/tvcg.2023.3276973","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tvcg.2023.3276973","pdf_url":null,"source":{"id":"https://openalex.org/S84775595","display_name":"IEEE Transactions on Visualization and Computer Graphics","issn_l":"1077-2626","issn":["1077-2626","1941-0506","2160-9306"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Visualization and Computer Graphics","raw_type":"journal-article"},{"id":"pmid:37195841","is_oa":false,"landing_page_url":"https://pubmed.ncbi.nlm.nih.gov/37195841","pdf_url":null,"source":{"id":"https://openalex.org/S4306525036","display_name":"PubMed","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I1299303238","host_organization_name":"National Institutes of Health","host_organization_lineage":["https://openalex.org/I1299303238"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE transactions on visualization and computer graphics","raw_type":null},{"id":"pmh:oai:pure.atira.dk:openaire_cris_publications/b31976f0-452e-4d6a-ab8d-807057742754","is_oa":false,"landing_page_url":"https://research.birmingham.ac.uk/en/publications/b31976f0-452e-4d6a-ab8d-807057742754","pdf_url":null,"source":{"id":"https://openalex.org/S4306402634","display_name":"University of Birmingham Research Portal (University of Birmingham)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I79619799","host_organization_name":"University of Birmingham","host_organization_lineage":["https://openalex.org/I79619799"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"Li , J , Kang , D , Pei , W , Zhe , X , Zhang , Y , Bao , L & He , Z 2023 , ' Audio2Gestures : Generating Diverse Gestures From Audio ' , IEEE Transactions on Visualization and Computer Graphics . https://doi.org/10.1109/TVCG.2023.3276973","raw_type":"article"}],"best_oa_location":null,"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/16","score":0.4399999976158142,"display_name":"Peace, Justice and strong institutions"}],"awards":[{"id":"https://openalex.org/G2286116475","display_name":null,"funder_award_id":"JCYJ20220818102415032","funder_id":"https://openalex.org/F4320329791","funder_display_name":"Shenzhen Fundamental Research Program"},{"id":"https://openalex.org/G5832365334","display_name":null,"funder_award_id":"62006060","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G6652300243","display_name":null,"funder_award_id":"U2013210","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G8156069797","display_name":null,"funder_award_id":"62172126","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"}],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"},{"id":"https://openalex.org/F4320329791","display_name":"Shenzhen Fundamental Research Program","ror":null}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":75,"referenced_works":["https://openalex.org/W1735317348","https://openalex.org/W2035046981","https://openalex.org/W2103869314","https://openalex.org/W2133665775","https://openalex.org/W2157331557","https://openalex.org/W2191779130","https://openalex.org/W2194775991","https://openalex.org/W2296371640","https://openalex.org/W2467604901","https://openalex.org/W2559085405","https://openalex.org/W2780124704","https://openalex.org/W2792643794","https://openalex.org/W2792764867","https://openalex.org/W2887738788","https://openalex.org/W2945629925","https://openalex.org/W2949924544","https://openalex.org/W2962785568","https://openalex.org/W2962795401","https://openalex.org/W2963092440","https://openalex.org/W2963165299","https://openalex.org/W2963185411","https://openalex.org/W2963306805","https://openalex.org/W2963890275","https://openalex.org/W2963995996","https://openalex.org/W2964203186","https://openalex.org/W2967443589","https://openalex.org/W2978956737","https://openalex.org/W2981802563","https://openalex.org/W2992005611","https://openalex.org/W3002101995","https://openalex.org/W3009042479","https://openalex.org/W3034600949","https://openalex.org/W3034603995","https://openalex.org/W3048550213","https://openalex.org/W3083173864","https://openalex.org/W3097945073","https://openalex.org/W3102619627","https://openalex.org/W3107914916","https://openalex.org/W3125775899","https://openalex.org/W3133090439","https://openalex.org/W3153551559","https://openalex.org/W3167478287","https://openalex.org/W4221142137","https://openalex.org/W4230429791","https://openalex.org/W4285981714","https://openalex.org/W4288079574","https://openalex.org/W4294568686","https://openalex.org/W4295312788","https://openalex.org/W4303448003","https://openalex.org/W4312437946","https://openalex.org/W4312674262","https://openalex.org/W4313145975","https://openalex.org/W4361755841","https://openalex.org/W4377010269","https://openalex.org/W4385245566","https://openalex.org/W4386075984","https://openalex.org/W6631190155","https://openalex.org/W6631943919","https://openalex.org/W6640963894","https://openalex.org/W6679436768","https://openalex.org/W6687506355","https://openalex.org/W6715501732","https://openalex.org/W6718140377","https://openalex.org/W6720208624","https://openalex.org/W6745992979","https://openalex.org/W6749029207","https://openalex.org/W6749825310","https://openalex.org/W6752114883","https://openalex.org/W6752910514","https://openalex.org/W6756690370","https://openalex.org/W6765779288","https://openalex.org/W6766978945","https://openalex.org/W6767134779","https://openalex.org/W6768124327","https://openalex.org/W6779661269"],"related_works":["https://openalex.org/W1827696521","https://openalex.org/W2173450654","https://openalex.org/W2039848376","https://openalex.org/W2621720158","https://openalex.org/W2091722187","https://openalex.org/W2006196742","https://openalex.org/W2130272765","https://openalex.org/W2055991023","https://openalex.org/W2682927604","https://openalex.org/W2182037499"],"abstract_inverted_index":{"People":[0],"may":[1],"perform":[2],"diverse":[3,102,160],"gestures":[4],"affected":[5],"by":[6,63],"various":[7],"mental":[8],"and":[9,34,72,134,148,159,167,180,191,221],"physical":[10],"factors":[11],"when":[12],"speaking":[13],"the":[14,39,59,65,84,92,95,110,114,142,208,245],"same":[15],"sentences.":[16],"This":[17],"inherent":[18],"one-to-many":[19,60],"relationship":[20],"makes":[21],"co-speech":[22],"gesture":[23],"generation":[24],"from":[25],"audio":[26,93],"particularly":[27],"challenging.":[28],"Conventional":[29],"CNNs/RNNs":[30],"assume":[31],"one-to-one":[32],"mapping,":[33],"thus":[35],"tend":[36],"to":[37,56,80,91,100,139,236],"predict":[38],"average":[40],"of":[41,109],"all":[42],"possible":[43],"target":[44],"motions,":[45],"easily":[46],"resulting":[47,216],"in":[48,217],"plain/boring":[49],"motions":[50,161],"during":[51],"inference.":[52],"So":[53],"we":[54,195,227],"propose":[55],"explicitly":[57],"model":[58],"audio-to-motion":[61],"mapping":[62],"splitting":[64,113],"cross-modal":[66],"latent":[67,115],"code":[68,71,77,97,116],"into":[69,117],"shared":[70,76],"motion-specific":[73,96],"code.":[74],"The":[75],"is":[78,88,98,106,172],"expected":[79,99],"be":[81,233],"responsible":[82],"for":[83,188],"motion":[85,103,130,150,189,193,219,224,238,242],"component":[86],"that":[87,105,153,201,229],"more":[89,107,157,222],"correlated":[90],"while":[94],"capture":[101],"information":[104],"independent":[108],"audio.":[111],"However,":[112],"two":[118],"parts":[119],"poses":[120],"extra":[121],"training":[122,126],"difficulties.":[123],"Several":[124],"crucial":[125],"losses/strategies,":[127],"including":[128],"relaxed":[129],"loss,":[131,136],"bicycle":[132],"constraint,":[133],"diversity":[135],"are":[137],"designed":[138],"better":[140,218],"train":[141],"VAE.":[143],"Experiments":[144],"on":[145,244],"both":[146],"3D":[147],"2D":[149],"datasets":[151],"verify":[152],"our":[154,170,230],"method":[155,231],"generates":[156],"realistic":[158],"than":[162],"previous":[163],"state-of-the-art":[164],"methods,":[165],"quantitatively":[166],"qualitatively.":[168],"Besides,":[169],"formulation":[171],"compatible":[173],"with":[174,240],"discrete":[175],"cosine":[176],"transformation":[177],"(DCT)":[178],"modeling":[179],"other":[181],"popular":[182],"backbones":[183],"(i.e.,":[184],"RNN,":[185],"Transformer).":[186],"As":[187],"losses":[190,213],"quantitative":[192],"evaluation,":[194],"find":[196],"structured":[197],"losses/metrics":[198],"(e.g.":[199,214],"STFT)":[200],"consider":[202],"temporal":[203],"and/or":[204],"spatial":[205],"context":[206],"complement":[207],"most":[209],"commonly":[210],"used":[211,235],"point-wise":[212],"PCK),":[215],"dynamics":[220],"nuanced":[223],"details.":[225],"Finally,":[226],"demonstrate":[228],"can":[232],"readily":[234],"generate":[237],"sequences":[239],"user-specified":[241],"clips":[243],"timeline.":[246]},"counts_by_year":[{"year":2025,"cited_by_count":5},{"year":2024,"cited_by_count":4},{"year":2023,"cited_by_count":4}],"updated_date":"2026-03-05T09:29:38.588285","created_date":"2025-10-10T00:00:00"}
