{"id":"https://openalex.org/W7125587198","doi":"https://doi.org/10.1007/s11263-026-02752-z","title":"Cosh-DiT: Co-Speech Gesture Video Synthesis via Hybrid Audio-Visual Diffusion Transformers","display_name":"Cosh-DiT: Co-Speech Gesture Video Synthesis via Hybrid Audio-Visual Diffusion Transformers","publication_year":2026,"publication_date":"2026-01-24","ids":{"openalex":"https://openalex.org/W7125587198","doi":"https://doi.org/10.1007/s11263-026-02752-z"},"language":"en","primary_location":{"id":"doi:10.1007/s11263-026-02752-z","is_oa":false,"landing_page_url":"https://doi.org/10.1007/s11263-026-02752-z","pdf_url":null,"source":{"id":"https://openalex.org/S25538012","display_name":"International Journal of Computer Vision","issn_l":"0920-5691","issn":["0920-5691","1573-1405"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319900","host_organization_name":"Springer Science+Business Media","host_organization_lineage":["https://openalex.org/P4310319900","https://openalex.org/P4310319965"],"host_organization_lineage_names":["Springer Science+Business Media","Springer Nature"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"International Journal of Computer Vision","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Yasheng Sun","orcid":"https://orcid.org/0000-0002-0589-4424"},"institutions":[{"id":"https://openalex.org/I114531698","display_name":"Tokyo Institute of Technology","ror":"https://ror.org/0112mx960","country_code":"JP","type":"education","lineage":["https://openalex.org/I114531698"]},{"id":"https://openalex.org/I71920554","display_name":"King Abdullah University of Science and Technology","ror":"https://ror.org/01q3tbs38","country_code":"SA","type":"education","lineage":["https://openalex.org/I71920554"]}],"countries":["JP","SA"],"is_corresponding":true,"raw_author_name":"Yasheng Sun","raw_affiliation_strings":["Center of Excellence for Generative AI, King Abdullah University of Science and Technology, Thuwal, 23955-6900, Jeddah, Saudi Arabia","School of Computing, Tokyo Institute of Technology, Ookayama, Meguro-ku, 152-8550, Tokyo, Japan"],"affiliations":[{"raw_affiliation_string":"Center of Excellence for Generative AI, King Abdullah University of Science and Technology, Thuwal, 23955-6900, Jeddah, Saudi Arabia","institution_ids":["https://openalex.org/I71920554"]},{"raw_affiliation_string":"School of Computing, Tokyo Institute of Technology, Ookayama, Meguro-ku, 152-8550, Tokyo, Japan","institution_ids":["https://openalex.org/I114531698"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5123743842","display_name":"Zhiliang Xu","orcid":null},"institutions":[{"id":"https://openalex.org/I98301712","display_name":"Baidu (China)","ror":"https://ror.org/03vs3wt56","country_code":"CN","type":"company","lineage":["https://openalex.org/I98301712"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhiliang Xu","raw_affiliation_strings":["Baidu VIS, Shangdi 10th Street, 100085, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Baidu VIS, Shangdi 10th Street, 100085, Beijing, China","institution_ids":["https://openalex.org/I98301712"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5123765851","display_name":"Hang Zhou","orcid":null},"institutions":[{"id":"https://openalex.org/I98301712","display_name":"Baidu (China)","ror":"https://ror.org/03vs3wt56","country_code":"CN","type":"company","lineage":["https://openalex.org/I98301712"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Hang Zhou","raw_affiliation_strings":["Baidu VIS, Shangdi 10th Street, 100085, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Baidu VIS, Shangdi 10th Street, 100085, Beijing, China","institution_ids":["https://openalex.org/I98301712"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5014800774","display_name":"Jiazhi Guan","orcid":"https://orcid.org/0000-0001-5219-1097"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jiazhi Guan","raw_affiliation_strings":["Department of Computer Science and Technology, Tsinghua University, Haidian District, Shuangqing Road, 100084, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Department of Computer Science and Technology, Tsinghua University, Haidian District, Shuangqing Road, 100084, Beijing, China","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5123726576","display_name":"Quanwei Yang","orcid":null},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Quanwei Yang","raw_affiliation_strings":["Department of Electronic Engineering and Information Science, University of Science and Technology of China, Baohe District, JinZhai Road, 230026, Hefei, Anhui, China"],"affiliations":[{"raw_affiliation_string":"Department of Electronic Engineering and Information Science, University of Science and Technology of China, Baohe District, JinZhai Road, 230026, Hefei, Anhui, China","institution_ids":["https://openalex.org/I126520041"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5002386773","display_name":"Kaisiyuan Wang","orcid":"https://orcid.org/0000-0002-2120-8383"},"institutions":[{"id":"https://openalex.org/I98301712","display_name":"Baidu (China)","ror":"https://ror.org/03vs3wt56","country_code":"CN","type":"company","lineage":["https://openalex.org/I98301712"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Kaisiyuan Wang","raw_affiliation_strings":["Baidu VIS, Shangdi 10th Street, 100085, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Baidu VIS, Shangdi 10th Street, 100085, Beijing, China","institution_ids":["https://openalex.org/I98301712"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5075497345","display_name":"Borong Liang","orcid":"https://orcid.org/0000-0002-3036-6754"},"institutions":[{"id":"https://openalex.org/I98301712","display_name":"Baidu (China)","ror":"https://ror.org/03vs3wt56","country_code":"CN","type":"company","lineage":["https://openalex.org/I98301712"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Borong Liang","raw_affiliation_strings":["Baidu VIS, Shangdi 10th Street, 100085, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Baidu VIS, Shangdi 10th Street, 100085, Beijing, China","institution_ids":["https://openalex.org/I98301712"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5123781707","display_name":"Yingying Li","orcid":null},"institutions":[{"id":"https://openalex.org/I98301712","display_name":"Baidu (China)","ror":"https://ror.org/03vs3wt56","country_code":"CN","type":"company","lineage":["https://openalex.org/I98301712"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yingying Li","raw_affiliation_strings":["Baidu VIS, Shangdi 10th Street, 100085, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Baidu VIS, Shangdi 10th Street, 100085, Beijing, China","institution_ids":["https://openalex.org/I98301712"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5123763783","display_name":"Haocheng Feng","orcid":null},"institutions":[{"id":"https://openalex.org/I98301712","display_name":"Baidu (China)","ror":"https://ror.org/03vs3wt56","country_code":"CN","type":"company","lineage":["https://openalex.org/I98301712"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Haocheng Feng","raw_affiliation_strings":["Baidu VIS, Shangdi 10th Street, 100085, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Baidu VIS, Shangdi 10th Street, 100085, Beijing, China","institution_ids":["https://openalex.org/I98301712"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5123747438","display_name":"Jingdong Wang","orcid":null},"institutions":[{"id":"https://openalex.org/I98301712","display_name":"Baidu (China)","ror":"https://ror.org/03vs3wt56","country_code":"CN","type":"company","lineage":["https://openalex.org/I98301712"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jingdong Wang","raw_affiliation_strings":["Baidu VIS, Shangdi 10th Street, 100085, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Baidu VIS, Shangdi 10th Street, 100085, Beijing, China","institution_ids":["https://openalex.org/I98301712"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5123749848","display_name":"Ziwei Liu","orcid":null},"institutions":[{"id":"https://openalex.org/I172675005","display_name":"Nanyang Technological University","ror":"https://ror.org/02e7b5302","country_code":"SG","type":"education","lineage":["https://openalex.org/I172675005"]}],"countries":["SG"],"is_corresponding":false,"raw_author_name":"Ziwei Liu","raw_affiliation_strings":["S-Lab, Nanyang Technological University, 50 Nanyang Avenue, 639798, Nanyang, Singapore"],"affiliations":[{"raw_affiliation_string":"S-Lab, Nanyang Technological University, 50 Nanyang Avenue, 639798, Nanyang, Singapore","institution_ids":["https://openalex.org/I172675005"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5123747870","display_name":"Koike Hideki","orcid":null},"institutions":[{"id":"https://openalex.org/I114531698","display_name":"Tokyo Institute of Technology","ror":"https://ror.org/0112mx960","country_code":"JP","type":"education","lineage":["https://openalex.org/I114531698"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Koike Hideki","raw_affiliation_strings":["School of Computing, Tokyo Institute of Technology, Ookayama, Meguro-ku, 152-8550, Tokyo, Japan"],"affiliations":[{"raw_affiliation_string":"School of Computing, Tokyo Institute of Technology, Ookayama, Meguro-ku, 152-8550, Tokyo, Japan","institution_ids":["https://openalex.org/I114531698"]}]}],"institutions":[],"countries_distinct_count":4,"institutions_distinct_count":12,"corresponding_author_ids":[],"corresponding_institution_ids":["https://openalex.org/I114531698","https://openalex.org/I71920554"],"apc_list":{"value":2890,"currency":"EUR","value_usd":3690},"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.17766885,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"134","issue":"3","first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.448199987411499,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.448199987411499,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11448","display_name":"Face recognition and analysis","score":0.2948000133037567,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12290","display_name":"Human Motion and Animation","score":0.07010000199079514,"subfield":{"id":"https://openalex.org/subfields/2207","display_name":"Control and Systems Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/gesture","display_name":"Gesture","score":0.8463000059127808},{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.652899980545044},{"id":"https://openalex.org/keywords/probabilistic-logic","display_name":"Probabilistic logic","score":0.4595000147819519},{"id":"https://openalex.org/keywords/gesture-recognition","display_name":"Gesture recognition","score":0.4246000051498413},{"id":"https://openalex.org/keywords/hidden-markov-model","display_name":"Hidden Markov model","score":0.2994999885559082},{"id":"https://openalex.org/keywords/speech-synthesis","display_name":"Speech synthesis","score":0.29840001463890076}],"concepts":[{"id":"https://openalex.org/C207347870","wikidata":"https://www.wikidata.org/wiki/Q371174","display_name":"Gesture","level":2,"score":0.8463000059127808},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7908999919891357},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.652899980545044},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5648999810218811},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.5565000176429749},{"id":"https://openalex.org/C49937458","wikidata":"https://www.wikidata.org/wiki/Q2599292","display_name":"Probabilistic logic","level":2,"score":0.4595000147819519},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.4318999946117401},{"id":"https://openalex.org/C159437735","wikidata":"https://www.wikidata.org/wiki/Q1519524","display_name":"Gesture recognition","level":3,"score":0.4246000051498413},{"id":"https://openalex.org/C23224414","wikidata":"https://www.wikidata.org/wiki/Q176769","display_name":"Hidden Markov model","level":2,"score":0.2994999885559082},{"id":"https://openalex.org/C14999030","wikidata":"https://www.wikidata.org/wiki/Q16346","display_name":"Speech synthesis","level":2,"score":0.29840001463890076},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.2745000123977661},{"id":"https://openalex.org/C65483669","wikidata":"https://www.wikidata.org/wiki/Q3536669","display_name":"Video processing","level":2,"score":0.2563000023365021},{"id":"https://openalex.org/C2776449333","wikidata":"https://www.wikidata.org/wiki/Q7928781","display_name":"View synthesis","level":3,"score":0.2547000050544739}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1007/s11263-026-02752-z","is_oa":false,"landing_page_url":"https://doi.org/10.1007/s11263-026-02752-z","pdf_url":null,"source":{"id":"https://openalex.org/S25538012","display_name":"International Journal of Computer Vision","issn_l":"0920-5691","issn":["0920-5691","1573-1405"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319900","host_organization_name":"Springer Science+Business Media","host_organization_lineage":["https://openalex.org/P4310319900","https://openalex.org/P4310319965"],"host_organization_lineage_names":["Springer Science+Business Media","Springer Nature"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"International Journal of Computer Vision","raw_type":"journal-article"},{"id":"pmh:oai:dr.ntu.edu.sg:10356/211751","is_oa":false,"landing_page_url":"https://hdl.handle.net/10356/211751","pdf_url":null,"source":{"id":"https://openalex.org/S4306402609","display_name":"DR-NTU (Nanyang Technological University)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I172675005","host_organization_name":"Nanyang Technological University","host_organization_lineage":["https://openalex.org/I172675005"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Journal Article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":75,"referenced_works":["https://openalex.org/W215088529","https://openalex.org/W1967554269","https://openalex.org/W1969784320","https://openalex.org/W2008208299","https://openalex.org/W2011611983","https://openalex.org/W2024536104","https://openalex.org/W2133665775","https://openalex.org/W2235920218","https://openalex.org/W2559085405","https://openalex.org/W2768683308","https://openalex.org/W2901285216","https://openalex.org/W2922298118","https://openalex.org/W2949924544","https://openalex.org/W2962785568","https://openalex.org/W2962795401","https://openalex.org/W2963876278","https://openalex.org/W2966421984","https://openalex.org/W2967443589","https://openalex.org/W2969985801","https://openalex.org/W2978956737","https://openalex.org/W3010876998","https://openalex.org/W3012220842","https://openalex.org/W3081492798","https://openalex.org/W3083173864","https://openalex.org/W3092887305","https://openalex.org/W3096831136","https://openalex.org/W3097792222","https://openalex.org/W3098994456","https://openalex.org/W3115266783","https://openalex.org/W3124346429","https://openalex.org/W3176721746","https://openalex.org/W3180355996","https://openalex.org/W3186090335","https://openalex.org/W3194872882","https://openalex.org/W3197139780","https://openalex.org/W3198131199","https://openalex.org/W4200630629","https://openalex.org/W4214886549","https://openalex.org/W4224988866","https://openalex.org/W4230312136","https://openalex.org/W4230429791","https://openalex.org/W4281877842","https://openalex.org/W4283818626","https://openalex.org/W4304080460","https://openalex.org/W4310379947","https://openalex.org/W4312388283","https://openalex.org/W4312437946","https://openalex.org/W4312674262","https://openalex.org/W4312933868","https://openalex.org/W4360846436","https://openalex.org/W4364377334","https://openalex.org/W4385275735","https://openalex.org/W4385284180","https://openalex.org/W4385764101","https://openalex.org/W4386065848","https://openalex.org/W4386071653","https://openalex.org/W4386075984","https://openalex.org/W4386076103","https://openalex.org/W4390190334","https://openalex.org/W4390872297","https://openalex.org/W4390872556","https://openalex.org/W4390874168","https://openalex.org/W4391807605","https://openalex.org/W4396696312","https://openalex.org/W4399206662","https://openalex.org/W4402704510","https://openalex.org/W4402704593","https://openalex.org/W4402727140","https://openalex.org/W4402727178","https://openalex.org/W4402727180","https://openalex.org/W4402753998","https://openalex.org/W4402754063","https://openalex.org/W4402754210","https://openalex.org/W4404965692","https://openalex.org/W4413147584"],"related_works":[],"abstract_inverted_index":null,"counts_by_year":[],"updated_date":"2026-04-04T08:04:53.788161","created_date":"2026-01-25T00:00:00"}
