{"id":"https://openalex.org/W4415428829","doi":"https://doi.org/10.3233/faia250794","title":"Toward Realistic Co-Speech Motion via Cross-Modal Spatial-Temporal Attention and Hand Memory Module","display_name":"Toward Realistic Co-Speech Motion via Cross-Modal Spatial-Temporal Attention and Hand Memory Module","publication_year":2025,"publication_date":"2025-10-21","ids":{"openalex":"https://openalex.org/W4415428829","doi":"https://doi.org/10.3233/faia250794"},"language":null,"primary_location":{"id":"doi:10.3233/faia250794","is_oa":true,"landing_page_url":"https://doi.org/10.3233/faia250794","pdf_url":null,"source":{"id":"https://openalex.org/S4210201731","display_name":"Frontiers in artificial intelligence and applications","issn_l":"0922-6389","issn":["0922-6389","1879-8314"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"journal"},"license":"cc-by-nc","license_id":"https://openalex.org/licenses/cc-by-nc","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Frontiers in Artificial Intelligence and Applications","raw_type":"book-chapter"},"type":"book-chapter","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"hybrid","oa_url":"https://doi.org/10.3233/faia250794","any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Jiye Zhang","orcid":null},"institutions":[{"id":"https://openalex.org/I4210136793","display_name":"Peng Cheng Laboratory","ror":"https://ror.org/03qdqbt06","country_code":"CN","type":"facility","lineage":["https://openalex.org/I4210136793"]},{"id":"https://openalex.org/I75689368","display_name":"Communication University of China","ror":"https://ror.org/04facbs33","country_code":"CN","type":"education","lineage":["https://openalex.org/I75689368"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jiye Zhang","raw_affiliation_strings":["Pengcheng Laboratory, Shenzhen, China","School of Information and Communication Engineering, Communication University of China, Beijing, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Pengcheng Laboratory, Shenzhen, China","institution_ids":["https://openalex.org/I4210136793"]},{"raw_affiliation_string":"School of Information and Communication Engineering, Communication University of China, Beijing, China","institution_ids":["https://openalex.org/I75689368"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5052946583","display_name":"Dingwei Liu","orcid":"https://orcid.org/0000-0001-8402-4162"},"institutions":[{"id":"https://openalex.org/I4210136793","display_name":"Peng Cheng Laboratory","ror":"https://ror.org/03qdqbt06","country_code":"CN","type":"facility","lineage":["https://openalex.org/I4210136793"]},{"id":"https://openalex.org/I90610280","display_name":"South China University of Technology","ror":"https://ror.org/0530pts50","country_code":"CN","type":"education","lineage":["https://openalex.org/I90610280"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Dingwei Liu","raw_affiliation_strings":["Pengcheng Laboratory, Shenzhen, China","School of Future Technology, South China University of Technology, Guangzhou, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Pengcheng Laboratory, Shenzhen, China","institution_ids":["https://openalex.org/I4210136793"]},{"raw_affiliation_string":"School of Future Technology, South China University of Technology, Guangzhou, China","institution_ids":["https://openalex.org/I90610280"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5047649165","display_name":"Guibiao Liao","orcid":"https://orcid.org/0000-0002-5714-1926"},"institutions":[{"id":"https://openalex.org/I20231570","display_name":"Peking University","ror":"https://ror.org/02v51f717","country_code":"CN","type":"education","lineage":["https://openalex.org/I20231570"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Guibiao Liao","raw_affiliation_strings":["School of Electronic and Computer Engineering, Peking University, Shenzhen, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"School of Electronic and Computer Engineering, Peking University, Shenzhen, China","institution_ids":["https://openalex.org/I20231570"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5001235027","display_name":"Xiuhua Jiang","orcid":"https://orcid.org/0000-0002-2422-942X"},"institutions":[{"id":"https://openalex.org/I75689368","display_name":"Communication University of China","ror":"https://ror.org/04facbs33","country_code":"CN","type":"education","lineage":["https://openalex.org/I75689368"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xiuhua Jiang","raw_affiliation_strings":["School of Information and Communication Engineering, Communication University of China, Beijing, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"School of Information and Communication Engineering, Communication University of China, Beijing, China","institution_ids":["https://openalex.org/I75689368"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5009842584","display_name":"Jiangbo Xu","orcid":"https://orcid.org/0009-0001-1003-2175"},"institutions":[{"id":"https://openalex.org/I75689368","display_name":"Communication University of China","ror":"https://ror.org/04facbs33","country_code":"CN","type":"education","lineage":["https://openalex.org/I75689368"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jiangbo Xu","raw_affiliation_strings":["School of Information and Communication Engineering, Communication University of China, Beijing, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"School of Information and Communication Engineering, Communication University of China, Beijing, China","institution_ids":["https://openalex.org/I75689368"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.47116447,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12031","display_name":"Speech and dialogue systems","score":0.9839000105857849,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12031","display_name":"Speech and dialogue systems","score":0.9839000105857849,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11398","display_name":"Hand Gesture Recognition Systems","score":0.9768000245094299,"subfield":{"id":"https://openalex.org/subfields/1709","display_name":"Human-Computer Interaction"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13382","display_name":"Robotics and Automated Systems","score":0.9409000277519226,"subfield":{"id":"https://openalex.org/subfields/2207","display_name":"Control and Systems Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/gesture","display_name":"Gesture","score":0.7692999839782715},{"id":"https://openalex.org/keywords/motion","display_name":"Motion (physics)","score":0.5967000126838684},{"id":"https://openalex.org/keywords/embedding","display_name":"Embedding","score":0.553600013256073},{"id":"https://openalex.org/keywords/encoding","display_name":"Encoding (memory)","score":0.5202999711036682},{"id":"https://openalex.org/keywords/synchronization","display_name":"Synchronization (alternating current)","score":0.45660001039505005},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.44179999828338623},{"id":"https://openalex.org/keywords/autoencoder","display_name":"Autoencoder","score":0.4269999861717224},{"id":"https://openalex.org/keywords/consistency","display_name":"Consistency (knowledge bases)","score":0.42170000076293945},{"id":"https://openalex.org/keywords/movement","display_name":"Movement (music)","score":0.4115000069141388}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8337000012397766},{"id":"https://openalex.org/C207347870","wikidata":"https://www.wikidata.org/wiki/Q371174","display_name":"Gesture","level":2,"score":0.7692999839782715},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6353999972343445},{"id":"https://openalex.org/C104114177","wikidata":"https://www.wikidata.org/wiki/Q79782","display_name":"Motion (physics)","level":2,"score":0.5967000126838684},{"id":"https://openalex.org/C41608201","wikidata":"https://www.wikidata.org/wiki/Q980509","display_name":"Embedding","level":2,"score":0.553600013256073},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.5321000218391418},{"id":"https://openalex.org/C125411270","wikidata":"https://www.wikidata.org/wiki/Q18653","display_name":"Encoding (memory)","level":2,"score":0.5202999711036682},{"id":"https://openalex.org/C2778562939","wikidata":"https://www.wikidata.org/wiki/Q1298791","display_name":"Synchronization (alternating current)","level":3,"score":0.45660001039505005},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.44179999828338623},{"id":"https://openalex.org/C101738243","wikidata":"https://www.wikidata.org/wiki/Q786435","display_name":"Autoencoder","level":3,"score":0.4269999861717224},{"id":"https://openalex.org/C2776436953","wikidata":"https://www.wikidata.org/wiki/Q5163215","display_name":"Consistency (knowledge bases)","level":2,"score":0.42170000076293945},{"id":"https://openalex.org/C2780226923","wikidata":"https://www.wikidata.org/wiki/Q929848","display_name":"Movement (music)","level":2,"score":0.4115000069141388},{"id":"https://openalex.org/C2776608160","wikidata":"https://www.wikidata.org/wiki/Q4785462","display_name":"Natural (archaeology)","level":2,"score":0.37119999527931213},{"id":"https://openalex.org/C103278499","wikidata":"https://www.wikidata.org/wiki/Q254465","display_name":"Similarity (geometry)","level":3,"score":0.3562000095844269},{"id":"https://openalex.org/C159437735","wikidata":"https://www.wikidata.org/wiki/Q1519524","display_name":"Gesture recognition","level":3,"score":0.34299999475479126},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.3158000111579895},{"id":"https://openalex.org/C2777708103","wikidata":"https://www.wikidata.org/wiki/Q852589","display_name":"Motion blur","level":3,"score":0.30239999294281006},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.2944999933242798},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.2930999994277954},{"id":"https://openalex.org/C18555067","wikidata":"https://www.wikidata.org/wiki/Q8375051","display_name":"Joint (building)","level":2,"score":0.2904999852180481},{"id":"https://openalex.org/C89611455","wikidata":"https://www.wikidata.org/wiki/Q6804646","display_name":"Mechanism (biology)","level":2,"score":0.2800000011920929},{"id":"https://openalex.org/C2776187449","wikidata":"https://www.wikidata.org/wiki/Q1513879","display_name":"Natural language generation","level":3,"score":0.27889999747276306},{"id":"https://openalex.org/C2780226545","wikidata":"https://www.wikidata.org/wiki/Q6888030","display_name":"Modality (human\u2013computer interaction)","level":2,"score":0.2734000086784363},{"id":"https://openalex.org/C145912823","wikidata":"https://www.wikidata.org/wiki/Q113558","display_name":"Dynamics (music)","level":2,"score":0.2696000039577484},{"id":"https://openalex.org/C48007421","wikidata":"https://www.wikidata.org/wiki/Q676252","display_name":"Motion capture","level":3,"score":0.2671000063419342},{"id":"https://openalex.org/C194969405","wikidata":"https://www.wikidata.org/wiki/Q170519","display_name":"Virtual reality","level":2,"score":0.2644999921321869},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.2623000144958496},{"id":"https://openalex.org/C43214815","wikidata":"https://www.wikidata.org/wiki/Q7310987","display_name":"Reliability (semiconductor)","level":3,"score":0.2565999925136566},{"id":"https://openalex.org/C126042441","wikidata":"https://www.wikidata.org/wiki/Q1324888","display_name":"Frame (networking)","level":2,"score":0.2540999948978424}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.3233/faia250794","is_oa":true,"landing_page_url":"https://doi.org/10.3233/faia250794","pdf_url":null,"source":{"id":"https://openalex.org/S4210201731","display_name":"Frontiers in artificial intelligence and applications","issn_l":"0922-6389","issn":["0922-6389","1879-8314"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"journal"},"license":"cc-by-nc","license_id":"https://openalex.org/licenses/cc-by-nc","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Frontiers in Artificial Intelligence and Applications","raw_type":"book-chapter"}],"best_oa_location":{"id":"doi:10.3233/faia250794","is_oa":true,"landing_page_url":"https://doi.org/10.3233/faia250794","pdf_url":null,"source":{"id":"https://openalex.org/S4210201731","display_name":"Frontiers in artificial intelligence and applications","issn_l":"0922-6389","issn":["0922-6389","1879-8314"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"journal"},"license":"cc-by-nc","license_id":"https://openalex.org/licenses/cc-by-nc","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Frontiers in Artificial Intelligence and Applications","raw_type":"book-chapter"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Co-speech":[0],"video":[1,115],"generation":[2,45,156],"focuses":[3],"on":[4,184],"improving":[5],"the":[6,37,68,99,106,120,155,185],"authenticity":[7],"of":[8,64,87,125],"virtual":[9],"characters":[10],"by":[11],"aligning":[12],"their":[13],"gestures":[14],"and":[15,31,58,80,89,105,113,123,180,187,203,213],"facial":[16],"expressions":[17],"with":[18,28,54,200],"spoken":[19],"audio.":[20],"Despite":[21],"recent":[22],"advancements,":[23],"existing":[24,194],"methods":[25,195],"often":[26],"struggle":[27],"speech-gesture":[29],"misalignment":[30],"unnatural":[32],"hand":[33,88,205],"motions.":[34],"To":[35],"address":[36],"issues,":[38],"we":[39,128],"propose":[40],"a":[41,50,130,138,145],"novel":[42],"audio-driven":[43],"gesture":[44,147],"framework.":[46],"This":[47,135],"framework":[48],"integrates":[49],"hierarchical":[51],"diffusion":[52],"model":[53],"multimodal":[55],"feature":[56],"disentanglement":[57],"dynamic":[59],"fusion":[60],"strategies.":[61],"The":[62],"core":[63],"our":[65,158],"approach":[66],"is":[67],"Cross-modal":[69],"Spatial-Temporal":[70],"Attention":[71],"mechanism":[72],"(CSTA),":[73],"which":[74],"ensures":[75],"high-fidelity":[76],"synchronization":[77],"between":[78,101],"audio":[79,107],"human":[81],"motion":[82,95,178],"while":[83],"capturing":[84],"fine-grained":[85],"dynamics":[86],"facial.":[90],"By":[91,149],"effectively":[92],"disentangling":[93],"different":[94],"modalities,":[96],"CSTA":[97,192],"enhances":[98,162],"alignment":[100],"body":[102],"part":[103],"movements":[104],"signal,":[108],"leading":[109],"to":[110,118,143],"more":[111,201],"natural":[112,204],"coherent":[114],"synthesis.":[116],"Furthermore,":[117],"improve":[119],"physical":[121],"plausibility":[122],"diversity":[124],"generated":[126],"gestures,":[127],"introduce":[129],"Hand":[131],"Memory":[132],"Module":[133],"(HMM).":[134],"module":[136],"leverages":[137],"Vector":[139],"Quantization-Variational":[140],"Autoencoder":[141],"(VQ-VAE)":[142],"learn":[144],"discrete":[146],"prior.":[148],"embedding":[150],"these":[151],"learned":[152],"priors":[153],"during":[154],"process,":[157],"method":[159],"not":[160],"only":[161],"temporal":[163],"consistency":[164],"but":[165],"also":[166],"preserves":[167],"intricate":[168],"details,":[169],"mitigating":[170],"common":[171],"issues":[172],"in":[173,196,210],"prior":[174],"work":[175],"such":[176],"as":[177],"blur":[179],"detail":[181],"loss.":[182],"Experiments":[183],"PATS":[186],"BEAT2":[188],"datasets":[189],"demonstrate":[190],"that":[191],"surpasses":[193],"generating":[197],"co-speech":[198],"videos":[199],"synchronized":[202],"motions,":[206],"achieving":[207],"state-of-the-art":[208],"performance":[209],"both":[211],"qualitative":[212],"quantitative":[214],"evaluations.":[215],"Project":[216],"Page:":[217],"CSTA-HMM":[218]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-24T00:00:00"}
