{"id":"https://openalex.org/W4417113975","doi":"https://doi.org/10.1145/3769748.3773363","title":"FlowTalk: Real-Time Audio-Driven Talking Head Synthesis via Motion-Space Flow Matching","display_name":"FlowTalk: Real-Time Audio-Driven Talking Head Synthesis via Motion-Space Flow Matching","publication_year":2025,"publication_date":"2025-12-08","ids":{"openalex":"https://openalex.org/W4417113975","doi":"https://doi.org/10.1145/3769748.3773363"},"language":null,"primary_location":{"id":"doi:10.1145/3769748.3773363","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3769748.3773363","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3769748.3773363","source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 7th ACM International Conference on Multimedia in Asia","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://dl.acm.org/doi/pdf/10.1145/3769748.3773363","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5109635081","display_name":"K F Deng","orcid":"https://orcid.org/0009-0009-1352-2952"},"institutions":[{"id":"https://openalex.org/I180726961","display_name":"Shenzhen University","ror":"https://ror.org/01vy4gh70","country_code":"CN","type":"education","lineage":["https://openalex.org/I180726961"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Kaijun Deng","raw_affiliation_strings":["Shenzhen University, Shenzhen, China"],"raw_orcid":"https://orcid.org/0009-0009-1352-2952","affiliations":[{"raw_affiliation_string":"Shenzhen University, Shenzhen, China","institution_ids":["https://openalex.org/I180726961"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Yuhang Guo","orcid":"https://orcid.org/0009-0009-7301-2284"},"institutions":[{"id":"https://openalex.org/I180726961","display_name":"Shenzhen University","ror":"https://ror.org/01vy4gh70","country_code":"CN","type":"education","lineage":["https://openalex.org/I180726961"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yuhang Guo","raw_affiliation_strings":["Shenzhen University, Shenzhen, China"],"raw_orcid":"https://orcid.org/0009-0009-7301-2284","affiliations":[{"raw_affiliation_string":"Shenzhen University, Shenzhen, China","institution_ids":["https://openalex.org/I180726961"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5019313200","display_name":"Linlin Shen","orcid":"https://orcid.org/0000-0003-1420-0815"},"institutions":[{"id":"https://openalex.org/I180726961","display_name":"Shenzhen University","ror":"https://ror.org/01vy4gh70","country_code":"CN","type":"education","lineage":["https://openalex.org/I180726961"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Linlin Shen","raw_affiliation_strings":["Shenzhen University, Shenzhen, China"],"raw_orcid":"https://orcid.org/0000-0003-1420-0815","affiliations":[{"raw_affiliation_string":"Shenzhen University, Shenzhen, China","institution_ids":["https://openalex.org/I180726961"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":true,"cited_by_count":1,"citation_normalized_percentile":{"value":0.34848799,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"6"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.713699996471405,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.713699996471405,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11448","display_name":"Face recognition and analysis","score":0.2021999955177307,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.0142000000923872,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/naturalness","display_name":"Naturalness","score":0.4837000072002411},{"id":"https://openalex.org/keywords/speedup","display_name":"Speedup","score":0.42750000953674316},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.42570000886917114},{"id":"https://openalex.org/keywords/optical-flow","display_name":"Optical flow","score":0.40610000491142273},{"id":"https://openalex.org/keywords/motion","display_name":"Motion (physics)","score":0.3598000109195709},{"id":"https://openalex.org/keywords/solver","display_name":"Solver","score":0.3537999987602234},{"id":"https://openalex.org/keywords/face","display_name":"Face (sociological concept)","score":0.31769999861717224}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7946000099182129},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5342000126838684},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.49970000982284546},{"id":"https://openalex.org/C134537474","wikidata":"https://www.wikidata.org/wiki/Q17144832","display_name":"Naturalness","level":2,"score":0.4837000072002411},{"id":"https://openalex.org/C68339613","wikidata":"https://www.wikidata.org/wiki/Q1549489","display_name":"Speedup","level":2,"score":0.42750000953674316},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.42570000886917114},{"id":"https://openalex.org/C155542232","wikidata":"https://www.wikidata.org/wiki/Q736111","display_name":"Optical flow","level":3,"score":0.40610000491142273},{"id":"https://openalex.org/C104114177","wikidata":"https://www.wikidata.org/wiki/Q79782","display_name":"Motion (physics)","level":2,"score":0.3598000109195709},{"id":"https://openalex.org/C2778770139","wikidata":"https://www.wikidata.org/wiki/Q1966904","display_name":"Solver","level":2,"score":0.3537999987602234},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.33090001344680786},{"id":"https://openalex.org/C2779304628","wikidata":"https://www.wikidata.org/wiki/Q3503480","display_name":"Face (sociological concept)","level":2,"score":0.31769999861717224},{"id":"https://openalex.org/C2780312720","wikidata":"https://www.wikidata.org/wiki/Q5689100","display_name":"Head (geology)","level":2,"score":0.31209999322891235},{"id":"https://openalex.org/C160633673","wikidata":"https://www.wikidata.org/wiki/Q355198","display_name":"Pixel","level":2,"score":0.31200000643730164},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.3075999915599823},{"id":"https://openalex.org/C10161872","wikidata":"https://www.wikidata.org/wiki/Q557891","display_name":"Motion estimation","level":2,"score":0.3001999855041504},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.2728999853134155},{"id":"https://openalex.org/C48007421","wikidata":"https://www.wikidata.org/wiki/Q676252","display_name":"Motion capture","level":3,"score":0.25760000944137573},{"id":"https://openalex.org/C57654395","wikidata":"https://www.wikidata.org/wiki/Q1097775","display_name":"Compression artifact","level":5,"score":0.2567000091075897},{"id":"https://openalex.org/C38349280","wikidata":"https://www.wikidata.org/wiki/Q1434290","display_name":"Flow (mathematics)","level":2,"score":0.250900000333786}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3769748.3773363","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3769748.3773363","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3769748.3773363","source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 7th ACM International Conference on Multimedia in Asia","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/3769748.3773363","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3769748.3773363","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3769748.3773363","source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 7th ACM International Conference on Multimedia in Asia","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G4600207245","display_name":null,"funder_award_id":"62576216","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"}],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"}],"has_content":{"grobid_xml":false,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4417113975.pdf"},"referenced_works_count":24,"referenced_works":["https://openalex.org/W3019952993","https://openalex.org/W3081492798","https://openalex.org/W3197199219","https://openalex.org/W3209059054","https://openalex.org/W3211147706","https://openalex.org/W4200150166","https://openalex.org/W4212935216","https://openalex.org/W4312301053","https://openalex.org/W4312345918","https://openalex.org/W4382240211","https://openalex.org/W4385287058","https://openalex.org/W4385318467","https://openalex.org/W4386071653","https://openalex.org/W4386072021","https://openalex.org/W4387968147","https://openalex.org/W4390872116","https://openalex.org/W4393148973","https://openalex.org/W4400818936","https://openalex.org/W4403792004","https://openalex.org/W4408352908","https://openalex.org/W4413147355","https://openalex.org/W4413147577","https://openalex.org/W4415540427","https://openalex.org/W4415799309"],"related_works":[],"abstract_inverted_index":{"Audio-driven":[0],"talking":[1,66,212],"head":[2,67,189,213],"synthesis":[3],"has":[4],"achieved":[5],"significant":[6,91],"progress,":[7],"yet":[8],"existing":[9],"methods":[10,38],"face":[11],"critical":[12],"trade-offs":[13],"among":[14],"generation":[15,95,214],"quality,":[16],"inference":[17,31,162],"efficiency,":[18,192],"and":[19,49,110,142,188,207],"cross-ethnic":[20,132],"generalization.":[21,51],"Diffusion-based":[22],"approaches":[23],"produce":[24],"high-fidelity":[25,65],"results":[26,152],"but":[27],"suffer":[28],"from":[29,108],"slow":[30],"due":[32],"to":[33,76,104,121],"iterative":[34],"denoising,":[35],"while":[36,93,178],"GAN-based":[37],"achieve":[39],"faster":[40,171],"speed":[41],"at":[42],"the":[43,148],"cost":[44],"of":[45,140],"reduced":[46],"motion":[47,79,84,102,124],"naturalness":[48],"limited":[50],"To":[52,130],"address":[53],"these":[54],"challenges,":[55],"we":[56,98,134],"propose":[57],"FlowTalk,":[58],"a":[59,82,118,137],"novel":[60],"framework":[61,203],"that":[62,154],"enables":[63,198],"real-time":[64,200,211],"video":[68],"synthesis.":[69],"Our":[70,202],"approach":[71],"leverages":[72],"Flow":[73],"Matching":[74],"technology":[75],"perform":[77],"efficient":[78],"modeling":[80],"in":[81,160,183],"decoupled":[83],"space":[85],"rather":[86],"than":[87,172],"pixel":[88],"space,":[89],"achieving":[90],"speedup":[92],"maintaining":[94],"quality.":[96],"Specifically,":[97],"adopt":[99],"an":[100,112,205],"off-the-shelf":[101],"extractor":[103],"disentangle":[105],"facial":[106,186],"appearance":[107],"motion,":[109],"employ":[111],"OT-based":[113],"flow":[114],"matching":[115],"model":[116],"with":[117,145,163,175],"transformer":[119],"architecture":[120],"predict":[122],"identity-agnostic":[123],"sequences":[125],"conditioned":[126],"on":[127,136],"audio":[128,149],"features.":[129],"improve":[131],"generalization,":[133],"train":[135],"balanced":[138],"combination":[139],"DH-FaceVid-1K":[141],"HDTF":[143],"datasets":[144],"HuBert-CN":[146],"as":[147],"encoder.":[150],"Experimental":[151],"demonstrate":[153],"FlowTalk":[155],"achieves":[156],"over":[157],"100":[158],"FPS":[159],"motion-space":[161],"32":[164],"ODE":[165],"solver":[166],"steps,":[167,177],"approximately":[168],"5":[169],"times":[170],"diffusion-based":[173],"baselines":[174],"500":[176],"preserving":[179],"comparable":[180],"visual":[181],"quality":[182],"lip":[184],"synchronization,":[185],"expressions,":[187],"movements.":[190],"This":[191],"further":[193],"enhanced":[194],"through":[195],"TensorRT":[196],"deployment,":[197],"truly":[199],"generation.":[201],"provides":[204],"effective":[206],"practical":[208],"solution":[209],"for":[210],"applications.":[215]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-12-08T00:00:00"}
