{"id":"https://openalex.org/W4411015395","doi":"https://doi.org/10.1145/3728725.3728752","title":"Audio-driven Talking-face Synthesis based on 3D Gaussian","display_name":"Audio-driven Talking-face Synthesis based on 3D Gaussian","publication_year":2025,"publication_date":"2025-02-21","ids":{"openalex":"https://openalex.org/W4411015395","doi":"https://doi.org/10.1145/3728725.3728752"},"language":"en","primary_location":{"id":"doi:10.1145/3728725.3728752","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3728725.3728752","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3728725.3728752","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2025 2nd International Conference on Generative Artificial Intelligence and Information Security","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://dl.acm.org/doi/pdf/10.1145/3728725.3728752","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Botao Xiong","orcid":"https://orcid.org/0009-0003-5902-2423"},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Botao Xiong","raw_affiliation_strings":["University of Science and Technology of China, Hefei, Anhui, China"],"raw_orcid":"https://orcid.org/0009-0003-5902-2423","affiliations":[{"raw_affiliation_string":"University of Science and Technology of China, Hefei, Anhui, China","institution_ids":["https://openalex.org/I126520041"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":1,"corresponding_author_ids":[],"corresponding_institution_ids":["https://openalex.org/I126520041"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":{"value":0.10010619,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"171","last_page":"177"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11448","display_name":"Face recognition and analysis","score":0.9983999729156494,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11448","display_name":"Face recognition and analysis","score":0.9983999729156494,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9921000003814697,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.9909999966621399,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/face","display_name":"Face (sociological concept)","score":0.6311514973640442},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.62396240234375},{"id":"https://openalex.org/keywords/gaussian","display_name":"Gaussian","score":0.45743611454963684},{"id":"https://openalex.org/keywords/gaussian-process","display_name":"Gaussian process","score":0.43166399002075195},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.3856722116470337},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.36173349618911743},{"id":"https://openalex.org/keywords/linguistics","display_name":"Linguistics","score":0.08334606885910034},{"id":"https://openalex.org/keywords/physics","display_name":"Physics","score":0.0758199691772461}],"concepts":[{"id":"https://openalex.org/C2779304628","wikidata":"https://www.wikidata.org/wiki/Q3503480","display_name":"Face (sociological concept)","level":2,"score":0.6311514973640442},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.62396240234375},{"id":"https://openalex.org/C163716315","wikidata":"https://www.wikidata.org/wiki/Q901177","display_name":"Gaussian","level":2,"score":0.45743611454963684},{"id":"https://openalex.org/C61326573","wikidata":"https://www.wikidata.org/wiki/Q1496376","display_name":"Gaussian process","level":3,"score":0.43166399002075195},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.3856722116470337},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.36173349618911743},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.08334606885910034},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0758199691772461},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3728725.3728752","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3728725.3728752","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3728725.3728752","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2025 2nd International Conference on Generative Artificial Intelligence and Information Security","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/3728725.3728752","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3728725.3728752","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3728725.3728752","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2025 2nd International Conference on Generative Artificial Intelligence and Information Security","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":true,"grobid_xml":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4411015395.pdf","grobid_xml":"https://content.openalex.org/works/W4411015395.grobid-xml"},"referenced_works_count":19,"referenced_works":["https://openalex.org/W2099471712","https://openalex.org/W2147885303","https://openalex.org/W2769666294","https://openalex.org/W2884460600","https://openalex.org/W2963073614","https://openalex.org/W2963290645","https://openalex.org/W2979894294","https://openalex.org/W3006410788","https://openalex.org/W3099284785","https://openalex.org/W3109585842","https://openalex.org/W3199414382","https://openalex.org/W3211147706","https://openalex.org/W4200630629","https://openalex.org/W4224929261","https://openalex.org/W4309876056","https://openalex.org/W4367701091","https://openalex.org/W4384811738","https://openalex.org/W4385318467","https://openalex.org/W4386044235"],"related_works":["https://openalex.org/W3188962172","https://openalex.org/W2772917594","https://openalex.org/W4312825515","https://openalex.org/W4306742369","https://openalex.org/W4303457083","https://openalex.org/W2131146434","https://openalex.org/W2951359407","https://openalex.org/W1964286703","https://openalex.org/W2169866437","https://openalex.org/W3056417032"],"abstract_inverted_index":{"Audio-driven":[0],"talking-face":[1],"synthesis":[2],"is":[3,146],"of":[4,98,104,123,194,238],"significant":[5],"importance":[6],"in":[7,22,141],"various":[8],"application":[9],"scenarios,":[10],"such":[11],"as":[12],"remote":[13],"meetings,":[14],"AR/VR,":[15],"and":[16,35,65,100,130,179,188,246,275,279,303],"digital":[17],"humans.":[18],"Currently,":[19],"the":[20,68,96,107,124,153,168,185,214,219,242,247,257,266,270,288,292,296,306],"work":[21],"this":[23,159,161,197,318],"field":[24],"can":[25,85,320],"be":[26],"broadly":[27],"divided":[28],"into":[29,204,273],"two":[30,239],"categories:":[31],"implicit":[32,105],"representation-based":[33,37,40,116],"methods":[34,41,117,137],"explicit":[36],"methods.":[38,114],"Implicit":[39],"often":[42],"use":[43,119],"neural":[44],"network":[45],"models":[46],"to":[47,61,95,148,212,261,301,310],"represent":[48,62],"human":[49,63],"face.":[50],"For":[51],"example,":[52],"NeRF":[53,71],"(Neural":[54],"Radiance":[55],"Field)":[56],"uses":[57,256],"MLP":[58],"(Multi-Layer":[59],"Perceptron)":[60],"face,":[64,125],"by":[66,218,222],"inputting":[67],"camera":[69],"pose,":[70],"renders":[72],"a":[73,192,228,312],"face":[74,87],"image":[75,298],"from":[76,299],"that":[77,264,317],"viewpoint.":[78],"When":[79],"training":[80],"with":[81,89,167,175,191,291],"audio":[82,199,259,271],"features,":[83],"it":[84,145],"render":[86],"images":[88,308],"different":[90],"facial":[91,189,207,280],"movements.":[92,281],"However,":[93,144],"due":[94],"lack":[97],"editability":[99,129],"slow":[101],"rendering":[102,133,154,223,250,285],"speed":[103],"representations,":[106],"industry":[108],"has":[109],"been":[110],"exploring":[111],"new":[112],"representation":[113],"Explicit":[115],"commonly":[118],"3D":[120,164,169,215,229,248,283,289],"Gaussian":[121,165,249,284,290],"representations":[122,166],"which":[126,209],"offers":[127],"high":[128],"nearly":[131],"real-time":[132],"speed,":[134],"making":[135],"these":[136],"surpass":[138],"implicit-based":[139],"approaches":[140],"many":[142],"metrics.":[143],"difficult":[147],"incorporate":[149],"other":[150],"signals":[151,200],"during":[152],"optimization":[155],"process.":[156],"To":[157],"address":[158],"issue,":[160],"paper":[162,226],"combines":[163],"head":[170],"model":[171,183,216],"FLAME":[172,182,267,293],"(Faces":[173],"Learned":[174],"an":[176],"Articulated":[177],"Model":[178],"Expressions).":[180],"The":[181,252,282],"represents":[184],"face's":[186],"shape":[187],"movements":[190],"set":[193],"parameters.":[195],"In":[196],"paper,":[198],"are":[201,210],"first":[202],"transformed":[203],"parameters":[205,263],"representing":[206],"movements,":[208],"used":[211],"initialize":[213],"generated":[217,307],"parameters,":[220],"followed":[221],"optimization.":[224],"This":[225],"proposes":[227],"Gaussian-based":[230],"audio-driven":[231],"lip-sync":[232],"video":[233],"generation":[234],"system.":[235],"It":[236],"consists":[237],"main":[240],"modules:":[241],"audio-to-facial":[243,253],"movement":[244,254],"module":[245,255,286],"module.":[251],"extracted":[258],"features":[260],"generate":[262,321],"control":[265],"model,":[268,294],"converting":[269],"input":[272],"natural":[274],"reasonable":[276],"lip":[277],"shapes":[278],"initializes":[287],"optimizes":[295],"rendered":[297],"coarse":[300],"fine,":[302],"finally":[304],"stitches":[305],"together":[309],"create":[311],"continuous":[313],"video.":[314],"Experiments":[315],"show":[316],"system":[319],"lip-synchronized,":[322],"accurate,":[323],"high-fidelity":[324],"videos.":[325]},"counts_by_year":[],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
