{"id":"https://openalex.org/W4414539781","doi":"https://doi.org/10.23919/mva65244.2025.11175056","title":"Low-Latency Real-Time Audio-Driven Talking Head Generation Based on Future Speech Feature Prediction","display_name":"Low-Latency Real-Time Audio-Driven Talking Head Generation Based on Future Speech Feature Prediction","publication_year":2025,"publication_date":"2025-07-26","ids":{"openalex":"https://openalex.org/W4414539781","doi":"https://doi.org/10.23919/mva65244.2025.11175056"},"language":"en","primary_location":{"id":"doi:10.23919/mva65244.2025.11175056","is_oa":false,"landing_page_url":"https://doi.org/10.23919/mva65244.2025.11175056","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 19th International Conference on Machine Vision and Applications (MVA)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5069258336","display_name":"Weijie Guo","orcid":"https://orcid.org/0000-0001-9635-3342"},"institutions":[{"id":"https://openalex.org/I131231118","display_name":"Aoyama Gakuin University","ror":"https://ror.org/002rw7y37","country_code":"JP","type":"education","lineage":["https://openalex.org/I131231118"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Weijie Guo","raw_affiliation_strings":["Aoyama Gakuin University,Tokyo,Japan"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Aoyama Gakuin University,Tokyo,Japan","institution_ids":["https://openalex.org/I131231118"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5011769227","display_name":"Naoshi Kaneko","orcid":"https://orcid.org/0000-0002-5638-2509"},"institutions":[{"id":"https://openalex.org/I165522056","display_name":"Tokyo Denki University","ror":"https://ror.org/01pa62v70","country_code":"JP","type":"education","lineage":["https://openalex.org/I165522056"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Naoshi Kaneko","raw_affiliation_strings":["Tokyo Denki University,Tokyo,Japan"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Tokyo Denki University,Tokyo,Japan","institution_ids":["https://openalex.org/I165522056"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5112591377","display_name":"Yoshiaki Akazawa","orcid":null},"institutions":[{"id":"https://openalex.org/I4210114283","display_name":"Kyocera (Japan)","ror":"https://ror.org/025y1g718","country_code":"JP","type":"company","lineage":["https://openalex.org/I4210114283"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Yoshiaki Akazawa","raw_affiliation_strings":["KYOCERA Corporation,Kanagawa,Japan"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"KYOCERA Corporation,Kanagawa,Japan","institution_ids":["https://openalex.org/I4210114283"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.12294635,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"5"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12260","display_name":"Educational Technology and Pedagogy","score":0.6198999881744385,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12260","display_name":"Educational Technology and Pedagogy","score":0.6198999881744385,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.5989000201225281,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13003","display_name":"Regional Development and Environment","score":0.5837000012397766,"subfield":{"id":"https://openalex.org/subfields/3312","display_name":"Sociology and Political Science"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/spectrogram","display_name":"Spectrogram","score":0.7328000068664551},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.5841000080108643},{"id":"https://openalex.org/keywords/latency","display_name":"Latency (audio)","score":0.5529000163078308},{"id":"https://openalex.org/keywords/synchronization","display_name":"Synchronization (alternating current)","score":0.4542999863624573},{"id":"https://openalex.org/keywords/context","display_name":"Context (archaeology)","score":0.41690000891685486},{"id":"https://openalex.org/keywords/field","display_name":"Field (mathematics)","score":0.4153999984264374},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.4009000062942505},{"id":"https://openalex.org/keywords/feature-extraction","display_name":"Feature extraction","score":0.39160001277923584}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8255000114440918},{"id":"https://openalex.org/C45273575","wikidata":"https://www.wikidata.org/wiki/Q578970","display_name":"Spectrogram","level":2,"score":0.7328000068664551},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6819999814033508},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.5841000080108643},{"id":"https://openalex.org/C82876162","wikidata":"https://www.wikidata.org/wiki/Q17096504","display_name":"Latency (audio)","level":2,"score":0.5529000163078308},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4706999957561493},{"id":"https://openalex.org/C2778562939","wikidata":"https://www.wikidata.org/wiki/Q1298791","display_name":"Synchronization (alternating current)","level":3,"score":0.4542999863624573},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.41690000891685486},{"id":"https://openalex.org/C9652623","wikidata":"https://www.wikidata.org/wiki/Q190109","display_name":"Field (mathematics)","level":2,"score":0.4153999984264374},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.4009000062942505},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.39160001277923584},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.3635999858379364},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.3472000062465668},{"id":"https://openalex.org/C204201278","wikidata":"https://www.wikidata.org/wiki/Q1332614","display_name":"Voice activity detection","level":3,"score":0.3260999917984009},{"id":"https://openalex.org/C46637626","wikidata":"https://www.wikidata.org/wiki/Q6693015","display_name":"Low latency (capital markets)","level":2,"score":0.30809998512268066},{"id":"https://openalex.org/C2984842247","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep neural networks","level":3,"score":0.3077999949455261},{"id":"https://openalex.org/C2779304628","wikidata":"https://www.wikidata.org/wiki/Q3503480","display_name":"Face (sociological concept)","level":2,"score":0.28519999980926514},{"id":"https://openalex.org/C13895895","wikidata":"https://www.wikidata.org/wiki/Q3270773","display_name":"Speech coding","level":2,"score":0.2736000120639801},{"id":"https://openalex.org/C2776182073","wikidata":"https://www.wikidata.org/wiki/Q7575395","display_name":"Speech enhancement","level":3,"score":0.2644999921321869},{"id":"https://openalex.org/C88485024","wikidata":"https://www.wikidata.org/wiki/Q1054571","display_name":"Cepstrum","level":2,"score":0.2614000141620636},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.25999999046325684},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.25619998574256897},{"id":"https://openalex.org/C126042441","wikidata":"https://www.wikidata.org/wiki/Q1324888","display_name":"Frame (networking)","level":2,"score":0.25040000677108765}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.23919/mva65244.2025.11175056","is_oa":false,"landing_page_url":"https://doi.org/10.23919/mva65244.2025.11175056","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 19th International Conference on Machine Vision and Applications (MVA)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":17,"referenced_works":["https://openalex.org/W1494198834","https://openalex.org/W1554803342","https://openalex.org/W1975270274","https://openalex.org/W2295661697","https://openalex.org/W2950635152","https://openalex.org/W2962785568","https://openalex.org/W2964559396","https://openalex.org/W2981767644","https://openalex.org/W3016011332","https://openalex.org/W3081492798","https://openalex.org/W3109585842","https://openalex.org/W3211147706","https://openalex.org/W4200174933","https://openalex.org/W4281730245","https://openalex.org/W4385483513","https://openalex.org/W4390872116","https://openalex.org/W4411049949"],"related_works":[],"abstract_inverted_index":{"This":[0],"research":[1],"proposes":[2],"a":[3],"low-latency":[4],"real-time":[5,27,128],"audio-driven":[6],"talking":[7],"head":[8],"generation":[9,38,54],"(THG)":[10],"method":[11,105],"based":[12],"on":[13],"future":[14,33,45,92,123],"speech":[15,46,80,93,124],"feature":[16,94,125],"prediction.":[17],"Most":[18],"traditional":[19],"methods":[20],"often":[21],"suffer":[22],"from":[23,48],"high":[24,86,136],"latency,":[25],"hindering":[26],"interaction":[28],"because":[29],"they":[30],"inevitably":[31],"require":[32],"audio":[34],"context":[35],"to":[36,127],"maintain":[37],"quality.":[39,112,138],"To":[40],"address":[41],"this,":[42],"we":[43],"predict":[44],"frames":[47],"mel":[49],"spectrogram":[50],"features,":[51],"enabling":[52],"the":[53,66,85,90,120],"of":[55,89,116,122],"facial":[56,74],"animations":[57],"in":[58],"advance":[59],"and":[60],"reducing":[61,131],"latency.":[62],"Our":[63],"approach":[64],"integrates":[65],"Neural":[67],"Radiance":[68],"Field":[69],"(NeRF)":[70],"model":[71],"for":[72],"high-fidelity":[73],"animation,":[75],"ensuring":[76],"precise":[77],"synchronization":[78,132],"with":[79,99],"input.":[81],"Extensive":[82],"experiments":[83],"demonstrate":[84],"prediction":[87,95,126],"accuracy":[88],"proposed":[91],"module.":[96],"Furthermore,":[97],"comparisons":[98],"previous":[100],"works":[101],"confirm":[102],"that":[103],"our":[104],"significantly":[106],"reduces":[107],"latency":[108,133],"while":[109,134],"improving":[110],"video":[111,137],"The":[113],"key":[114],"contribution":[115],"this":[117],"work":[118],"is":[119],"introduction":[121],"THG,":[129],"effectively":[130],"maintaining":[135]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-10T00:00:00"}
