{"id":"https://openalex.org/W4411635493","doi":"https://doi.org/10.1145/3731715.3733344","title":"FREAK: Frequency-modulated High-fidelity and Real-time Audio-driven Talking Portrait Synthesis","display_name":"FREAK: Frequency-modulated High-fidelity and Real-time Audio-driven Talking Portrait Synthesis","publication_year":2025,"publication_date":"2025-06-25","ids":{"openalex":"https://openalex.org/W4411635493","doi":"https://doi.org/10.1145/3731715.3733344"},"language":"en","primary_location":{"id":"doi:10.1145/3731715.3733344","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3731715.3733344","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2025 International Conference on Multimedia Retrieval","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5118627647","display_name":"Ziqi Ni","orcid":null},"institutions":[{"id":"https://openalex.org/I76569877","display_name":"Southeast University","ror":"https://ror.org/04ct4d772","country_code":"CN","type":"education","lineage":["https://openalex.org/I76569877"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Ziqi Ni","raw_affiliation_strings":["School of Computer Science and Engineering, Southeast University, Nanjing, China and Key Laboratory of New Generation Artificial Intelligence Technology and Its Interdisciplinary Applications, Ministry of Education, China, Nanjing, China"],"raw_orcid":"https://orcid.org/0009-0004-5760-5665","affiliations":[{"raw_affiliation_string":"School of Computer Science and Engineering, Southeast University, Nanjing, China and Key Laboratory of New Generation Artificial Intelligence Technology and Its Interdisciplinary Applications, Ministry of Education, China, Nanjing, China","institution_ids":["https://openalex.org/I76569877"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5118627648","display_name":"Ao Fu","orcid":null},"institutions":[{"id":"https://openalex.org/I76569877","display_name":"Southeast University","ror":"https://ror.org/04ct4d772","country_code":"CN","type":"education","lineage":["https://openalex.org/I76569877"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Ao Fu","raw_affiliation_strings":["School of Computer Science and Engineering, Southeast University, Nanjing, China and Key Laboratory of New Generation Artificial Intelligence Technology and Its Interdisciplinary Applications, Ministry of Education, China, Nanjing, China"],"raw_orcid":"https://orcid.org/0009-0006-9933-9240","affiliations":[{"raw_affiliation_string":"School of Computer Science and Engineering, Southeast University, Nanjing, China and Key Laboratory of New Generation Artificial Intelligence Technology and Its Interdisciplinary Applications, Ministry of Education, China, Nanjing, China","institution_ids":["https://openalex.org/I76569877"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100734971","display_name":"Yi Zhou","orcid":"https://orcid.org/0000-0003-3021-3229"},"institutions":[{"id":"https://openalex.org/I76569877","display_name":"Southeast University","ror":"https://ror.org/04ct4d772","country_code":"CN","type":"education","lineage":["https://openalex.org/I76569877"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yi Zhou","raw_affiliation_strings":["School of Computer Science and Engineering, Southeast University, Nanjing, China and Key Laboratory of New Generation Artificial Intelligence Technology and Its Interdisciplinary Applications, Ministry of Education, China, Nanjing, China"],"raw_orcid":"https://orcid.org/0000-0003-3021-3229","affiliations":[{"raw_affiliation_string":"School of Computer Science and Engineering, Southeast University, Nanjing, China and Key Laboratory of New Generation Artificial Intelligence Technology and Its Interdisciplinary Applications, Ministry of Education, China, Nanjing, China","institution_ids":["https://openalex.org/I76569877"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.15081089,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1036","last_page":"1044"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9984999895095825,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.9977999925613403,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/freak","display_name":"FREAK","score":0.725502610206604},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6280086636543274},{"id":"https://openalex.org/keywords/high-fidelity","display_name":"High fidelity","score":0.5637840032577515},{"id":"https://openalex.org/keywords/portrait","display_name":"Portrait","score":0.5612162351608276},{"id":"https://openalex.org/keywords/fidelity","display_name":"Fidelity","score":0.5342456698417664},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.3561787009239197},{"id":"https://openalex.org/keywords/telecommunications","display_name":"Telecommunications","score":0.19635233283042908},{"id":"https://openalex.org/keywords/art","display_name":"Art","score":0.1802196502685547},{"id":"https://openalex.org/keywords/engineering","display_name":"Engineering","score":0.1364506185054779},{"id":"https://openalex.org/keywords/electrical-engineering","display_name":"Electrical engineering","score":0.12127411365509033},{"id":"https://openalex.org/keywords/art-history","display_name":"Art history","score":0.09086456894874573},{"id":"https://openalex.org/keywords/computer-security","display_name":"Computer security","score":0.07440930604934692}],"concepts":[{"id":"https://openalex.org/C2776717989","wikidata":"https://www.wikidata.org/wiki/Q19410276","display_name":"FREAK","level":2,"score":0.725502610206604},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6280086636543274},{"id":"https://openalex.org/C113364801","wikidata":"https://www.wikidata.org/wiki/Q26674","display_name":"High fidelity","level":2,"score":0.5637840032577515},{"id":"https://openalex.org/C162462552","wikidata":"https://www.wikidata.org/wiki/Q134307","display_name":"Portrait","level":2,"score":0.5612162351608276},{"id":"https://openalex.org/C2776459999","wikidata":"https://www.wikidata.org/wiki/Q2119376","display_name":"Fidelity","level":2,"score":0.5342456698417664},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.3561787009239197},{"id":"https://openalex.org/C76155785","wikidata":"https://www.wikidata.org/wiki/Q418","display_name":"Telecommunications","level":1,"score":0.19635233283042908},{"id":"https://openalex.org/C142362112","wikidata":"https://www.wikidata.org/wiki/Q735","display_name":"Art","level":0,"score":0.1802196502685547},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.1364506185054779},{"id":"https://openalex.org/C119599485","wikidata":"https://www.wikidata.org/wiki/Q43035","display_name":"Electrical engineering","level":1,"score":0.12127411365509033},{"id":"https://openalex.org/C52119013","wikidata":"https://www.wikidata.org/wiki/Q50637","display_name":"Art history","level":1,"score":0.09086456894874573},{"id":"https://openalex.org/C38652104","wikidata":"https://www.wikidata.org/wiki/Q3510521","display_name":"Computer security","level":1,"score":0.07440930604934692}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3731715.3733344","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3731715.3733344","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2025 International Conference on Multimedia Retrieval","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":33,"referenced_works":["https://openalex.org/W2331128040","https://openalex.org/W2604379605","https://openalex.org/W2963081548","https://openalex.org/W2964449965","https://openalex.org/W3019952993","https://openalex.org/W3081492798","https://openalex.org/W3109114891","https://openalex.org/W3154326567","https://openalex.org/W3203605797","https://openalex.org/W3213601271","https://openalex.org/W4200150166","https://openalex.org/W4200174933","https://openalex.org/W4210657261","https://openalex.org/W4304080852","https://openalex.org/W4311137635","https://openalex.org/W4312990833","https://openalex.org/W4382240211","https://openalex.org/W4385682917","https://openalex.org/W4385819777","https://openalex.org/W4386065999","https://openalex.org/W4386066256","https://openalex.org/W4386071531","https://openalex.org/W4386072021","https://openalex.org/W4387698237","https://openalex.org/W4390874458","https://openalex.org/W4393156599","https://openalex.org/W4394597155","https://openalex.org/W4394597549","https://openalex.org/W4402727061","https://openalex.org/W4402916679","https://openalex.org/W4403791312","https://openalex.org/W4404199654","https://openalex.org/W6797578546"],"related_works":["https://openalex.org/W4313443006","https://openalex.org/W2945374968","https://openalex.org/W4293777179","https://openalex.org/W4385452045","https://openalex.org/W2164070813","https://openalex.org/W2135608140","https://openalex.org/W2895525995","https://openalex.org/W2332512904","https://openalex.org/W4224231624","https://openalex.org/W2319626700"],"abstract_inverted_index":{"Achieving":[0],"high-fidelity":[1,238],"and":[2,76,98,119,152,162,165,185,198,209,228,245],"accurate":[3],"lip-speech":[4],"synchronization":[5,248],"in":[6,56,67,143,156,181,194,249],"audio-driven":[7],"talking":[8,24,74,83,109,179,239],"portrait":[9,84,102],"synthesis":[10,85,103],"is":[11],"particularly":[12],"challenging.":[13],"Some":[14,33],"studies":[15],"utilize":[16],"multi-stage":[17],"pipelines":[18],"or":[19],"diffusion":[20],"models":[21,108],"for":[22],"high-quality":[23],"portraits;":[25],"however,":[26],"they":[27],"suffer":[28],"from":[29,111],"excessive":[30],"computational":[31],"costs.":[32],"approaches":[34],"achieve":[35],"remarkable":[36],"progress":[37],"on":[38,82],"specific":[39],"individuals":[40],"with":[41,241],"low":[42],"resource":[43],"requirements,":[44],"yet":[45],"still":[46],"exhibit":[47],"mismatched":[48],"lip":[49,247],"movements.":[50],"The":[51],"aforementioned":[52],"methods":[53],"are":[54,64],"modeled":[55],"the":[57,68,72,112,117,122,132,144,154,157,167,175,178,182,192],"pixel":[58,196],"domain.":[59],"We":[60],"observed":[61],"that":[62,234],"there":[63],"noticeable":[65],"discrepancies":[66],"frequency":[69,113,145,150,158,183,199],"domain":[70,114,184,197,200],"between":[71,160,207],"synthesized":[73,123,161],"videos":[75],"natural":[77,163],"videos.":[78],"Currently,":[79],"no":[80],"research":[81],"has":[86],"considered":[87],"this":[88],"aspect.":[89],"To":[90],"address":[91],"this,":[92],"we":[93,190],"propose":[94],"a":[95],"FREquency-modulated,":[96],"high-fidelity,":[97],"real-time":[99,229],"Audio-driven":[100],"talKing":[101],"framework,":[104],"named":[105],"FREAK,":[106],"which":[107],"portraits":[110,240],"perspective,":[115],"enhancing":[116],"fidelity":[118],"naturalness":[120],"of":[121],"portraits.":[124],"FREAK":[125,203],"introduces":[126],"two":[127],"novel":[128],"frequency-based":[129],"modules:":[130],"1)":[131],"Visual":[133,169],"Encoding":[134],"Frequency":[135,170],"Modulator":[136,171],"(VEFM)":[137],"to":[138,173,217],"couple":[139],"multi-scale":[140],"visual":[141,149],"features":[142],"domain,":[146],"better":[147],"preserving":[148],"information":[151],"reducing":[153],"gap":[155],"spectrum":[159],"frames.":[164],"2)":[166],"Audio":[168],"(AVFM)":[172],"help":[174],"model":[176,193],"learn":[177],"pattern":[180],"improve":[186],"audio-visual":[187],"synchronization.":[188],"Additionally,":[189],"optimize":[191],"both":[195],"jointly.":[201],"Furthermore,":[202],"supports":[204],"seamless":[205],"switching":[206],"one-shot":[208],"video":[210,226],"dubbing":[211],"settings,":[212],"offering":[213],"enhanced":[214],"flexibility.":[215],"Due":[216],"its":[218],"superior":[219],"performance,":[220],"it":[221],"can":[222],"simultaneously":[223],"support":[224],"high-resolution":[225],"results":[227],"inference.":[230],"Extensive":[231],"experiments":[232],"demonstrate":[233],"our":[235],"method":[236],"synthesizes":[237],"detailed":[242],"facial":[243],"textures":[244],"precise":[246],"real-time,":[250],"outperforming":[251],"state-of-the-art":[252],"methods.":[253]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-10T00:00:00"}
