{"id":"https://openalex.org/W7156847872","doi":"https://doi.org/10.48550/arxiv.2604.23632","title":"Hallo-Live: Real-Time Streaming Joint Audio-Video Avatar Generation with Asynchronous Dual-Stream and Human-Centric Preference Distillation","display_name":"Hallo-Live: Real-Time Streaming Joint Audio-Video Avatar Generation with Asynchronous Dual-Stream and Human-Centric Preference Distillation","publication_year":2026,"publication_date":"2026-04-26","ids":{"openalex":"https://openalex.org/W7156847872","doi":"https://doi.org/10.48550/arxiv.2604.23632"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.23632","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.23632","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.23632","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5134808363","display_name":"Chunyu Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Li, Chunyu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134818524","display_name":"Jiaye Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Jiaye","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134756407","display_name":"Ruiqiao Mei","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Mei, Ruiqiao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134771247","display_name":"Haoyuan Xia","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xia, Haoyuan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134760784","display_name":"hao zhu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhu, Hao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134775823","display_name":"Jingdong Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Jingdong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5134785838","display_name":"Siyu Zhu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhu, Siyu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5134808363"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.24959999322891235,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.24959999322891235,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.20319999754428864,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11349","display_name":"Music Technology and Sound Studies","score":0.13830000162124634,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/asynchronous-communication","display_name":"Asynchronous communication","score":0.5867000222206116},{"id":"https://openalex.org/keywords/avatar","display_name":"Avatar","score":0.47130000591278076},{"id":"https://openalex.org/keywords/latency","display_name":"Latency (audio)","score":0.46299999952316284},{"id":"https://openalex.org/keywords/asynchrony","display_name":"Asynchrony (computer programming)","score":0.41350001096725464},{"id":"https://openalex.org/keywords/joint","display_name":"Joint (building)","score":0.4058000147342682},{"id":"https://openalex.org/keywords/low-latency","display_name":"Low latency (capital markets)","score":0.36629998683929443},{"id":"https://openalex.org/keywords/sync","display_name":"sync","score":0.3237000107765198}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.838699996471405},{"id":"https://openalex.org/C151319957","wikidata":"https://www.wikidata.org/wiki/Q752739","display_name":"Asynchronous communication","level":2,"score":0.5867000222206116},{"id":"https://openalex.org/C2777365542","wikidata":"https://www.wikidata.org/wiki/Q83090","display_name":"Avatar","level":2,"score":0.47130000591278076},{"id":"https://openalex.org/C82876162","wikidata":"https://www.wikidata.org/wiki/Q17096504","display_name":"Latency (audio)","level":2,"score":0.46299999952316284},{"id":"https://openalex.org/C2779019669","wikidata":"https://www.wikidata.org/wiki/Q25203946","display_name":"Asynchrony (computer programming)","level":3,"score":0.41350001096725464},{"id":"https://openalex.org/C18555067","wikidata":"https://www.wikidata.org/wiki/Q8375051","display_name":"Joint (building)","level":2,"score":0.4058000147342682},{"id":"https://openalex.org/C46637626","wikidata":"https://www.wikidata.org/wiki/Q6693015","display_name":"Low latency (capital markets)","level":2,"score":0.36629998683929443},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.3662000000476837},{"id":"https://openalex.org/C79403827","wikidata":"https://www.wikidata.org/wiki/Q3988","display_name":"Real-time computing","level":1,"score":0.35659998655319214},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3476000130176544},{"id":"https://openalex.org/C3913047","wikidata":"https://www.wikidata.org/wiki/Q1956265","display_name":"sync","level":3,"score":0.3237000107765198},{"id":"https://openalex.org/C177148314","wikidata":"https://www.wikidata.org/wiki/Q170084","display_name":"Generalization","level":2,"score":0.31520000100135803},{"id":"https://openalex.org/C71901391","wikidata":"https://www.wikidata.org/wiki/Q7126699","display_name":"Upload","level":2,"score":0.3109999895095825},{"id":"https://openalex.org/C2777210771","wikidata":"https://www.wikidata.org/wiki/Q4927124","display_name":"Block (permutation group theory)","level":2,"score":0.29600000381469727},{"id":"https://openalex.org/C2778090530","wikidata":"https://www.wikidata.org/wiki/Q2523931","display_name":"Viewport","level":2,"score":0.29589998722076416},{"id":"https://openalex.org/C70437156","wikidata":"https://www.wikidata.org/wiki/Q7228652","display_name":"Pooling","level":2,"score":0.2824999988079071},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.27250000834465027},{"id":"https://openalex.org/C2778562939","wikidata":"https://www.wikidata.org/wiki/Q1298791","display_name":"Synchronization (alternating current)","level":3,"score":0.2702000141143799},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.2551000118255615},{"id":"https://openalex.org/C2778707766","wikidata":"https://www.wikidata.org/wiki/Q202064","display_name":"Phone","level":2,"score":0.2549000084400177}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.23632","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.23632","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.23632","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.23632","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Real-time":[0],"text-driven":[1,202],"joint":[2,44],"audio-video":[3],"avatar":[4,46],"generation":[5,47,148],"requires":[6],"jointly":[7],"synthesizing":[8],"portrait":[9],"video":[10,71],"and":[11,16,30,112,133,155,178],"speech":[12,110],"with":[13,53,78,125,197],"high":[14],"fidelity":[15],"precise":[17],"synchronization,":[18],"yet":[19],"existing":[20],"audio-visual":[21,45,113,203],"diffusion":[22,52,196],"models":[23],"remain":[24],"too":[25],"slow":[26],"for":[27,43,200],"interactive":[28],"use":[29],"often":[31],"degrade":[32],"noticeably":[33],"after":[34],"aggressive":[35],"acceleration.":[36],"We":[37],"present":[38],"Hallo-Live,":[39],"a":[40,79],"streaming":[41,194],"framework":[42,191],"that":[48],"combines":[49],"asynchronous":[50],"dual-stream":[51,195],"human-centric":[54],"preference-guided":[55,198],"distillation.":[56],"To":[57,86,181],"reduce":[58],"articulation":[59],"lag":[60],"in":[61,164],"causal":[62],"generation,":[63],"we":[64,94],"introduce":[65],"Future-Expanding":[66],"Attention,":[67],"which":[68,101],"allows":[69],"each":[70],"block":[72],"to":[73,192],"access":[74],"synchronous":[75],"audio":[76],"together":[77],"short":[80],"horizon":[81],"of":[82,91,184],"future":[83],"phonetic":[84],"cues.":[85],"mitigate":[87],"the":[88,138,165,182,189],"quality":[89],"loss":[90],"few-step":[92],"distillation,":[93],"further":[95,171],"propose":[96],"Human-Centric":[97],"Preference-Guided":[98],"DMD":[99],"(HP-DMD),":[100],"reweights":[102],"training":[103],"samples":[104],"using":[105],"rewards":[106],"from":[107],"visual":[108],"fidelity,":[109],"naturalness,":[111],"synchronization.":[114],"On":[115],"two":[116],"NVIDIA":[117],"H200":[118],"GPUs,":[119],"Hallo-Live":[120,187],"runs":[121],"at":[122],"20.38":[123],"FPS":[124],"0.94":[126],"seconds":[127],"latency,":[128],"yielding":[129],"16.0x":[130],"higher":[131],"throughput":[132],"99.3x":[134],"lower":[135],"latency":[136],"than":[137],"teacher":[139],"model":[140],"Ovi.":[141],"Despite":[142],"this":[143],"speedup,":[144],"it":[145],"retains":[146],"strong":[147],"quality,":[149],"reaching":[150],"comparable":[151],"VideoAlign":[152],"overall":[153,166],"score":[154,158],"Sync":[156],"Confidence":[157],"while":[159],"outperforming":[160],"other":[161],"accelerated":[162],"baselines":[163],"quality-efficiency":[167],"trade-off.":[168],"Qualitative":[169],"results":[170],"show":[172],"robust":[173],"generalization":[174],"across":[175],"photorealistic,":[176],"multi-speaker,":[177],"stylized":[179],"scenarios.":[180],"best":[183],"our":[185],"knowledge,":[186],"is":[188],"first":[190],"combine":[193],"distillation":[199],"real-time,":[201],"generation.":[204]},"counts_by_year":[],"updated_date":"2026-04-29T06:16:36.941037","created_date":"2026-04-29T00:00:00"}
