{"id":"https://openalex.org/W7134836196","doi":"https://doi.org/10.48550/arxiv.2603.08674","title":"Talking Together: Synthesizing Co-Located 3D Conversations from Audio","display_name":"Talking Together: Synthesizing Co-Located 3D Conversations from Audio","publication_year":2026,"publication_date":"2026-03-09","ids":{"openalex":"https://openalex.org/W7134836196","doi":"https://doi.org/10.48550/arxiv.2603.08674"},"language":null,"primary_location":{"id":"pmh:doi:10.48550/arxiv.2603.08674","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":null,"any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5011335725","display_name":"Mengyi Shan","orcid":"https://orcid.org/0000-0002-1520-5979"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shan, Mengyi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128677456","display_name":"Shouchieh Chang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chang, Shouchieh","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5027631030","display_name":"Ziqian Bai","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bai, Ziqian","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128679171","display_name":"Shichen Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Shichen","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128651036","display_name":"Yinda Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Yinda","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128682472","display_name":"Luchuan Song","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Song, Luchuan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128683235","display_name":"Rohit Pandey","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Pandey, Rohit","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5056677910","display_name":"Sean Fanello","orcid":"https://orcid.org/0000-0001-9726-4501"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Fanello, Sean","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5128644877","display_name":"Zeng Huang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Huang, Zeng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":9,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11448","display_name":"Face recognition and analysis","score":0.7210000157356262,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11448","display_name":"Face recognition and analysis","score":0.7210000157356262,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.062199998646974564,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.057100001722574234,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/gaze","display_name":"Gaze","score":0.6660000085830688},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.60589998960495},{"id":"https://openalex.org/keywords/pipeline","display_name":"Pipeline (software)","score":0.5892000198364258},{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.4390999972820282},{"id":"https://openalex.org/keywords/architecture","display_name":"Architecture","score":0.3971000015735626},{"id":"https://openalex.org/keywords/expressive-power","display_name":"Expressive power","score":0.37139999866485596},{"id":"https://openalex.org/keywords/animation","display_name":"Animation","score":0.3684999942779541},{"id":"https://openalex.org/keywords/power","display_name":"Power (physics)","score":0.3564000129699707}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8295000195503235},{"id":"https://openalex.org/C2779916870","wikidata":"https://www.wikidata.org/wiki/Q14467155","display_name":"Gaze","level":2,"score":0.6660000085830688},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.60589998960495},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.5892000198364258},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.569599986076355},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.4390999972820282},{"id":"https://openalex.org/C123657996","wikidata":"https://www.wikidata.org/wiki/Q12271","display_name":"Architecture","level":2,"score":0.3971000015735626},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3910999894142151},{"id":"https://openalex.org/C195818886","wikidata":"https://www.wikidata.org/wiki/Q5421724","display_name":"Expressive power","level":2,"score":0.37139999866485596},{"id":"https://openalex.org/C502989409","wikidata":"https://www.wikidata.org/wiki/Q11425","display_name":"Animation","level":2,"score":0.3684999942779541},{"id":"https://openalex.org/C163258240","wikidata":"https://www.wikidata.org/wiki/Q25342","display_name":"Power (physics)","level":2,"score":0.3564000129699707},{"id":"https://openalex.org/C543847140","wikidata":"https://www.wikidata.org/wiki/Q2642826","display_name":"Realism","level":2,"score":0.32440000772476196},{"id":"https://openalex.org/C2779304628","wikidata":"https://www.wikidata.org/wiki/Q3503480","display_name":"Face (sociological concept)","level":2,"score":0.31049999594688416},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.2919999957084656},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.28209999203681946},{"id":"https://openalex.org/C145912823","wikidata":"https://www.wikidata.org/wiki/Q113558","display_name":"Dynamics (music)","level":2,"score":0.2782000005245209},{"id":"https://openalex.org/C127705205","wikidata":"https://www.wikidata.org/wiki/Q5748245","display_name":"Heuristics","level":2,"score":0.26980000734329224},{"id":"https://openalex.org/C56461940","wikidata":"https://www.wikidata.org/wiki/Q970687","display_name":"Eye tracking","level":2,"score":0.25839999318122864},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.2558000087738037},{"id":"https://openalex.org/C2780312720","wikidata":"https://www.wikidata.org/wiki/Q5689100","display_name":"Head (geology)","level":2,"score":0.2533999979496002}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:doi:10.48550/arxiv.2603.08674","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},{"id":"doi:10.48550/arxiv.2603.08674","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.08674","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:doi:10.48550/arxiv.2603.08674","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"sustainable_development_goals":[{"display_name":"Reduced inequalities","id":"https://metadata.un.org/sdg/10","score":0.5320584774017334}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"We":[0,106],"tackle":[1],"the":[2,38,43,67,118,123],"challenging":[3],"task":[4],"of":[5,70,156],"generating":[6],"complete":[7],"3D":[8,45],"facial":[9],"animations":[10,174],"for":[11,60,102,176],"two":[12],"interacting,":[13],"co-located":[14],"participants":[15],"from":[16,162],"a":[17,31,94,128,146,151],"mixed":[18,119],"audio":[19,120],"stream.":[20],"While":[21],"existing":[22,185],"methods":[23],"often":[24],"produce":[25],"disembodied":[26],"\"talking":[27],"heads\"":[28],"akin":[29],"to":[30,40,83,116,133,149],"video":[32],"conference":[33],"call,":[34],"our":[35,141],"work":[36],"is":[37,58,100],"first":[39],"explicitly":[41],"model":[42,122],"dynamic":[44],"spatial":[46],"relationship":[47],"--":[48,56],"including":[49,73],"relative":[50,80],"position,":[51],"orientation,":[52],"and":[53,76,111,121,170,181,190],"mutual":[54,136],"gaze":[55,131],"that":[57],"crucial":[59],"realistic":[61],"in-person":[62],"dialogues.":[63],"Our":[64,165],"system":[65],"synthesizes":[66],"full":[68],"performance":[69],"both":[71],"individuals,":[72],"precise":[74],"lip-sync,":[75],"uniquely":[77],"allows":[78],"their":[79],"head":[81],"poses":[82],"be":[84],"controlled":[85],"via":[86],"textual":[87],"descriptions.":[88],"To":[89,139],"achieve":[90],"this,":[91],"we":[92,126,144],"propose":[93],"dual-stream":[95],"architecture":[96],"where":[97],"each":[98],"stream":[99],"responsible":[101],"one":[103],"participant's":[104],"output.":[105],"employ":[107],"speaker's":[108],"role":[109],"embeddings":[110],"inter-speaker":[112],"cross-attention":[113],"mechanisms":[114],"designed":[115],"disentangle":[117],"interaction.":[124],"Furthermore,":[125],"introduce":[127,145],"novel":[129,147],"eye":[130,137],"loss":[132],"promote":[134],"natural,":[135],"contact.":[138],"power":[140],"data-hungry":[142],"approach,":[143],"pipeline":[148],"curate":[150],"large-scale":[152],"conversational":[153],"dataset":[154],"consisting":[155],"over":[157],"2":[158],"million":[159],"dyadic":[160,173],"pairs":[161],"in-the-wild":[163],"videos.":[164],"method":[166],"generates":[167],"fluid,":[168],"controllable,":[169],"spatially":[171],"aware":[172],"suitable":[175],"immersive":[177],"applications":[178],"in":[179,187],"VR":[180],"telepresence,":[182],"significantly":[183],"outperforming":[184],"baselines":[186],"perceived":[188],"realism":[189],"interaction":[191],"coherence.":[192]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-03-11T00:00:00"}
