{"id":"https://openalex.org/W7127322936","doi":"https://doi.org/10.48550/arxiv.2602.01908","title":"LipSody: Lip-to-Speech Synthesis with Enhanced Prosody Consistency","display_name":"LipSody: Lip-to-Speech Synthesis with Enhanced Prosody Consistency","publication_year":2026,"publication_date":"2026-02-02","ids":{"openalex":"https://openalex.org/W7127322936","doi":"https://doi.org/10.48550/arxiv.2602.01908"},"language":null,"primary_location":{"id":"pmh:doi:10.48550/arxiv.2602.01908","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":null,"any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5124948645","display_name":"Jaejun Lee","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lee, Jaejun","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5020776717","display_name":"Yoori Oh","orcid":"https://orcid.org/0009-0002-2557-0934"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Oh, Yoori","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5124882834","display_name":"Kyogu Lee","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lee, Kyogu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":3,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.8352000117301941,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.8352000117301941,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11448","display_name":"Face recognition and analysis","score":0.12290000170469284,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.014499999582767487,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/prosody","display_name":"Prosody","score":0.8183000087738037},{"id":"https://openalex.org/keywords/context","display_name":"Context (archaeology)","score":0.5202999711036682},{"id":"https://openalex.org/keywords/consistency","display_name":"Consistency (knowledge bases)","score":0.51419997215271},{"id":"https://openalex.org/keywords/face","display_name":"Face (sociological concept)","score":0.48429998755455017},{"id":"https://openalex.org/keywords/identity","display_name":"Identity (music)","score":0.44769999384880066}],"concepts":[{"id":"https://openalex.org/C542774811","wikidata":"https://www.wikidata.org/wiki/Q10880526","display_name":"Prosody","level":2,"score":0.8183000087738037},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6542999744415283},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6396999955177307},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.5202999711036682},{"id":"https://openalex.org/C2776436953","wikidata":"https://www.wikidata.org/wiki/Q5163215","display_name":"Consistency (knowledge bases)","level":2,"score":0.51419997215271},{"id":"https://openalex.org/C2779304628","wikidata":"https://www.wikidata.org/wiki/Q3503480","display_name":"Face (sociological concept)","level":2,"score":0.48429998755455017},{"id":"https://openalex.org/C2778355321","wikidata":"https://www.wikidata.org/wiki/Q17079427","display_name":"Identity (music)","level":2,"score":0.44769999384880066},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.4399999976158142},{"id":"https://openalex.org/C14999030","wikidata":"https://www.wikidata.org/wiki/Q16346","display_name":"Speech synthesis","level":2,"score":0.40639999508857727},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.36469998955726624},{"id":"https://openalex.org/C186370098","wikidata":"https://www.wikidata.org/wiki/Q442787","display_name":"Energy (signal processing)","level":2,"score":0.3352000117301941},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.31349998712539673},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.2858000099658966}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:doi:10.48550/arxiv.2602.01908","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},{"id":"doi:10.48550/arxiv.2602.01908","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2602.01908","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:doi:10.48550/arxiv.2602.01908","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Lip-to-speech":[0],"synthesis":[1],"aims":[2],"to":[3,114],"generate":[4],"speech":[5],"audio":[6,25],"directly":[7],"from":[8,16,77,83,90],"silent":[9],"facial":[10,78],"video":[11],"by":[12],"reconstructing":[13,43],"linguistic":[14,44,80],"content":[15,81],"lip":[17,84],"movements,":[18,85],"providing":[19],"valuable":[20],"applications":[21],"in":[22,42],"situations":[23],"where":[24],"signals":[26],"are":[27],"unavailable":[28],"or":[29],"degraded.":[30],"While":[31],"recent":[32],"diffusion-based":[33],"models":[34],"such":[35],"as":[36],"LipVoicer":[37],"have":[38],"demonstrated":[39],"impressive":[40],"performance":[41],"content,":[45],"they":[46],"often":[47],"lack":[48],"prosodic":[49],"consistency.":[50,63],"In":[51],"this":[52],"work,":[53],"we":[54],"propose":[55],"LipSody,":[56],"a":[57,66],"lip-to-speech":[58],"framework":[59],"enhanced":[60],"for":[61],"prosody":[62],"LipSody":[64,97],"introduces":[65],"prosody-guiding":[67],"strategy":[68],"that":[69,96],"leverages":[70],"three":[71],"complementary":[72],"cues:":[73],"speaker":[74,111],"identity":[75],"extracted":[76],"images,":[79],"derived":[82],"and":[86,104,110],"emotional":[87],"context":[88],"inferred":[89],"face":[91],"video.":[92],"Experimental":[93],"results":[94],"demonstrate":[95],"substantially":[98],"improves":[99],"prosody-related":[100],"metrics,":[101],"including":[102],"global":[103],"local":[105],"pitch":[106],"deviations,":[107],"energy":[108],"consistency,":[109],"similarity,":[112],"compared":[113],"prior":[115],"approaches.":[116]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-02-04T00:00:00"}
