{"id":"https://openalex.org/W7154201694","doi":"https://doi.org/10.48550/arxiv.2604.10413","title":"Sign-to-Speech Prosody Transfer via Sign Reconstruction-based GAN","display_name":"Sign-to-Speech Prosody Transfer via Sign Reconstruction-based GAN","publication_year":2026,"publication_date":"2026-04-12","ids":{"openalex":"https://openalex.org/W7154201694","doi":"https://doi.org/10.48550/arxiv.2604.10413"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.10413","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.10413","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.10413","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Manabe, Toranosuke","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Manabe, Toranosuke","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103148125","display_name":"Yuto Shibata","orcid":"https://orcid.org/0009-0003-0517-5600"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shibata, Yuto","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133601741","display_name":"Shinnosuke Takamichi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Takamichi, Shinnosuke","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5070908826","display_name":"Yoshimitsu Aoki","orcid":"https://orcid.org/0000-0001-7361-0027"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Aoki, Yoshimitsu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":4,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11398","display_name":"Hand Gesture Recognition Systems","score":0.921500027179718,"subfield":{"id":"https://openalex.org/subfields/1709","display_name":"Human-Computer Interaction"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11398","display_name":"Hand Gesture Recognition Systems","score":0.921500027179718,"subfield":{"id":"https://openalex.org/subfields/1709","display_name":"Human-Computer Interaction"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11285","display_name":"Hearing Impairment and Communication","score":0.01209999993443489,"subfield":{"id":"https://openalex.org/subfields/3204","display_name":"Developmental and Educational Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T10789","display_name":"Interactive and Immersive Displays","score":0.006500000134110451,"subfield":{"id":"https://openalex.org/subfields/1709","display_name":"Human-Computer Interaction"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/pipeline","display_name":"Pipeline (software)","score":0.7211999893188477},{"id":"https://openalex.org/keywords/prosody","display_name":"Prosody","score":0.6682000160217285},{"id":"https://openalex.org/keywords/sign","display_name":"Sign (mathematics)","score":0.527899980545044},{"id":"https://openalex.org/keywords/sign-language","display_name":"Sign language","score":0.5274999737739563},{"id":"https://openalex.org/keywords/annotation","display_name":"Annotation","score":0.5062999725341797},{"id":"https://openalex.org/keywords/bottleneck","display_name":"Bottleneck","score":0.4781000018119812},{"id":"https://openalex.org/keywords/natural-language","display_name":"Natural language","score":0.46059998869895935},{"id":"https://openalex.org/keywords/code","display_name":"Code (set theory)","score":0.41029998660087585},{"id":"https://openalex.org/keywords/decoding-methods","display_name":"Decoding methods","score":0.40959998965263367}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7764000296592712},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.7211999893188477},{"id":"https://openalex.org/C542774811","wikidata":"https://www.wikidata.org/wiki/Q10880526","display_name":"Prosody","level":2,"score":0.6682000160217285},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.5339000225067139},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5317999720573425},{"id":"https://openalex.org/C139676723","wikidata":"https://www.wikidata.org/wiki/Q1193832","display_name":"Sign (mathematics)","level":2,"score":0.527899980545044},{"id":"https://openalex.org/C522192633","wikidata":"https://www.wikidata.org/wiki/Q34228","display_name":"Sign language","level":2,"score":0.5274999737739563},{"id":"https://openalex.org/C2776321320","wikidata":"https://www.wikidata.org/wiki/Q857525","display_name":"Annotation","level":2,"score":0.5062999725341797},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.505299985408783},{"id":"https://openalex.org/C2780513914","wikidata":"https://www.wikidata.org/wiki/Q18210350","display_name":"Bottleneck","level":2,"score":0.4781000018119812},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.46059998869895935},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.41029998660087585},{"id":"https://openalex.org/C57273362","wikidata":"https://www.wikidata.org/wiki/Q576722","display_name":"Decoding methods","level":2,"score":0.40959998965263367},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.39739999175071716},{"id":"https://openalex.org/C149364088","wikidata":"https://www.wikidata.org/wiki/Q185917","display_name":"Translation (biology)","level":4,"score":0.35740000009536743},{"id":"https://openalex.org/C2776230583","wikidata":"https://www.wikidata.org/wiki/Q1322198","display_name":"Spoken language","level":2,"score":0.33230000734329224},{"id":"https://openalex.org/C123657996","wikidata":"https://www.wikidata.org/wiki/Q12271","display_name":"Architecture","level":2,"score":0.3255000114440918},{"id":"https://openalex.org/C203005215","wikidata":"https://www.wikidata.org/wiki/Q79798","display_name":"Machine translation","level":2,"score":0.31610000133514404},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.296099990606308},{"id":"https://openalex.org/C2776756274","wikidata":"https://www.wikidata.org/wiki/Q181767","display_name":"Stress (linguistics)","level":2,"score":0.2935999929904938},{"id":"https://openalex.org/C163258240","wikidata":"https://www.wikidata.org/wiki/Q25342","display_name":"Power (physics)","level":2,"score":0.29030001163482666},{"id":"https://openalex.org/C14999030","wikidata":"https://www.wikidata.org/wiki/Q16346","display_name":"Speech synthesis","level":2,"score":0.2833999991416931},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.28200000524520874},{"id":"https://openalex.org/C2776608160","wikidata":"https://www.wikidata.org/wiki/Q4785462","display_name":"Natural (archaeology)","level":2,"score":0.27730000019073486},{"id":"https://openalex.org/C2777530160","wikidata":"https://www.wikidata.org/wiki/Q41796","display_name":"Sentence","level":2,"score":0.2743000090122223},{"id":"https://openalex.org/C2779304628","wikidata":"https://www.wikidata.org/wiki/Q3503480","display_name":"Face (sociological concept)","level":2,"score":0.2619999945163727},{"id":"https://openalex.org/C99498987","wikidata":"https://www.wikidata.org/wiki/Q2210247","display_name":"Noise (video)","level":3,"score":0.2583000063896179},{"id":"https://openalex.org/C2779439875","wikidata":"https://www.wikidata.org/wiki/Q1078276","display_name":"Natural language understanding","level":3,"score":0.25440001487731934},{"id":"https://openalex.org/C2776175482","wikidata":"https://www.wikidata.org/wiki/Q1195816","display_name":"Transfer (computing)","level":2,"score":0.2542000114917755}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.10413","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.10413","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.10413","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.10413","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Deep":[0],"learning":[1,138],"models":[2,158],"have":[3],"improved":[4],"sign":[5,86,101,187,196],"language-to-text":[6],"translation":[7],"and":[8,34,88,102,111,139],"made":[9],"it":[10],"easier":[11],"for":[12,193],"non-signers":[13],"to":[14,28,78],"understand":[15],"signed":[16,30],"messages.":[17],"When":[18],"the":[19,53,62,80,113,152,161,167,174,183],"goal":[20],"is":[21,27,98],"spoken":[22],"communication,":[23],"a":[24,49,70,125,146],"naive":[25],"approach":[26],"convert":[29],"messages":[31],"into":[32,92,166],"text":[33,47],"then":[35],"synthesize":[36,178],"speech":[37,103,179],"via":[38],"Text-to-Speech":[39],"(TTS).":[40],"However,":[41],"this":[42,66],"two-stage":[43],"pipeline":[44],"inevitably":[45],"treat":[46],"as":[48],"bottleneck":[50],"representation,":[51],"causing":[52],"loss":[54],"of":[55,115,155,163,186],"rich":[56],"non-verbal":[57],"information":[58],"originally":[59],"conveyed":[60],"in":[61,85],"signing.":[63],"To":[64,119],"address":[65],"limitation,":[67],"we":[68,122,143],"propose":[69,144],"novel":[71],"task,":[72],"\\emph{Sign-to-Speech":[73],"Prosody":[74],"Transfer},":[75],"which":[76],"aims":[77],"capture":[79],"global":[81],"prosodic":[82],"nuances":[83],"expressed":[84],"language":[87,197],"directly":[89],"integrate":[90],"them":[91],"synthesized":[93,168],"speech.":[94,169],"A":[95],"major":[96],"challenge":[97],"that":[99,129,150,173,180],"aligning":[100],"requires":[104],"expert":[105],"knowledge,":[106],"making":[107],"annotation":[108],"extremely":[109],"costly":[110],"preventing":[112],"construction":[114],"large":[116],"parallel":[117],"corpora.":[118],"overcome":[120],"this,":[121],"introduce":[123],"\\emph{SignRecGAN},":[124],"scalable":[126],"training":[127],"framework":[128],"leverages":[130],"unimodal":[131],"datasets":[132],"without":[133],"cross-modal":[134],"annotations":[135],"through":[136],"adversarial":[137],"reconstruction":[140],"losses.":[141],"Furthermore,":[142],"\\emph{S2PFormer},":[145],"new":[147,191],"model":[148],"architecture":[149],"preserves":[151],"expressive":[153],"power":[154],"existing":[156],"TTS":[157],"while":[159],"enabling":[160],"injection":[162],"sign-derived":[164],"prosody":[165],"Extensive":[170],"experiments":[171],"demonstrate":[172],"proposed":[175],"method":[176],"can":[177],"faithfully":[181],"reflects":[182],"emotional":[184],"content":[185],"language,":[188],"thereby":[189],"opening":[190],"possibilities":[192],"more":[194],"natural":[195],"communication.":[198],"Our":[199],"code":[200],"will":[201],"be":[202],"available":[203],"upon":[204],"acceptance.":[205]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-04-15T00:00:00"}
