{"id":"https://openalex.org/W4411624776","doi":"https://doi.org/10.1145/3703323.3703325","title":"Audio-Visual Speech Synthesis Leveraging Capsule-Enhanced Generative Adversarial Network","display_name":"Audio-Visual Speech Synthesis Leveraging Capsule-Enhanced Generative Adversarial Network","publication_year":2024,"publication_date":"2024-12-18","ids":{"openalex":"https://openalex.org/W4411624776","doi":"https://doi.org/10.1145/3703323.3703325"},"language":"en","primary_location":{"id":"doi:10.1145/3703323.3703325","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3703323.3703325","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3703323.3703325","source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 8th International Conference on Data Science and Management of Data (12th ACM IKDD CODS and 30th COMAD)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://dl.acm.org/doi/pdf/10.1145/3703323.3703325","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5103171265","display_name":"Subhayu Ghosh","orcid":"https://orcid.org/0009-0005-2538-6768"},"institutions":[{"id":"https://openalex.org/I155837530","display_name":"National Institute of Technology Durgapur","ror":"https://ror.org/04ds0jm32","country_code":"IN","type":"education","lineage":["https://openalex.org/I155837530"]}],"countries":["IN"],"is_corresponding":true,"raw_author_name":"Subhayu Ghosh","raw_affiliation_strings":["Computer Science and Engineering, National Institute of Technology Durgapur, Durgapur, West Bengal, IN"],"affiliations":[{"raw_affiliation_string":"Computer Science and Engineering, National Institute of Technology Durgapur, Durgapur, West Bengal, IN","institution_ids":["https://openalex.org/I155837530"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5055489209","display_name":"Nanda Dulal Jana","orcid":"https://orcid.org/0000-0003-0631-9912"},"institutions":[{"id":"https://openalex.org/I155837530","display_name":"National Institute of Technology Durgapur","ror":"https://ror.org/04ds0jm32","country_code":"IN","type":"education","lineage":["https://openalex.org/I155837530"]}],"countries":["IN"],"is_corresponding":false,"raw_author_name":"Nanda Dulal Jana","raw_affiliation_strings":["Computer Science and Engineering, National Institute of Technology Durgapur, Durgapur, West Bengal, IN"],"affiliations":[{"raw_affiliation_string":"Computer Science and Engineering, National Institute of Technology Durgapur, Durgapur, West Bengal, IN","institution_ids":["https://openalex.org/I155837530"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":2,"corresponding_author_ids":["https://openalex.org/A5103171265"],"corresponding_institution_ids":["https://openalex.org/I155837530"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":{"value":0.35030517,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"10","last_page":"18"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9990000128746033,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9990000128746033,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10688","display_name":"Image and Signal Denoising Methods","score":0.996399998664856,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10283","display_name":"Hearing Loss and Rehabilitation","score":0.9934999942779541,"subfield":{"id":"https://openalex.org/subfields/2805","display_name":"Cognitive Neuroscience"},"field":{"id":"https://openalex.org/fields/28","display_name":"Neuroscience"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7822686433792114},{"id":"https://openalex.org/keywords/adversarial-system","display_name":"Adversarial system","score":0.7135148644447327},{"id":"https://openalex.org/keywords/speech-synthesis","display_name":"Speech synthesis","score":0.6096339821815491},{"id":"https://openalex.org/keywords/generative-grammar","display_name":"Generative grammar","score":0.6046266555786133},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.5561147332191467},{"id":"https://openalex.org/keywords/audio-visual","display_name":"Audio visual","score":0.5185085535049438},{"id":"https://openalex.org/keywords/generative-adversarial-network","display_name":"Generative adversarial network","score":0.48565876483917236},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4276394546031952},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.40119606256484985},{"id":"https://openalex.org/keywords/human\u2013computer-interaction","display_name":"Human\u2013computer interaction","score":0.3218708038330078},{"id":"https://openalex.org/keywords/multimedia","display_name":"Multimedia","score":0.2465623915195465},{"id":"https://openalex.org/keywords/deep-learning","display_name":"Deep learning","score":0.17169135808944702}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7822686433792114},{"id":"https://openalex.org/C37736160","wikidata":"https://www.wikidata.org/wiki/Q1801315","display_name":"Adversarial system","level":2,"score":0.7135148644447327},{"id":"https://openalex.org/C14999030","wikidata":"https://www.wikidata.org/wiki/Q16346","display_name":"Speech synthesis","level":2,"score":0.6096339821815491},{"id":"https://openalex.org/C39890363","wikidata":"https://www.wikidata.org/wiki/Q36108","display_name":"Generative grammar","level":2,"score":0.6046266555786133},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5561147332191467},{"id":"https://openalex.org/C3017588708","wikidata":"https://www.wikidata.org/wiki/Q758901","display_name":"Audio visual","level":2,"score":0.5185085535049438},{"id":"https://openalex.org/C2988773926","wikidata":"https://www.wikidata.org/wiki/Q25104379","display_name":"Generative adversarial network","level":3,"score":0.48565876483917236},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4276394546031952},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.40119606256484985},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.3218708038330078},{"id":"https://openalex.org/C49774154","wikidata":"https://www.wikidata.org/wiki/Q131765","display_name":"Multimedia","level":1,"score":0.2465623915195465},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.17169135808944702}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3703323.3703325","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3703323.3703325","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3703323.3703325","source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 8th International Conference on Data Science and Management of Data (12th ACM IKDD CODS and 30th COMAD)","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/3703323.3703325","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3703323.3703325","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3703323.3703325","source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 8th International Conference on Data Science and Management of Data (12th ACM IKDD CODS and 30th COMAD)","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4411624776.pdf","grobid_xml":"https://content.openalex.org/works/W4411624776.grobid-xml"},"referenced_works_count":43,"referenced_works":["https://openalex.org/W398859631","https://openalex.org/W1516807289","https://openalex.org/W1592940279","https://openalex.org/W1975163393","https://openalex.org/W2001936727","https://openalex.org/W2117535912","https://openalex.org/W2126143605","https://openalex.org/W2136166660","https://openalex.org/W2141769559","https://openalex.org/W2293049663","https://openalex.org/W2576309025","https://openalex.org/W2755577605","https://openalex.org/W2765811365","https://openalex.org/W2889061305","https://openalex.org/W2895526696","https://openalex.org/W2902070858","https://openalex.org/W2937020545","https://openalex.org/W2942211779","https://openalex.org/W2963009026","https://openalex.org/W2963539064","https://openalex.org/W2964011100","https://openalex.org/W2970737019","https://openalex.org/W2972667718","https://openalex.org/W2981087920","https://openalex.org/W3012498027","https://openalex.org/W3098557217","https://openalex.org/W3102628737","https://openalex.org/W3152148971","https://openalex.org/W3154807520","https://openalex.org/W3199527474","https://openalex.org/W3216759837","https://openalex.org/W4225272741","https://openalex.org/W4229081350","https://openalex.org/W4285212083","https://openalex.org/W4287660544","https://openalex.org/W4322629454","https://openalex.org/W4366085639","https://openalex.org/W4385497926","https://openalex.org/W4388854080","https://openalex.org/W4392411893","https://openalex.org/W4393231955","https://openalex.org/W4399924909","https://openalex.org/W4402351144"],"related_works":["https://openalex.org/W2502115930","https://openalex.org/W2482350142","https://openalex.org/W2888032422","https://openalex.org/W2996316059","https://openalex.org/W4410783345","https://openalex.org/W4377980832","https://openalex.org/W2897769091","https://openalex.org/W2845413374","https://openalex.org/W3005996785","https://openalex.org/W4235873501"],"abstract_inverted_index":{"Audio-visual":[0],"speech":[1,24,40,76,151],"synthesis":[2,66],"(AVSS)":[3],"is":[4,162],"a":[5,26,34,70,128,133,141],"rapidly":[6],"gaining":[7],"prominence":[8],"in":[9],"artificial":[10],"intelligence":[11],"(AI)":[12],"due":[13],"to":[14,20,60,68,110,146],"its":[15],"widespread":[16],"applications.":[17],"AVSS":[18,47,93,104,130],"refers":[19],"generating":[21],"the":[22,30,39,46,58,61,74,78,83,90,112,115,121,172,176,179,183],"audio-visual":[23,65,157],"of":[25,33,45,77,86,92,114,178],"target":[27,62,79],"speaker":[28,63],"from":[29,57,73],"audio":[31],"input":[32,150],"source":[35,59],"speaker,":[36],"while":[37],"preserving":[38],"content.":[41],"The":[42,97,159],"sequential":[43],"steps":[44],"process":[48],"include":[49],"voice":[50],"conversion":[51],"(VC)":[52],"for":[53],"altering":[54],"vocal":[55,116],"characteristics":[56],"and":[64,118,153,164,168,171],"(AVS)":[67],"generate":[69],"video":[71,123],"stream":[72],"converted":[75],"speaker.":[80],"However,":[81],"with":[82,140],"progressive":[84],"development":[85],"deep":[87],"generative":[88,134],"models,":[89,105],"domain":[91],"remains":[94],"relatively":[95],"unexplored.":[96],"existing":[98],"literature":[99],"predominantly":[100],"focuses":[101],"on":[102,166],"autoencoder-based":[103],"potentially":[106],"limiting":[107],"their":[108],"ability":[109],"capture":[111,148],"diversity":[113],"features":[117,152],"also":[119],"synthesize":[120],"realistic":[122],"output.":[124,158],"This":[125],"paper":[126],"introduces":[127],"novel":[129],"approach":[131,181],"utilizing":[132],"adversarial":[135],"network":[136,143],"(GAN)":[137],"model,":[138],"enriched":[139],"capsule":[142],"(Caps-Net)":[144],"architecture":[145],"effectively":[147],"relevant":[149],"produce":[154],"more":[155],"natural":[156],"proposed":[160,180],"framework":[161],"trained":[163],"tested":[165],"VoxCeleb2":[167],"LRS3-TED":[169],"datasets":[170],"experimental":[173],"outcomes":[174],"exhibit":[175],"superiority":[177],"over":[182],"state-of-the-art":[184],"(SOTA)":[185],"models.":[186]},"counts_by_year":[],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
