{"id":"https://openalex.org/W4414231386","doi":"https://doi.org/10.1109/ialp68296.2024.11156535","title":"Vaachika: A GAN-Based Neural Vocoder for Marathi Language","display_name":"Vaachika: A GAN-Based Neural Vocoder for Marathi Language","publication_year":2025,"publication_date":"2025-08-03","ids":{"openalex":"https://openalex.org/W4414231386","doi":"https://doi.org/10.1109/ialp68296.2024.11156535"},"language":"en","primary_location":{"id":"doi:10.1109/ialp68296.2024.11156535","is_oa":false,"landing_page_url":"https://doi.org/10.1109/ialp68296.2024.11156535","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 International Conference on Asian Language Processing (IALP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5114974321","display_name":"Ravindrakumar M. Purohit","orcid":null},"institutions":[{"id":"https://openalex.org/I98389781","display_name":"Dhirubhai Ambani Institute of Information and Communication Technology","ror":"https://ror.org/02d5b7g69","country_code":"IN","type":"education","lineage":["https://openalex.org/I98389781"]}],"countries":["IN"],"is_corresponding":true,"raw_author_name":"Ravindrakumar M. Purohit","raw_affiliation_strings":["Dhirubhai Ambani University (DAU),Speech Research Lab,Gandhinagar (GJ),India"],"affiliations":[{"raw_affiliation_string":"Dhirubhai Ambani University (DAU),Speech Research Lab,Gandhinagar (GJ),India","institution_ids":["https://openalex.org/I98389781"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5049511943","display_name":"Siddharth Krishna Kumar","orcid":"https://orcid.org/0000-0003-3606-5060"},"institutions":[{"id":"https://openalex.org/I98389781","display_name":"Dhirubhai Ambani Institute of Information and Communication Technology","ror":"https://ror.org/02d5b7g69","country_code":"IN","type":"education","lineage":["https://openalex.org/I98389781"]}],"countries":["IN"],"is_corresponding":false,"raw_author_name":"Siddharth Kumar","raw_affiliation_strings":["Dhirubhai Ambani University (DAU),Speech Research Lab,Gandhinagar (GJ),India"],"affiliations":[{"raw_affiliation_string":"Dhirubhai Ambani University (DAU),Speech Research Lab,Gandhinagar (GJ),India","institution_ids":["https://openalex.org/I98389781"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5043002276","display_name":"Hemant A. Patil","orcid":"https://orcid.org/0000-0002-4068-2005"},"institutions":[{"id":"https://openalex.org/I98389781","display_name":"Dhirubhai Ambani Institute of Information and Communication Technology","ror":"https://ror.org/02d5b7g69","country_code":"IN","type":"education","lineage":["https://openalex.org/I98389781"]}],"countries":["IN"],"is_corresponding":false,"raw_author_name":"Hemant A. Patil","raw_affiliation_strings":["Dhirubhai Ambani University (DAU),Speech Research Lab,Gandhinagar (GJ),India"],"affiliations":[{"raw_affiliation_string":"Dhirubhai Ambani University (DAU),Speech Research Lab,Gandhinagar (GJ),India","institution_ids":["https://openalex.org/I98389781"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5114974321"],"corresponding_institution_ids":["https://openalex.org/I98389781"],"apc_list":null,"apc_paid":null,"fwci":2.2283,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.90616415,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":96,"max":98},"biblio":{"volume":null,"issue":null,"first_page":"129","last_page":"134"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9836999773979187,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9836999773979187,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.925000011920929,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/mean-opinion-score","display_name":"Mean opinion score","score":0.6189000010490417},{"id":"https://openalex.org/keywords/marathi","display_name":"Marathi","score":0.4643000066280365},{"id":"https://openalex.org/keywords/fidelity","display_name":"Fidelity","score":0.4471000134944916},{"id":"https://openalex.org/keywords/reduction","display_name":"Reduction (mathematics)","score":0.3968000113964081},{"id":"https://openalex.org/keywords/baseline","display_name":"Baseline (sea)","score":0.3815999925136566},{"id":"https://openalex.org/keywords/sound-quality","display_name":"Sound quality","score":0.37380000948905945},{"id":"https://openalex.org/keywords/speech-enhancement","display_name":"Speech enhancement","score":0.3727000057697296},{"id":"https://openalex.org/keywords/high-fidelity","display_name":"High fidelity","score":0.36640000343322754},{"id":"https://openalex.org/keywords/speech-synthesis","display_name":"Speech synthesis","score":0.35899999737739563}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7348999977111816},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6841999888420105},{"id":"https://openalex.org/C62897895","wikidata":"https://www.wikidata.org/wiki/Q1915482","display_name":"Mean opinion score","level":3,"score":0.6189000010490417},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4690999984741211},{"id":"https://openalex.org/C2776844415","wikidata":"https://www.wikidata.org/wiki/Q1571","display_name":"Marathi","level":2,"score":0.4643000066280365},{"id":"https://openalex.org/C2776459999","wikidata":"https://www.wikidata.org/wiki/Q2119376","display_name":"Fidelity","level":2,"score":0.4471000134944916},{"id":"https://openalex.org/C111335779","wikidata":"https://www.wikidata.org/wiki/Q3454686","display_name":"Reduction (mathematics)","level":2,"score":0.3968000113964081},{"id":"https://openalex.org/C12725497","wikidata":"https://www.wikidata.org/wiki/Q810247","display_name":"Baseline (sea)","level":2,"score":0.3815999925136566},{"id":"https://openalex.org/C167310288","wikidata":"https://www.wikidata.org/wiki/Q7564808","display_name":"Sound quality","level":2,"score":0.37380000948905945},{"id":"https://openalex.org/C2776182073","wikidata":"https://www.wikidata.org/wiki/Q7575395","display_name":"Speech enhancement","level":3,"score":0.3727000057697296},{"id":"https://openalex.org/C113364801","wikidata":"https://www.wikidata.org/wiki/Q26674","display_name":"High fidelity","level":2,"score":0.36640000343322754},{"id":"https://openalex.org/C14999030","wikidata":"https://www.wikidata.org/wiki/Q16346","display_name":"Speech synthesis","level":2,"score":0.35899999737739563},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.35839998722076416},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.3506999909877777},{"id":"https://openalex.org/C2779581591","wikidata":"https://www.wikidata.org/wiki/Q36244","display_name":"Vowel","level":2,"score":0.3230000138282776},{"id":"https://openalex.org/C2776036281","wikidata":"https://www.wikidata.org/wiki/Q48769818","display_name":"Constraint (computer-aided design)","level":2,"score":0.32030001282691956},{"id":"https://openalex.org/C126780896","wikidata":"https://www.wikidata.org/wiki/Q899871","display_name":"Distortion (music)","level":4,"score":0.3077999949455261},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.3061000108718872},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.30309998989105225},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.30239999294281006},{"id":"https://openalex.org/C140779682","wikidata":"https://www.wikidata.org/wiki/Q210868","display_name":"Sampling (signal processing)","level":3,"score":0.3010999858379364},{"id":"https://openalex.org/C2777303404","wikidata":"https://www.wikidata.org/wiki/Q759757","display_name":"Convergence (economics)","level":2,"score":0.29179999232292175},{"id":"https://openalex.org/C14036430","wikidata":"https://www.wikidata.org/wiki/Q3736076","display_name":"Function (biology)","level":2,"score":0.2822999954223633},{"id":"https://openalex.org/C2777639682","wikidata":"https://www.wikidata.org/wiki/Q225957","display_name":"Dysarthria","level":2,"score":0.2745000123977661},{"id":"https://openalex.org/C137800194","wikidata":"https://www.wikidata.org/wiki/Q11713455","display_name":"Interpolation (computer graphics)","level":3,"score":0.2700999975204468},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.2630000114440918},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.2563000023365021},{"id":"https://openalex.org/C155635449","wikidata":"https://www.wikidata.org/wiki/Q4674699","display_name":"Acoustic model","level":3,"score":0.25589999556541443},{"id":"https://openalex.org/C204201278","wikidata":"https://www.wikidata.org/wiki/Q1332614","display_name":"Voice activity detection","level":3,"score":0.25130000710487366}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/ialp68296.2024.11156535","is_oa":false,"landing_page_url":"https://doi.org/10.1109/ialp68296.2024.11156535","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 International Conference on Asian Language Processing (IALP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":12,"referenced_works":["https://openalex.org/W1552314771","https://openalex.org/W2107860279","https://openalex.org/W2141998673","https://openalex.org/W2327501763","https://openalex.org/W2593414223","https://openalex.org/W2963073614","https://openalex.org/W2963300588","https://openalex.org/W3097828251","https://openalex.org/W3160326269","https://openalex.org/W3196475561","https://openalex.org/W3197273793","https://openalex.org/W4299627282"],"related_works":[],"abstract_inverted_index":{"Vaachika":[0,108,134],"Marathi":[1],"introduces":[2],"a":[3,14,18,82,112,122,138],"neural":[4],"vocoder":[5,21],"for":[6],"high-fidelity":[7],"speech":[8,26],"synthesis.":[9],"Our":[10],"proposed":[11,32,133],"model":[12,33,110,136],"presents":[13],"significant":[15,123],"enhancement":[16],"to":[17,43,65,73,96,160,174],"baseline":[19,127],"GAN-based":[20],"by":[22,158,171],"focusing":[23],"on":[24,114,144],"improving":[25,154],"quality":[27,156],"and":[28,48,56,72,77,87,93,100,166],"training":[29,75,142],"stability.":[30,78],"The":[31,103,132],"includes":[34],"78":[35],"layers":[36],"within":[37],"the":[38,61,67,107,115,126,175],"generators'":[39],"architecture":[40],"in":[41,141],"order":[42],"capture":[44],"complex":[45],"linguistic":[46],"features":[47,51],"preserve":[49],"phonetic":[50],"(e.g.,":[52],"intonations,":[53],"vowel":[54],"nasalizations,":[55],"pronunciations).":[57],"Additionally,":[58],"we":[59],"adopted":[60],"ReLU6":[62],"activation":[63],"function":[64],"reduce":[66],"risk":[68],"of":[69,130,164],"exploding":[70],"gradients":[71],"enhance":[74],"convergence":[76],"Generated":[79],"samples":[80,105],"at":[81],"22.04":[83],"kHz":[84],"sampling":[85],"rate":[86],"evaluated":[88],"them":[89],"with":[90,121],"subjective,":[91],"objective,":[92],"quantitative":[94],"measures":[95],"assess":[97],"their":[98],"fidelity":[99],"perceptual":[101],"quality.":[102],"generated":[104],"from":[106],"V1":[109],"achieved":[111],"4.65":[113],"5-scale":[116],"Mean":[117],"Opinion":[118],"Score":[119],"(MOS),":[120],"improvement":[124],"over":[125],"model's":[128],"score":[129],"+0.29.":[131],"V2":[135],"achieves":[137],"51.10%":[139],"reduction":[140],"time":[143],"an":[145],"NVIDIA":[146],"Quadro":[147],"XP100":[148],"16":[149],"GB":[150],"VRAM":[151],"GPU,":[152],"while":[153],"audio":[155],"metrics":[157],"up":[159],"+2.006":[161],"(in":[162],"terms":[163],"PESQ),":[165],"reducing":[167],"spectral":[168],"distortion":[169],"(MCD)":[170],"75.6%,":[172],"compared":[173],"baseline.":[176]},"counts_by_year":[{"year":2026,"cited_by_count":1}],"updated_date":"2026-04-21T08:09:41.155169","created_date":"2025-10-10T00:00:00"}
