{"id":"https://openalex.org/W4416799361","doi":"https://doi.org/10.1109/apsipaasc65261.2025.11249045","title":"BAANI: A 296M-Parameter Neural Vocoder for End-To-End Punjabi Speech Synthesis","display_name":"BAANI: A 296M-Parameter Neural Vocoder for End-To-End Punjabi Speech Synthesis","publication_year":2025,"publication_date":"2025-10-22","ids":{"openalex":"https://openalex.org/W4416799361","doi":"https://doi.org/10.1109/apsipaasc65261.2025.11249045"},"language":null,"primary_location":{"id":"doi:10.1109/apsipaasc65261.2025.11249045","is_oa":false,"landing_page_url":"https://doi.org/10.1109/apsipaasc65261.2025.11249045","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 Asia Pacific Signal and Information Processing Association Annual Summit and Conference (APSIPA ASC)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5049511943","display_name":"Siddharth Krishna Kumar","orcid":"https://orcid.org/0000-0003-3606-5060"},"institutions":[{"id":"https://openalex.org/I98389781","display_name":"Dhirubhai Ambani Institute of Information and Communication Technology","ror":"https://ror.org/02d5b7g69","country_code":"IN","type":"education","lineage":["https://openalex.org/I98389781"]}],"countries":["IN"],"is_corresponding":true,"raw_author_name":"Siddharth Kumar","raw_affiliation_strings":["Dhirubhai Ambani University (formerly DA-IICT),Speech Research Lab,Gandhinagar,GJ,India"],"affiliations":[{"raw_affiliation_string":"Dhirubhai Ambani University (formerly DA-IICT),Speech Research Lab,Gandhinagar,GJ,India","institution_ids":["https://openalex.org/I98389781"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5013098603","display_name":"Nisarg Trivedi","orcid":null},"institutions":[{"id":"https://openalex.org/I98389781","display_name":"Dhirubhai Ambani Institute of Information and Communication Technology","ror":"https://ror.org/02d5b7g69","country_code":"IN","type":"education","lineage":["https://openalex.org/I98389781"]}],"countries":["IN"],"is_corresponding":false,"raw_author_name":"Nisarg Trivedi","raw_affiliation_strings":["Dhirubhai Ambani University (formerly DA-IICT),Speech Research Lab,Gandhinagar,GJ,India"],"affiliations":[{"raw_affiliation_string":"Dhirubhai Ambani University (formerly DA-IICT),Speech Research Lab,Gandhinagar,GJ,India","institution_ids":["https://openalex.org/I98389781"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5114974321","display_name":"Ravindrakumar M. Purohit","orcid":null},"institutions":[{"id":"https://openalex.org/I98389781","display_name":"Dhirubhai Ambani Institute of Information and Communication Technology","ror":"https://ror.org/02d5b7g69","country_code":"IN","type":"education","lineage":["https://openalex.org/I98389781"]}],"countries":["IN"],"is_corresponding":false,"raw_author_name":"Ravindrakumar M. Purohit","raw_affiliation_strings":["Dhirubhai Ambani University (formerly DA-IICT),Speech Research Lab,Gandhinagar,GJ,India"],"affiliations":[{"raw_affiliation_string":"Dhirubhai Ambani University (formerly DA-IICT),Speech Research Lab,Gandhinagar,GJ,India","institution_ids":["https://openalex.org/I98389781"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5113880393","display_name":"Hemant A. Patil","orcid":null},"institutions":[{"id":"https://openalex.org/I98389781","display_name":"Dhirubhai Ambani Institute of Information and Communication Technology","ror":"https://ror.org/02d5b7g69","country_code":"IN","type":"education","lineage":["https://openalex.org/I98389781"]}],"countries":["IN"],"is_corresponding":false,"raw_author_name":"Hemant A. Patil","raw_affiliation_strings":["Dhirubhai Ambani University (formerly DA-IICT),Speech Research Lab,Gandhinagar,GJ,India"],"affiliations":[{"raw_affiliation_string":"Dhirubhai Ambani University (formerly DA-IICT),Speech Research Lab,Gandhinagar,GJ,India","institution_ids":["https://openalex.org/I98389781"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5049511943"],"corresponding_institution_ids":["https://openalex.org/I98389781"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.2072358,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"897","last_page":"902"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.8999999761581421,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.8999999761581421,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10403","display_name":"Phonetics and Phonology Research","score":0.020099999383091927,"subfield":{"id":"https://openalex.org/subfields/3205","display_name":"Experimental and Cognitive Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.010999999940395355,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/naturalness","display_name":"Naturalness","score":0.8529999852180481},{"id":"https://openalex.org/keywords/speech-synthesis","display_name":"Speech synthesis","score":0.6500999927520752},{"id":"https://openalex.org/keywords/intelligibility","display_name":"Intelligibility (philosophy)","score":0.6489999890327454},{"id":"https://openalex.org/keywords/mean-opinion-score","display_name":"Mean opinion score","score":0.6413000226020813},{"id":"https://openalex.org/keywords/artificial-neural-network","display_name":"Artificial neural network","score":0.5069000124931335},{"id":"https://openalex.org/keywords/speech-processing","display_name":"Speech processing","score":0.39149999618530273},{"id":"https://openalex.org/keywords/perception","display_name":"Perception","score":0.37130001187324524}],"concepts":[{"id":"https://openalex.org/C134537474","wikidata":"https://www.wikidata.org/wiki/Q17144832","display_name":"Naturalness","level":2,"score":0.8529999852180481},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7544000148773193},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6887000203132629},{"id":"https://openalex.org/C14999030","wikidata":"https://www.wikidata.org/wiki/Q16346","display_name":"Speech synthesis","level":2,"score":0.6500999927520752},{"id":"https://openalex.org/C60048801","wikidata":"https://www.wikidata.org/wiki/Q1433889","display_name":"Intelligibility (philosophy)","level":2,"score":0.6489999890327454},{"id":"https://openalex.org/C62897895","wikidata":"https://www.wikidata.org/wiki/Q1915482","display_name":"Mean opinion score","level":3,"score":0.6413000226020813},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.5069000124931335},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.39149999618530273},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3865000009536743},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.37130001187324524},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3677999973297119},{"id":"https://openalex.org/C188414643","wikidata":"https://www.wikidata.org/wiki/Q3001183","display_name":"Harmonics","level":3,"score":0.34779998660087585},{"id":"https://openalex.org/C91863865","wikidata":"https://www.wikidata.org/wiki/Q4349497","display_name":"Speech corpus","level":3,"score":0.3294000029563904},{"id":"https://openalex.org/C2984842247","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep neural networks","level":3,"score":0.31439998745918274},{"id":"https://openalex.org/C105339364","wikidata":"https://www.wikidata.org/wiki/Q2297740","display_name":"Software deployment","level":2,"score":0.3070000112056732},{"id":"https://openalex.org/C542774811","wikidata":"https://www.wikidata.org/wiki/Q10880526","display_name":"Prosody","level":2,"score":0.2903999984264374},{"id":"https://openalex.org/C504749915","wikidata":"https://www.wikidata.org/wiki/Q9010971","display_name":"Speech technology","level":3,"score":0.2775000035762787},{"id":"https://openalex.org/C175291020","wikidata":"https://www.wikidata.org/wiki/Q1156822","display_name":"Offset (computer science)","level":2,"score":0.27129998803138733},{"id":"https://openalex.org/C99209842","wikidata":"https://www.wikidata.org/wiki/Q643696","display_name":"Speech perception","level":3,"score":0.2703999876976013}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/apsipaasc65261.2025.11249045","is_oa":false,"landing_page_url":"https://doi.org/10.1109/apsipaasc65261.2025.11249045","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 Asia Pacific Signal and Information Processing Association Annual Summit and Conference (APSIPA ASC)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":12,"referenced_works":["https://openalex.org/W1552314771","https://openalex.org/W2107860279","https://openalex.org/W2141998673","https://openalex.org/W2327501763","https://openalex.org/W2963300588","https://openalex.org/W2964243274","https://openalex.org/W3015338123","https://openalex.org/W3097828251","https://openalex.org/W3196475561","https://openalex.org/W3197273793","https://openalex.org/W4285605725","https://openalex.org/W4299627282"],"related_works":[],"abstract_inverted_index":{"In":[0],"this":[1],"work,":[2],"we":[3],"present":[4],"BAANI,":[5],"a":[6,31,77],"neural":[7,89],"vocoder":[8,162],"comprising":[9],"296":[10],"million":[11],"parameters,":[12],"designed":[13],"for":[14],"end-to-end":[15],"speech":[16,127],"synthesis":[17],"(SS)":[18],"in":[19],"the":[20,24,41,93,101,107,118,143,149,161,177],"Punjabi":[21,35,56,182],"language.":[22],"Recognizing":[23],"unique":[25],"phonetic":[26],"and":[27,33,43,58,66,83,137,148,167],"prosodic":[28],"characteristics":[29],"of":[30,45,95,132,180],"low-resource":[32],"underrepresented":[34],"language,":[36],"BAANI":[37,123],"aims":[38],"to":[39,91],"enhance":[40],"naturalness":[42],"intelligibility":[44],"synthesized":[46],"speech.":[47],"The":[48],"proposed":[49],"model":[50],"is":[51],"trained":[52],"on":[53,100,142],"an":[54,61,67],"IndicTTS":[55],"corpus":[57],"evaluated":[59],"using":[60],"NVIDIA":[62,144],"GTX":[63,145],"1080":[64,146],"GPU":[65,147],"Intel":[68,150],"Core":[69,151],"i7":[70],"12th":[71],"Gen":[72],"CPU-powered":[73],"system.":[74],"We":[75],"conducted":[76],"comparative":[78],"analysis":[79],"(e.g.,":[80],"subjective,":[81],"objective,":[82],"quantitative":[84],"metrics)":[85],"with":[86,128,176],"several":[87],"state-of-the-art":[88],"vocoders":[90],"validate":[92],"effectiveness":[94],"BAANI.":[96],"It":[97],"achieved":[98],"4.18":[99],"Mean":[102],"Opinion":[103],"Score":[104],"(MOS).":[105],"On":[106],"other":[108],"side,":[109],"from":[110],"five":[111],"different":[112],"objective":[113],"measures,":[114],"all":[115],"measures":[116],"surpass":[117],"current":[119],"SOTA":[120],"models.":[121],"Furthermore,":[122],"generates":[124],"22.05":[125],"kHz":[126],"real-time":[129],"factors":[130],"(RTF)":[131],"<tex":[133,138,170],"xmlns:mml=\"http://www.w3.org/1998/Math/MathML\"":[134,139,171],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">$3.2":[135],"\\times$</tex>":[136,141],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">$1.5":[140],"i7-12700":[152],"CPU,":[153],"respectively,":[154],"demonstrating":[155],"its":[156],"practical":[157],"deployment":[158],"potential.":[159],"Notably,":[160],"effectively":[163],"preserves":[164],"high-frequency":[165],"harmonics":[166],"pitch":[168],"(i.e.,":[169],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">$F_{0}$</tex>)":[172],"contours,":[173],"aligning":[174],"closely":[175],"perceptual":[178],"preferences":[179],"native":[181],"speakers.":[183]},"counts_by_year":[],"updated_date":"2026-03-07T16:01:11.037858","created_date":"2025-11-28T00:00:00"}
