{"id":"https://openalex.org/W7127393331","doi":"https://doi.org/10.48550/arxiv.2602.01793","title":"ParaGSE: Parallel Generative Speech Enhancement with Group-Vector-Quantization-based Neural Speech Codec","display_name":"ParaGSE: Parallel Generative Speech Enhancement with Group-Vector-Quantization-based Neural Speech Codec","publication_year":2026,"publication_date":"2026-02-02","ids":{"openalex":"https://openalex.org/W7127393331","doi":"https://doi.org/10.48550/arxiv.2602.01793"},"language":null,"primary_location":{"id":"pmh:doi:10.48550/arxiv.2602.01793","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":null,"any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5124885901","display_name":"Fei Fei Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Fei","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5124890934","display_name":"Yang Ai","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ai, Yang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":2,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9890999794006348,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9890999794006348,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.0031999999191612005,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10283","display_name":"Hearing Loss and Rehabilitation","score":0.002400000113993883,"subfield":{"id":"https://openalex.org/subfields/2805","display_name":"Cognitive Neuroscience"},"field":{"id":"https://openalex.org/fields/28","display_name":"Neuroscience"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/speech-enhancement","display_name":"Speech enhancement","score":0.635699987411499},{"id":"https://openalex.org/keywords/security-token","display_name":"Security token","score":0.618399977684021},{"id":"https://openalex.org/keywords/codec2","display_name":"Codec2","score":0.6176999807357788},{"id":"https://openalex.org/keywords/speech-coding","display_name":"Speech coding","score":0.5978000164031982},{"id":"https://openalex.org/keywords/discriminative-model","display_name":"Discriminative model","score":0.5649999976158142},{"id":"https://openalex.org/keywords/codec","display_name":"Codec","score":0.5602999925613403},{"id":"https://openalex.org/keywords/speech-processing","display_name":"Speech processing","score":0.5468000173568726},{"id":"https://openalex.org/keywords/adaptive-multi-rate-audio-codec","display_name":"Adaptive Multi-Rate audio codec","score":0.5321999788284302},{"id":"https://openalex.org/keywords/voice-activity-detection","display_name":"Voice activity detection","score":0.5284000039100647}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7662000060081482},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6988000273704529},{"id":"https://openalex.org/C2776182073","wikidata":"https://www.wikidata.org/wiki/Q7575395","display_name":"Speech enhancement","level":3,"score":0.635699987411499},{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.618399977684021},{"id":"https://openalex.org/C75217168","wikidata":"https://www.wikidata.org/wiki/Q1105653","display_name":"Codec2","level":4,"score":0.6176999807357788},{"id":"https://openalex.org/C13895895","wikidata":"https://www.wikidata.org/wiki/Q3270773","display_name":"Speech coding","level":2,"score":0.5978000164031982},{"id":"https://openalex.org/C97931131","wikidata":"https://www.wikidata.org/wiki/Q5282087","display_name":"Discriminative model","level":2,"score":0.5649999976158142},{"id":"https://openalex.org/C161765866","wikidata":"https://www.wikidata.org/wiki/Q184748","display_name":"Codec","level":2,"score":0.5602999925613403},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.5468000173568726},{"id":"https://openalex.org/C177067256","wikidata":"https://www.wikidata.org/wiki/Q4676210","display_name":"Adaptive Multi-Rate audio codec","level":4,"score":0.5321999788284302},{"id":"https://openalex.org/C204201278","wikidata":"https://www.wikidata.org/wiki/Q1332614","display_name":"Voice activity detection","level":3,"score":0.5284000039100647},{"id":"https://openalex.org/C66746571","wikidata":"https://www.wikidata.org/wiki/Q1134833","display_name":"ENCODE","level":3,"score":0.41850000619888306},{"id":"https://openalex.org/C28855332","wikidata":"https://www.wikidata.org/wiki/Q198099","display_name":"Quantization (signal processing)","level":2,"score":0.4097000062465668},{"id":"https://openalex.org/C45374587","wikidata":"https://www.wikidata.org/wiki/Q12525525","display_name":"Computation","level":2,"score":0.3869999945163727},{"id":"https://openalex.org/C199833920","wikidata":"https://www.wikidata.org/wiki/Q612536","display_name":"Vector quantization","level":2,"score":0.36980000138282776},{"id":"https://openalex.org/C167966045","wikidata":"https://www.wikidata.org/wiki/Q5532625","display_name":"Generative model","level":3,"score":0.3646000027656555},{"id":"https://openalex.org/C59883199","wikidata":"https://www.wikidata.org/wiki/Q1826438","display_name":"Linear predictive coding","level":3,"score":0.3637999892234802},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3617999851703644},{"id":"https://openalex.org/C14999030","wikidata":"https://www.wikidata.org/wiki/Q16346","display_name":"Speech synthesis","level":2,"score":0.3513999879360199},{"id":"https://openalex.org/C77637269","wikidata":"https://www.wikidata.org/wiki/Q7002051","display_name":"Neural coding","level":2,"score":0.33480000495910645},{"id":"https://openalex.org/C39890363","wikidata":"https://www.wikidata.org/wiki/Q36108","display_name":"Generative grammar","level":2,"score":0.3337000012397766},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.3255999982357025},{"id":"https://openalex.org/C125411270","wikidata":"https://www.wikidata.org/wiki/Q18653","display_name":"Encoding (memory)","level":2,"score":0.30730000138282776},{"id":"https://openalex.org/C108699837","wikidata":"https://www.wikidata.org/wiki/Q7120750","display_name":"PSQM","level":4,"score":0.2825999855995178}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:doi:10.48550/arxiv.2602.01793","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},{"id":"doi:10.48550/arxiv.2602.01793","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2602.01793","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:doi:10.48550/arxiv.2602.01793","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/10","display_name":"Reduced inequalities","score":0.7178062200546265}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Recently,":[0],"generative":[1,32,115,152],"speech":[2,20,33,45,74,95,109,153],"enhancement":[3,34,154],"has":[4],"garnered":[5],"considerable":[6],"interest;":[7],"however,":[8],"existing":[9],"approaches":[10],"are":[11],"hindered":[12],"by":[13,132],"excessive":[14],"complexity,":[15],"limited":[16],"efficiency,":[17],"and":[18,91,114,127],"suboptimal":[19],"quality.":[21],"To":[22],"overcome":[23],"these":[24],"challenges,":[25],"this":[26],"paper":[27],"proposes":[28],"a":[29,39,118,141],"novel":[30],"parallel":[31,60,84,133],"(ParaGSE)":[35],"framework":[36],"that":[37,103],"leverages":[38,67],"group":[40],"vector":[41],"quantization":[42],"(GVQ)-based":[43],"neural":[44],"codec.":[46],"The":[47],"GVQ-based":[48,69],"codec":[49,70,98],"adopts":[50],"separate":[51],"VQs":[52],"to":[53,71,111],"produce":[54],"mutually":[55],"independent":[56],"tokens,":[57,77],"enabling":[58],"efficient":[59],"token":[61,136],"prediction":[62],"in":[63,135,144],"ParaGSE.":[64],"Specifically,":[65],"ParaGSE":[66,104,138],"the":[68,79,97],"encode":[72],"degraded":[73,88],"into":[75],"distinct":[76],"predicts":[78],"corresponding":[80],"clean":[81,94],"tokens":[82],"through":[83],"branches":[85],"conditioned":[86],"on":[87,147],"spectral":[89],"features,":[90],"ultimately":[92],"reconstructs":[93],"via":[96],"decoder.":[99],"Experimental":[100],"results":[101],"demonstrate":[102],"consistently":[105],"produces":[106],"superior":[107],"enhanced":[108],"compared":[110,149],"both":[112],"discriminative":[113],"baselines,":[116],"under":[117],"wide":[119],"range":[120],"of":[121],"distortions":[122],"including":[123],"noise,":[124],"reverberation,":[125],"band-limiting,":[126],"their":[128],"mixtures.":[129],"Furthermore,":[130],"empowered":[131],"computation":[134],"prediction,":[137],"attains":[139],"about":[140],"1.5-fold":[142],"improvement":[143],"generation":[145],"efficiency":[146],"CPU":[148],"with":[150],"serial":[151],"approaches.":[155]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-02-04T00:00:00"}
