{"id":"https://openalex.org/W4402352797","doi":"https://doi.org/10.1109/ijcnn60899.2024.10651390","title":"Generative Spoken Language Modeling with Quantized Feature Enhancement","display_name":"Generative Spoken Language Modeling with Quantized Feature Enhancement","publication_year":2024,"publication_date":"2024-06-30","ids":{"openalex":"https://openalex.org/W4402352797","doi":"https://doi.org/10.1109/ijcnn60899.2024.10651390"},"language":"en","primary_location":{"id":"doi:10.1109/ijcnn60899.2024.10651390","is_oa":false,"landing_page_url":"http://dx.doi.org/10.1109/ijcnn60899.2024.10651390","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2024 International Joint Conference on Neural Networks (IJCNN)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Feiyu Duan","orcid":null},"institutions":[{"id":"https://openalex.org/I82880672","display_name":"Beihang University","ror":"https://ror.org/00wk2mp56","country_code":"CN","type":"education","lineage":["https://openalex.org/I82880672"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Feiyu Duan","raw_affiliation_strings":["Beihang University,Sino-French Engineer School,China"],"affiliations":[{"raw_affiliation_string":"Beihang University,Sino-French Engineer School,China","institution_ids":["https://openalex.org/I82880672"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100369785","display_name":"Chen Li","orcid":"https://orcid.org/0000-0003-2523-9141"},"institutions":[{"id":"https://openalex.org/I82880672","display_name":"Beihang University","ror":"https://ror.org/00wk2mp56","country_code":"CN","type":"education","lineage":["https://openalex.org/I82880672"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Chen Li","raw_affiliation_strings":["Beihang University,School of Computer Science and Engineering,China"],"affiliations":[{"raw_affiliation_string":"Beihang University,School of Computer Science and Engineering,China","institution_ids":["https://openalex.org/I82880672"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5082423788","display_name":"Keheng Wang","orcid":null},"institutions":[{"id":"https://openalex.org/I82880672","display_name":"Beihang University","ror":"https://ror.org/00wk2mp56","country_code":"CN","type":"education","lineage":["https://openalex.org/I82880672"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Keheng Wang","raw_affiliation_strings":["Beihang University,Sino-French Engineer School,China"],"affiliations":[{"raw_affiliation_string":"Beihang University,Sino-French Engineer School,China","institution_ids":["https://openalex.org/I82880672"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5080312610","display_name":"Si Wu","orcid":"https://orcid.org/0000-0003-0188-1121"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Si Wu","raw_affiliation_strings":["Orange R&#x0026;D Beijing Company Limited,Beijing,China"],"affiliations":[{"raw_affiliation_string":"Orange R&#x0026;D Beijing Company Limited,Beijing,China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5079167910","display_name":"Chuantao Yin","orcid":"https://orcid.org/0000-0002-0742-0804"},"institutions":[{"id":"https://openalex.org/I82880672","display_name":"Beihang University","ror":"https://ror.org/00wk2mp56","country_code":"CN","type":"education","lineage":["https://openalex.org/I82880672"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Chuantao Yin","raw_affiliation_strings":["Beihang University,Sino-French Engineer School,China"],"affiliations":[{"raw_affiliation_string":"Beihang University,Sino-French Engineer School,China","institution_ids":["https://openalex.org/I82880672"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5055420596","display_name":"Wenge Rong","orcid":"https://orcid.org/0000-0002-4229-7215"},"institutions":[{"id":"https://openalex.org/I82880672","display_name":"Beihang University","ror":"https://ror.org/00wk2mp56","country_code":"CN","type":"education","lineage":["https://openalex.org/I82880672"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Wenge Rong","raw_affiliation_strings":["Beihang University,School of Computer Science and Engineering,China"],"affiliations":[{"raw_affiliation_string":"Beihang University,School of Computer Science and Engineering,China","institution_ids":["https://openalex.org/I82880672"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":6,"corresponding_author_ids":[],"corresponding_institution_ids":["https://openalex.org/I82880672"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.12729997,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"202","issue":null,"first_page":"1","last_page":"8"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9995999932289124,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7602020502090454},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.6552918553352356},{"id":"https://openalex.org/keywords/spoken-language","display_name":"Spoken language","score":0.6347655653953552},{"id":"https://openalex.org/keywords/generative-grammar","display_name":"Generative grammar","score":0.6074316501617432},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.5608361959457397},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.5001034736633301},{"id":"https://openalex.org/keywords/generative-model","display_name":"Generative model","score":0.4407532215118408},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.42783018946647644},{"id":"https://openalex.org/keywords/linguistics","display_name":"Linguistics","score":0.32244932651519775}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7602020502090454},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.6552918553352356},{"id":"https://openalex.org/C2776230583","wikidata":"https://www.wikidata.org/wiki/Q1322198","display_name":"Spoken language","level":2,"score":0.6347655653953552},{"id":"https://openalex.org/C39890363","wikidata":"https://www.wikidata.org/wiki/Q36108","display_name":"Generative grammar","level":2,"score":0.6074316501617432},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.5608361959457397},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5001034736633301},{"id":"https://openalex.org/C167966045","wikidata":"https://www.wikidata.org/wiki/Q5532625","display_name":"Generative model","level":3,"score":0.4407532215118408},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.42783018946647644},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.32244932651519775},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/ijcnn60899.2024.10651390","is_oa":false,"landing_page_url":"http://dx.doi.org/10.1109/ijcnn60899.2024.10651390","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2024 International Joint Conference on Neural Networks (IJCNN)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"score":0.6499999761581421,"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education"}],"awards":[],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":41,"referenced_works":["https://openalex.org/W1494198834","https://openalex.org/W2091735588","https://openalex.org/W2752796333","https://openalex.org/W2842511635","https://openalex.org/W2913668833","https://openalex.org/W2933138175","https://openalex.org/W2963300588","https://openalex.org/W2964243274","https://openalex.org/W3016021263","https://openalex.org/W3095199334","https://openalex.org/W3140429000","https://openalex.org/W3148101939","https://openalex.org/W3198217962","https://openalex.org/W3209059054","https://openalex.org/W3215615641","https://openalex.org/W4210433094","https://openalex.org/W4226033575","https://openalex.org/W4287802874","https://openalex.org/W4287854499","https://openalex.org/W4297808394","https://openalex.org/W4311481261","https://openalex.org/W4313679638","https://openalex.org/W4379539302","https://openalex.org/W4381786045","https://openalex.org/W4385822336","https://openalex.org/W4385823264","https://openalex.org/W4385823372","https://openalex.org/W4386065395","https://openalex.org/W4389524500","https://openalex.org/W4394671563","https://openalex.org/W6750489868","https://openalex.org/W6762643587","https://openalex.org/W6776218486","https://openalex.org/W6778883912","https://openalex.org/W6780218876","https://openalex.org/W6790356757","https://openalex.org/W6795949861","https://openalex.org/W6839738141","https://openalex.org/W6847568899","https://openalex.org/W6848735303","https://openalex.org/W6852859116"],"related_works":["https://openalex.org/W4365211920","https://openalex.org/W3014948380","https://openalex.org/W4380551139","https://openalex.org/W4317695495","https://openalex.org/W4287117424","https://openalex.org/W4387506531","https://openalex.org/W4238433571","https://openalex.org/W2967848559","https://openalex.org/W4299831724","https://openalex.org/W4283803360"],"abstract_inverted_index":{"In":[0],"the":[1,54,57,78,86,97,106,111,128],"absence":[2],"of":[3,99],"text,":[4],"training":[5],"generative":[6,47,79],"models":[7],"directly":[8],"on":[9,39,105],"speech":[10,27,59],"data":[11,28],"through":[12,91],"next":[13],"token":[14],"prediction":[15],"task,":[16],"similar":[17],"to":[18,35],"text-based":[19],"language":[20,49,81,88],"models,":[21],"has":[22],"demonstrated":[23],"its":[24,61],"feasibility.":[25],"However,":[26],"encompasses":[29],"more":[30],"intricate":[31],"feature":[32],"information":[33],"compared":[34],"text.":[36],"To":[37,95],"capitalize":[38],"these":[40],"additional":[41],"features,":[42],"we":[43,102],"propose":[44],"a":[45,69,92],"feature-enhanced":[46],"spoken":[48,80],"modeling":[50,82],"(fGSLM).":[51],"We":[52],"calculate":[53],"difference":[55],"between":[56],"original":[58],"and":[60,64,130,133],"normalized":[62],"version,":[63],"extract":[65],"quantized":[66],"features":[67,73],"with":[68],"VQVAE-structured":[70],"model.":[71],"These":[72],"are":[74],"subsequently":[75],"integrated":[76],"into":[77],"(GSLM)":[83],"by":[84],"fine-tuning":[85],"unit":[87],"model":[89,121],"(uLM)":[90],"multi-stream":[93],"transformer.":[94],"evaluate":[96],"effectiveness":[98],"our":[100,120],"model,":[101],"conduct":[103],"experiments":[104],"ProsAudit":[107],"evaluation":[108],"task":[109],"in":[110],"Zero":[112],"Resource":[113],"Speech":[114],"Challenge.":[115],"Experimental":[116],"results":[117],"show":[118],"that":[119],"significantly":[122],"improves":[123],"prosody":[124],"comprehension":[125],"both":[126],"at":[127],"sentence":[129],"lexical":[131],"levels,":[132],"achieves":[134],"superior":[135],"performance":[136],"against":[137],"baseline":[138],"models.":[139]},"counts_by_year":[],"updated_date":"2026-04-17T18:11:37.981687","created_date":"2025-10-10T00:00:00"}
