{"id":"https://openalex.org/W7155373618","doi":"https://doi.org/10.48550/arxiv.2604.20842","title":"SpeechParaling-Bench: A Comprehensive Benchmark for Paralinguistic-Aware Speech Generation","display_name":"SpeechParaling-Bench: A Comprehensive Benchmark for Paralinguistic-Aware Speech Generation","publication_year":2026,"publication_date":"2026-04-22","ids":{"openalex":"https://openalex.org/W7155373618","doi":"https://doi.org/10.48550/arxiv.2604.20842"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.20842","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.20842","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Preprint"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.20842","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5134461576","display_name":"Ruohan Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Ruohan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5073775343","display_name":"Shukang Yin","orcid":"https://orcid.org/0000-0002-5356-1800"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yin, Shukang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100453716","display_name":"Tao Wang","orcid":"https://orcid.org/0000-0003-4535-3830"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Tao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134426015","display_name":"Dong Yan Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Dong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5113987256","display_name":"Weiji Zhuang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhuang, Weiji","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134402501","display_name":"Shuhuai Ren","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ren, Shuhuai","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134435253","display_name":"Ran He","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"He, Ran","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134386007","display_name":"Caifeng Shan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shan, Caifeng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5014172220","display_name":"Chaoyou Fu","orcid":"https://orcid.org/0000-0002-0079-7668"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Fu, Chaoyou","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":0,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.3776000142097473,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.3776000142097473,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12031","display_name":"Speech and dialogue systems","score":0.2207999974489212,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12128","display_name":"AI in Service Interactions","score":0.1136000007390976,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/paralanguage","display_name":"Paralanguage","score":0.972599983215332},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.6051999926567078},{"id":"https://openalex.org/keywords/pairwise-comparison","display_name":"Pairwise comparison","score":0.5004000067710876},{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.3776000142097473},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.3456999957561493},{"id":"https://openalex.org/keywords/mel-frequency-cepstrum","display_name":"Mel-frequency cepstrum","score":0.34150001406669617},{"id":"https://openalex.org/keywords/framing","display_name":"Framing (construction)","score":0.3352999985218048}],"concepts":[{"id":"https://openalex.org/C133378560","wikidata":"https://www.wikidata.org/wiki/Q1753225","display_name":"Paralanguage","level":2,"score":0.972599983215332},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7002999782562256},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.6051999926567078},{"id":"https://openalex.org/C184898388","wikidata":"https://www.wikidata.org/wiki/Q1435712","display_name":"Pairwise comparison","level":2,"score":0.5004000067710876},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.45559999346733093},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.44339999556541443},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.391400009393692},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.3776000142097473},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.3456999957561493},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3433000147342682},{"id":"https://openalex.org/C151989614","wikidata":"https://www.wikidata.org/wiki/Q440370","display_name":"Mel-frequency cepstrum","level":3,"score":0.34150001406669617},{"id":"https://openalex.org/C169087156","wikidata":"https://www.wikidata.org/wiki/Q2131593","display_name":"Framing (construction)","level":2,"score":0.3352999985218048},{"id":"https://openalex.org/C2775924081","wikidata":"https://www.wikidata.org/wiki/Q55608371","display_name":"Control (management)","level":2,"score":0.31779998540878296},{"id":"https://openalex.org/C2779439875","wikidata":"https://www.wikidata.org/wiki/Q1078276","display_name":"Natural language understanding","level":3,"score":0.30880001187324524},{"id":"https://openalex.org/C12725497","wikidata":"https://www.wikidata.org/wiki/Q810247","display_name":"Baseline (sea)","level":2,"score":0.28529998660087585},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.2815999984741211},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.2540999948978424},{"id":"https://openalex.org/C202889954","wikidata":"https://www.wikidata.org/wiki/Q1139554","display_name":"Subjectivity","level":2,"score":0.2533999979496002}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.20842","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.20842","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"Preprint"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.20842","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.20842","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Preprint"},"sustainable_development_goals":[{"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4","score":0.5690885186195374}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Paralinguistic":[0],"cues":[1,158],"are":[2,94],"essential":[3],"for":[4,38,160,172],"natural":[5],"human-computer":[6],"interaction,":[7],"yet":[8],"their":[9],"evaluation":[10,106],"in":[11,90,134,164],"Large":[12],"Audio-Language":[13],"Models":[14],"(LALMs)":[15],"remains":[16],"limited":[17],"by":[18,56,100],"coarse":[19],"feature":[20],"coverage":[21,45],"and":[22,64,76,118,122,146],"the":[23,170],"inherent":[24],"subjectivity":[25,117],"of":[26,149,162],"assessment.":[27],"To":[28,79],"address":[29],"these":[30],"challenges,":[31],"we":[32,83],"introduce":[33],"SpeechParaling-Bench,":[34],"a":[35,86,97],"comprehensive":[36,143],"benchmark":[37],"paralinguistic-aware":[39],"speech":[40,62],"generation.":[41],"It":[42],"expands":[43],"existing":[44],"from":[46],"fewer":[47],"than":[48,58,111],"50":[49],"to":[50,154],"over":[51],"100":[52],"fine-grained":[53,72],"features,":[54,151],"supported":[55],"more":[57,120,173],"1,000":[59],"English-Chinese":[60],"parallel":[61],"queries,":[63],"is":[65],"organized":[66],"into":[67],"three":[68],"progressively":[69],"challenging":[70],"tasks:":[71],"control,":[73],"intra-utterance":[74],"variation,":[75],"context-aware":[77],"adaptation.":[78],"enable":[80],"reliable":[81],"evaluation,":[82],"further":[84],"develop":[85],"pairwise":[87],"comparison":[88],"pipeline,":[89],"which":[91],"candidate":[92],"responses":[93],"evaluated":[95],"against":[96],"fixed":[98],"baseline":[99],"an":[101],"LALM-based":[102],"judge.":[103],"By":[104],"framing":[105],"as":[107],"relative":[108],"preference":[109],"rather":[110],"absolute":[112],"scoring,":[113],"this":[114],"approach":[115],"mitigates":[116],"yields":[119],"stable":[121],"scalable":[123],"assessments":[124],"without":[125],"costly":[126],"human":[127],"annotation.":[128],"Extensive":[129],"experiments":[130],"reveal":[131],"substantial":[132],"limitations":[133],"current":[135],"LALMs.":[136],"Even":[137],"leading":[138],"proprietary":[139],"models":[140],"struggle":[141],"with":[142],"static":[144],"control":[145],"dynamic":[147],"modulation":[148],"paralinguistic":[150,157,175],"while":[152],"failure":[153],"correctly":[155],"interpret":[156],"accounts":[159],"43.3%":[161],"errors":[163],"situational":[165],"dialogue.":[166],"These":[167],"findings":[168],"underscore":[169],"need":[171],"robust":[174],"modeling":[176],"toward":[177],"human-aligned":[178],"voice":[179],"assistants.":[180]},"counts_by_year":[],"updated_date":"2026-07-01T06:00:48.157686","created_date":"2026-04-24T00:00:00"}
