{"id":"https://openalex.org/W4416251549","doi":"https://doi.org/10.1109/ijcnn64981.2025.11228511","title":"V-CASS: Vision-context-aware Expressive Speech Synthesis for Enhancing User Understanding of Videos","display_name":"V-CASS: Vision-context-aware Expressive Speech Synthesis for Enhancing User Understanding of Videos","publication_year":2025,"publication_date":"2025-06-30","ids":{"openalex":"https://openalex.org/W4416251549","doi":"https://doi.org/10.1109/ijcnn64981.2025.11228511"},"language":null,"primary_location":{"id":"doi:10.1109/ijcnn64981.2025.11228511","is_oa":false,"landing_page_url":"https://doi.org/10.1109/ijcnn64981.2025.11228511","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 International Joint Conference on Neural Networks (IJCNN)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5101918812","display_name":"Qixin Wang","orcid":"https://orcid.org/0009-0009-5832-8192"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Qixin Wang","raw_affiliation_strings":["Tsinghua University,Department of Computer Science and Technology,Beijing,China"],"affiliations":[{"raw_affiliation_string":"Tsinghua University,Department of Computer Science and Technology,Beijing,China","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5026277472","display_name":"Songtao Zhou","orcid":"https://orcid.org/0009-0008-5972-3955"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Songtao Zhou","raw_affiliation_strings":["Tsinghua University,Department of Computer Science and Technology,Beijing,China"],"affiliations":[{"raw_affiliation_string":"Tsinghua University,Department of Computer Science and Technology,Beijing,China","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5033137691","display_name":"Zeyu Jin","orcid":"https://orcid.org/0000-0001-8465-8878"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zeyu Jin","raw_affiliation_strings":["Tsinghua University,Department of Computer Science and Technology,Beijing,China"],"affiliations":[{"raw_affiliation_string":"Tsinghua University,Department of Computer Science and Technology,Beijing,China","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5110430790","display_name":"Cuiting Guo","orcid":"https://orcid.org/0009-0003-5489-8004"},"institutions":[{"id":"https://openalex.org/I57206974","display_name":"New York University","ror":"https://ror.org/0190ak572","country_code":"US","type":"education","lineage":["https://openalex.org/I57206974"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Chenglin Guo","raw_affiliation_strings":["New York University,Stern School of Business,New York,USA"],"affiliations":[{"raw_affiliation_string":"New York University,Stern School of Business,New York,USA","institution_ids":["https://openalex.org/I57206974"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5073997389","display_name":"Shikun Sun","orcid":null},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Shikun Sun","raw_affiliation_strings":["Tsinghua University,Department of Computer Science and Technology,Beijing,China"],"affiliations":[{"raw_affiliation_string":"Tsinghua University,Department of Computer Science and Technology,Beijing,China","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5018252335","display_name":"Xiaoyu Qin","orcid":"https://orcid.org/0000-0002-9720-3220"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xiaoyu Qin","raw_affiliation_strings":["Tsinghua University,Department of Computer Science and Technology,Beijing,China"],"affiliations":[{"raw_affiliation_string":"Tsinghua University,Department of Computer Science and Technology,Beijing,China","institution_ids":["https://openalex.org/I99065089"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5101918812"],"corresponding_institution_ids":["https://openalex.org/I99065089"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.37471407,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"8"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.5710999965667725,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.5710999965667725,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13310","display_name":"Subtitles and Audiovisual Media","score":0.15230000019073486,"subfield":{"id":"https://openalex.org/subfields/1203","display_name":"Language and Linguistics"},"field":{"id":"https://openalex.org/fields/12","display_name":"Arts and Humanities"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T11439","display_name":"Video Analysis and Summarization","score":0.07000000029802322,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/paralanguage","display_name":"Paralanguage","score":0.8949000239372253},{"id":"https://openalex.org/keywords/meaning","display_name":"Meaning (existential)","score":0.5371000170707703},{"id":"https://openalex.org/keywords/comprehension","display_name":"Comprehension","score":0.5350000262260437},{"id":"https://openalex.org/keywords/social-media","display_name":"Social media","score":0.387800008058548},{"id":"https://openalex.org/keywords/speech-synthesis","display_name":"Speech synthesis","score":0.37770000100135803},{"id":"https://openalex.org/keywords/social-cue","display_name":"Social cue","score":0.31200000643730164},{"id":"https://openalex.org/keywords/speech-technology","display_name":"Speech technology","score":0.2937999963760376}],"concepts":[{"id":"https://openalex.org/C133378560","wikidata":"https://www.wikidata.org/wiki/Q1753225","display_name":"Paralanguage","level":2,"score":0.8949000239372253},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7562000155448914},{"id":"https://openalex.org/C2780876879","wikidata":"https://www.wikidata.org/wiki/Q3054749","display_name":"Meaning (existential)","level":2,"score":0.5371000170707703},{"id":"https://openalex.org/C511192102","wikidata":"https://www.wikidata.org/wiki/Q5156948","display_name":"Comprehension","level":2,"score":0.5350000262260437},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.4867999851703644},{"id":"https://openalex.org/C49774154","wikidata":"https://www.wikidata.org/wiki/Q131765","display_name":"Multimedia","level":1,"score":0.4616999924182892},{"id":"https://openalex.org/C518677369","wikidata":"https://www.wikidata.org/wiki/Q202833","display_name":"Social media","level":2,"score":0.387800008058548},{"id":"https://openalex.org/C14999030","wikidata":"https://www.wikidata.org/wiki/Q16346","display_name":"Speech synthesis","level":2,"score":0.37770000100135803},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.32580000162124634},{"id":"https://openalex.org/C10090317","wikidata":"https://www.wikidata.org/wiki/Q7551030","display_name":"Social cue","level":2,"score":0.31200000643730164},{"id":"https://openalex.org/C504749915","wikidata":"https://www.wikidata.org/wiki/Q9010971","display_name":"Speech technology","level":3,"score":0.2937999963760376},{"id":"https://openalex.org/C88006597","wikidata":"https://www.wikidata.org/wiki/Q690117","display_name":"Disk formatting","level":2,"score":0.29269999265670776},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.2851000130176544},{"id":"https://openalex.org/C2776608160","wikidata":"https://www.wikidata.org/wiki/Q4785462","display_name":"Natural (archaeology)","level":2,"score":0.2727999985218048},{"id":"https://openalex.org/C542774811","wikidata":"https://www.wikidata.org/wiki/Q10880526","display_name":"Prosody","level":2,"score":0.2630000114440918},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.2581000030040741},{"id":"https://openalex.org/C89505385","wikidata":"https://www.wikidata.org/wiki/Q47146","display_name":"User interface","level":2,"score":0.25440001487731934},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.2533000111579895},{"id":"https://openalex.org/C201025465","wikidata":"https://www.wikidata.org/wiki/Q11248500","display_name":"User experience design","level":2,"score":0.2531000077724457}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/ijcnn64981.2025.11228511","is_oa":false,"landing_page_url":"https://doi.org/10.1109/ijcnn64981.2025.11228511","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 International Joint Conference on Neural Networks (IJCNN)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":33,"referenced_works":["https://openalex.org/W1971552587","https://openalex.org/W1976725440","https://openalex.org/W2008264338","https://openalex.org/W2043181832","https://openalex.org/W2475747270","https://openalex.org/W2619383789","https://openalex.org/W2889539916","https://openalex.org/W2908591608","https://openalex.org/W2963609956","https://openalex.org/W2964051877","https://openalex.org/W2970641574","https://openalex.org/W3007605881","https://openalex.org/W3028100120","https://openalex.org/W3096650361","https://openalex.org/W3096806995","https://openalex.org/W3112404983","https://openalex.org/W3133511908","https://openalex.org/W3165885516","https://openalex.org/W3181722707","https://openalex.org/W3206945595","https://openalex.org/W3210314917","https://openalex.org/W4214616627","https://openalex.org/W4372263908","https://openalex.org/W4376610680","https://openalex.org/W4382202825","https://openalex.org/W4386075767","https://openalex.org/W4392904245","https://openalex.org/W4393152865","https://openalex.org/W4398152753","https://openalex.org/W4402671522","https://openalex.org/W4402705930","https://openalex.org/W4402816831","https://openalex.org/W4411112979"],"related_works":[],"abstract_inverted_index":{"Automatic":[0],"video":[1,17,93],"commentary":[2,94],"systems":[3,21],"are":[4,32],"widely":[5],"used":[6],"on":[7,74],"multimedia":[8],"social":[9],"media":[10],"platforms":[11],"to":[12,108,149],"extract":[13],"factual":[14],"information":[15],"about":[16],"content.":[18,41],"However,":[19],"current":[20],"may":[22,117],"overlook":[23],"essential":[24],"paralinguistic":[25],"cues,":[26],"including":[27],"emotion":[28],"and":[29,66,110,115,143,166,175,195],"attitude,":[30],"which":[31],"critical":[33],"for":[34],"fully":[35],"conveying":[36],"the":[37,56,68,80,151,182,187],"meaning":[38],"of":[39,44,71,82,90,179,189],"visual":[40],"The":[42],"absence":[43],"these":[45,64,75],"cues":[46,65,136],"can":[47,106],"limit":[48],"user":[49,172],"understanding":[50,89,174],"or,":[51],"in":[52,86,92,155,192],"some":[53],"cases,":[54],"distort":[55,118],"video\u2019s":[57],"original":[58],"intent.":[59],"Expressive":[60],"speech":[61,85,105,114,130,153],"effectively":[62],"conveys":[63],"enhances":[67,164],"user\u2019s":[69],"comprehension":[70],"videos.":[72],"Building":[73],"insights,":[76],"this":[77],"paper":[78],"explores":[79],"usage":[81],"vision-context-aware":[83,129],"expressive":[84,152],"enhancing":[87],"users\u2019":[88],"videos":[91],"systems<sup":[95],"xmlns:mml=\"http://www.w3.org/1998/Math/MathML\"":[96],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">1</sup>.":[97],"Firstly,":[98],"our":[99,190],"formatting":[100],"study":[101],"indicates":[102],"that":[103,162],"semantic-only":[104],"lead":[107],"ambiguity,":[109],"misaligned":[111],"emotions":[112],"between":[113],"visuals":[116,138],"content":[119],"interpretation.":[120],"To":[121],"address":[122],"this,":[123],"we":[124,185],"propose":[125],"a":[126,140,145],"method":[127,191],"called":[128],"synthesis":[131],"(V-CASS).":[132],"It":[133],"analyzes":[134],"para-linguistic":[135],"from":[137],"using":[139],"vision-language":[141],"model":[142,148,154],"leverages":[144],"knowledge-infused":[146],"language":[147],"guide":[150],"generating":[156],"context-aligned":[157],"speech.":[158],"User":[159],"studies":[160],"show":[161],"V-CASS":[163],"emotional":[165],"attitudinal":[167],"resonance,":[168],"as":[169,171],"well":[170],"audio-visual":[173],"engagement,":[176],"with":[177],"74.68%":[178],"participants":[180],"preferring":[181],"system.":[183],"Finally,":[184],"explore":[186],"potential":[188],"helping":[193],"blind":[194],"low-vision":[196],"users":[197],"navigate":[198],"web":[199],"videos,":[200],"improving":[201],"universal":[202],"accessibility.":[203]},"counts_by_year":[],"updated_date":"2026-03-07T16:01:11.037858","created_date":"2025-11-14T00:00:00"}
