{"id":"https://openalex.org/W4416798814","doi":"https://doi.org/10.1109/apsipaasc65261.2025.11249147","title":"I <sup>2</sup> TTS: Image-Indicated Immersive Text-to-Speech Synthesis with Spatial Perception","display_name":"I <sup>2</sup> TTS: Image-Indicated Immersive Text-to-Speech Synthesis with Spatial Perception","publication_year":2025,"publication_date":"2025-10-22","ids":{"openalex":"https://openalex.org/W4416798814","doi":"https://doi.org/10.1109/apsipaasc65261.2025.11249147"},"language":null,"primary_location":{"id":"doi:10.1109/apsipaasc65261.2025.11249147","is_oa":false,"landing_page_url":"https://doi.org/10.1109/apsipaasc65261.2025.11249147","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 Asia Pacific Signal and Information Processing Association Annual Summit and Conference (APSIPA ASC)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100462825","display_name":"Jiawei Zhang","orcid":"https://orcid.org/0000-0002-2292-4592"},"institutions":[{"id":"https://openalex.org/I92403157","display_name":"University of Science and Technology Beijing","ror":"https://ror.org/02egmk993","country_code":"CN","type":"education","lineage":["https://openalex.org/I92403157"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Jiawei Zhang","raw_affiliation_strings":["University of Science and Technology Beijing,Beijing,China"],"affiliations":[{"raw_affiliation_string":"University of Science and Technology Beijing,Beijing,China","institution_ids":["https://openalex.org/I92403157"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100669565","display_name":"Tianhao Zhang","orcid":"https://orcid.org/0000-0002-5939-3932"},"institutions":[{"id":"https://openalex.org/I92403157","display_name":"University of Science and Technology Beijing","ror":"https://ror.org/02egmk993","country_code":"CN","type":"education","lineage":["https://openalex.org/I92403157"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Tian-Hao Zhang","raw_affiliation_strings":["University of Science and Technology Beijing,Beijing,China"],"affiliations":[{"raw_affiliation_string":"University of Science and Technology Beijing,Beijing,China","institution_ids":["https://openalex.org/I92403157"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5110195480","display_name":"Jun Wang","orcid":"https://orcid.org/0000-0001-8932-6661"},"institutions":[{"id":"https://openalex.org/I2250653659","display_name":"Tencent (China)","ror":"https://ror.org/00hhjss72","country_code":"CN","type":"company","lineage":["https://openalex.org/I2250653659"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jun Wang","raw_affiliation_strings":["Tencent AI Lab,Shenzhen,China"],"affiliations":[{"raw_affiliation_string":"Tencent AI Lab,Shenzhen,China","institution_ids":["https://openalex.org/I2250653659"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Jiaran Gao","orcid":null},"institutions":[{"id":"https://openalex.org/I92403157","display_name":"University of Science and Technology Beijing","ror":"https://ror.org/02egmk993","country_code":"CN","type":"education","lineage":["https://openalex.org/I92403157"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jiaran Gao","raw_affiliation_strings":["University of Science and Technology Beijing,Beijing,China"],"affiliations":[{"raw_affiliation_string":"University of Science and Technology Beijing,Beijing,China","institution_ids":["https://openalex.org/I92403157"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5026034735","display_name":"Ruijie Tao","orcid":"https://orcid.org/0000-0003-0021-5661"},"institutions":[{"id":"https://openalex.org/I165932596","display_name":"National University of Singapore","ror":"https://ror.org/01tgyzw49","country_code":"SG","type":"education","lineage":["https://openalex.org/I165932596"]}],"countries":["SG"],"is_corresponding":false,"raw_author_name":"Ruijie Tao","raw_affiliation_strings":["National University of Singapore,Singapore"],"affiliations":[{"raw_affiliation_string":"National University of Singapore,Singapore","institution_ids":["https://openalex.org/I165932596"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5056495776","display_name":"Xinyuan Qian","orcid":"https://orcid.org/0000-0002-9511-6713"},"institutions":[{"id":"https://openalex.org/I92403157","display_name":"University of Science and Technology Beijing","ror":"https://ror.org/02egmk993","country_code":"CN","type":"education","lineage":["https://openalex.org/I92403157"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xinyuan Qian","raw_affiliation_strings":["University of Science and Technology Beijing,Beijing,China"],"affiliations":[{"raw_affiliation_string":"University of Science and Technology Beijing,Beijing,China","institution_ids":["https://openalex.org/I92403157"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5074514262","display_name":"Xu-Cheng Yin","orcid":"https://orcid.org/0000-0003-0023-0220"},"institutions":[{"id":"https://openalex.org/I92403157","display_name":"University of Science and Technology Beijing","ror":"https://ror.org/02egmk993","country_code":"CN","type":"education","lineage":["https://openalex.org/I92403157"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xu-Cheng Yin","raw_affiliation_strings":["University of Science and Technology Beijing,Beijing,China"],"affiliations":[{"raw_affiliation_string":"University of Science and Technology Beijing,Beijing,China","institution_ids":["https://openalex.org/I92403157"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5100462825"],"corresponding_institution_ids":["https://openalex.org/I92403157"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.43753519,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"771","last_page":"776"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.8363999724388123,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.8363999724388123,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11448","display_name":"Face recognition and analysis","score":0.07779999822378159,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.02199999988079071,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/encoder","display_name":"Encoder","score":0.5759000182151794},{"id":"https://openalex.org/keywords/high-fidelity","display_name":"High fidelity","score":0.5629000067710876},{"id":"https://openalex.org/keywords/virtual-reality","display_name":"Virtual reality","score":0.47600001096725464},{"id":"https://openalex.org/keywords/perception","display_name":"Perception","score":0.44769999384880066},{"id":"https://openalex.org/keywords/reverberation","display_name":"Reverberation","score":0.40450000762939453},{"id":"https://openalex.org/keywords/speech-synthesis","display_name":"Speech synthesis","score":0.4018999934196472},{"id":"https://openalex.org/keywords/intelligibility","display_name":"Intelligibility (philosophy)","score":0.35179999470710754}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.781000018119812},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5957000255584717},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.5759000182151794},{"id":"https://openalex.org/C113364801","wikidata":"https://www.wikidata.org/wiki/Q26674","display_name":"High fidelity","level":2,"score":0.5629000067710876},{"id":"https://openalex.org/C194969405","wikidata":"https://www.wikidata.org/wiki/Q170519","display_name":"Virtual reality","level":2,"score":0.47600001096725464},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.44769999384880066},{"id":"https://openalex.org/C95851461","wikidata":"https://www.wikidata.org/wiki/Q468809","display_name":"Reverberation","level":2,"score":0.40450000762939453},{"id":"https://openalex.org/C14999030","wikidata":"https://www.wikidata.org/wiki/Q16346","display_name":"Speech synthesis","level":2,"score":0.4018999934196472},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.3856000006198883},{"id":"https://openalex.org/C60048801","wikidata":"https://www.wikidata.org/wiki/Q1433889","display_name":"Intelligibility (philosophy)","level":2,"score":0.35179999470710754},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3467000126838684},{"id":"https://openalex.org/C2776459999","wikidata":"https://www.wikidata.org/wiki/Q2119376","display_name":"Fidelity","level":2,"score":0.3440000116825104},{"id":"https://openalex.org/C135641252","wikidata":"https://www.wikidata.org/wiki/Q738567","display_name":"Multimodal interaction","level":2,"score":0.3384999930858612},{"id":"https://openalex.org/C95623464","wikidata":"https://www.wikidata.org/wiki/Q1096149","display_name":"Classifier (UML)","level":2,"score":0.3174999952316284},{"id":"https://openalex.org/C504749915","wikidata":"https://www.wikidata.org/wiki/Q9010971","display_name":"Speech technology","level":3,"score":0.30790001153945923},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.28790000081062317},{"id":"https://openalex.org/C177284502","wikidata":"https://www.wikidata.org/wiki/Q1005390","display_name":"Adapter (computing)","level":2,"score":0.2653999924659729},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.26100000739097595},{"id":"https://openalex.org/C13895895","wikidata":"https://www.wikidata.org/wiki/Q3270773","display_name":"Speech coding","level":2,"score":0.2578999996185303},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.2549000084400177}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/apsipaasc65261.2025.11249147","is_oa":false,"landing_page_url":"https://doi.org/10.1109/apsipaasc65261.2025.11249147","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 Asia Pacific Signal and Information Processing Association Annual Summit and Conference (APSIPA ASC)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G1610785064","display_name":null,"funder_award_id":"L233032","funder_id":"https://openalex.org/F4320322919","funder_display_name":"Natural Science Foundation of Beijing Municipality"},{"id":"https://openalex.org/G5544929551","display_name":null,"funder_award_id":"K00120240007","funder_id":"https://openalex.org/F4320331102","funder_display_name":"Shenzhen Research Institute of Big Data"},{"id":"https://openalex.org/G6648122539","display_name":null,"funder_award_id":"62306029","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"}],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"},{"id":"https://openalex.org/F4320322919","display_name":"Natural Science Foundation of Beijing Municipality","ror":null},{"id":"https://openalex.org/F4320331102","display_name":"Shenzhen Research Institute of Big Data","ror":"https://ror.org/00z1gwf89"}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":17,"referenced_works":["https://openalex.org/W344150399","https://openalex.org/W2107860279","https://openalex.org/W2962858109","https://openalex.org/W2963609956","https://openalex.org/W2998572311","https://openalex.org/W3098557217","https://openalex.org/W3139813613","https://openalex.org/W3205925339","https://openalex.org/W4304099317","https://openalex.org/W4312779270","https://openalex.org/W4375869257","https://openalex.org/W4381786045","https://openalex.org/W4389519221","https://openalex.org/W4392902843","https://openalex.org/W4393152865","https://openalex.org/W4398152753","https://openalex.org/W4408352314"],"related_works":[],"abstract_inverted_index":{"Controlling":[0],"the":[1],"spatial":[2],"and":[3,13,21,75,105,141,150],"stylistic":[4],"characteristics":[5],"of":[6],"synthesized":[7],"speech":[8,72,125],"is":[9],"essential":[10],"for":[11,99],"immersive":[12,71],"personalized":[14],"applications":[15],"such":[16],"as":[17],"virtual":[18],"reality,":[19],"gaming,":[20],"human-computer":[22],"interaction.":[23],"While":[24],"recent":[25],"Text-to-speech":[26,57],"(TTS)":[27],"systems":[28],"have":[29],"explored":[30],"multi-modal":[31],"conditioning,":[32],"they":[33],"often":[34],"suffer":[35],"from":[36,73],"poor":[37],"reverberation":[38],"fidelity":[39],"or":[40],"degraded":[41],"audio":[42],"quality":[43],"due":[44],"to":[45,90,109],"reliance":[46],"on":[47],"external":[48],"vocoders.":[49],"In":[50],"this":[51],"paper,":[52],"we":[53],"propose":[54],"Image-indicated":[55],"Immersive":[56],"Synthesis":[58],"(<tex":[59],"xmlns:mml=\"http://www.w3.org/1998/Math/MathML\"":[60,120],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">$\\mathbf{I}^{\\mathbf{2}}$</tex>":[61],"TTS),":[62],"an":[63,87],"end-to-end":[64],"multimodal":[65],"TTS":[66,122],"framework":[67],"that":[68,135],"synthesizes":[69],"high-quality,":[70],"text":[74],"visual":[76],"scene":[77],"prompts.":[78],"Our":[79],"model":[80],"leverages":[81],"a":[82,94,106,116,129],"CLIP-based":[83],"image":[84],"encoder":[85,108],"with":[86],"adaptive":[88],"adapter":[89],"extract":[91],"scene-aware":[92],"features,":[93],"Speech":[95],"Reverberation":[96],"Classifier":[97],"(SRC)":[98],"refining":[100],"acoustic-visual":[101],"alignment":[102],"during":[103],"training,":[104],"speaker":[107,112],"enable":[110],"zero-shot":[111],"generalization.":[113],"Built":[114],"upon":[115],"VITS":[117],"backbone,":[118],"I<sup":[119],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">2</sup>":[121],"generates":[123],"reverberant":[124],"directly":[126],"without":[127],"requiring":[128],"separate":[130],"vocoder.":[131],"Experimental":[132],"results":[133],"demonstrate":[134],"our":[136],"approach":[137],"produces":[138],"spatially":[139],"accurate":[140],"natural-sounding":[142],"speech,":[143],"achieving":[144],"superior":[145],"performance":[146],"in":[147],"both":[148],"subjective":[149],"objective":[151],"evaluations.":[152],"Project":[153],"demo":[154],"page:":[155],"https://spatialTTS.github.io/":[156]},"counts_by_year":[],"updated_date":"2026-04-21T08:09:41.155169","created_date":"2025-11-28T00:00:00"}
