{"id":"https://openalex.org/W4408353895","doi":"https://doi.org/10.1109/icassp49660.2025.10890285","title":"From Pixels to Voice: A Simple and Efficient End-to-End Spoken Image Description Approach via Vision Codec Language Models","display_name":"From Pixels to Voice: A Simple and Efficient End-to-End Spoken Image Description Approach via Vision Codec Language Models","publication_year":2025,"publication_date":"2025-03-12","ids":{"openalex":"https://openalex.org/W4408353895","doi":"https://doi.org/10.1109/icassp49660.2025.10890285"},"language":"en","primary_location":{"id":"doi:10.1109/icassp49660.2025.10890285","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp49660.2025.10890285","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2025 - 2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5104156090","display_name":"Chung Tran","orcid":"https://orcid.org/0000-0003-1268-3630"},"institutions":[{"id":"https://openalex.org/I75917431","display_name":"Nara Institute of Science and Technology","ror":"https://ror.org/05bhada84","country_code":"JP","type":"education","lineage":["https://openalex.org/I75917431"]}],"countries":["JP"],"is_corresponding":true,"raw_author_name":"Chung Tran","raw_affiliation_strings":["Nara Institute of Science and Technology,Graduate School of Science and Technology,Ikoma,Japan"],"affiliations":[{"raw_affiliation_string":"Nara Institute of Science and Technology,Graduate School of Science and Technology,Ikoma,Japan","institution_ids":["https://openalex.org/I75917431"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5040108974","display_name":"Sakriani Sakti","orcid":"https://orcid.org/0000-0001-5509-8963"},"institutions":[{"id":"https://openalex.org/I75917431","display_name":"Nara Institute of Science and Technology","ror":"https://ror.org/05bhada84","country_code":"JP","type":"education","lineage":["https://openalex.org/I75917431"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Sakriani Sakti","raw_affiliation_strings":["Nara Institute of Science and Technology,Graduate School of Science and Technology,Ikoma,Japan"],"affiliations":[{"raw_affiliation_string":"Nara Institute of Science and Technology,Graduate School of Science and Technology,Ikoma,Japan","institution_ids":["https://openalex.org/I75917431"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":2,"corresponding_author_ids":["https://openalex.org/A5104156090"],"corresponding_institution_ids":["https://openalex.org/I75917431"],"apc_list":null,"apc_paid":null,"fwci":1.319,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.77969118,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":97,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"5"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9878000020980835,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9878000020980835,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.9825999736785889,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10824","display_name":"Image Retrieval and Classification Techniques","score":0.9614999890327454,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8027268648147583},{"id":"https://openalex.org/keywords/codec","display_name":"Codec","score":0.7140249609947205},{"id":"https://openalex.org/keywords/end-to-end-principle","display_name":"End-to-end principle","score":0.6676720380783081},{"id":"https://openalex.org/keywords/pixel","display_name":"Pixel","score":0.5655105710029602},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.5415762662887573},{"id":"https://openalex.org/keywords/image","display_name":"Image (mathematics)","score":0.5268558263778687},{"id":"https://openalex.org/keywords/simple","display_name":"Simple (philosophy)","score":0.4993405342102051},{"id":"https://openalex.org/keywords/computer-vision","display_name":"Computer vision","score":0.4938521981239319},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.46302762627601624},{"id":"https://openalex.org/keywords/spoken-language","display_name":"Spoken language","score":0.43180227279663086},{"id":"https://openalex.org/keywords/computer-hardware","display_name":"Computer hardware","score":0.1315082311630249}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8027268648147583},{"id":"https://openalex.org/C161765866","wikidata":"https://www.wikidata.org/wiki/Q184748","display_name":"Codec","level":2,"score":0.7140249609947205},{"id":"https://openalex.org/C74296488","wikidata":"https://www.wikidata.org/wiki/Q2527392","display_name":"End-to-end principle","level":2,"score":0.6676720380783081},{"id":"https://openalex.org/C160633673","wikidata":"https://www.wikidata.org/wiki/Q355198","display_name":"Pixel","level":2,"score":0.5655105710029602},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5415762662887573},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.5268558263778687},{"id":"https://openalex.org/C2780586882","wikidata":"https://www.wikidata.org/wiki/Q7520643","display_name":"Simple (philosophy)","level":2,"score":0.4993405342102051},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.4938521981239319},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.46302762627601624},{"id":"https://openalex.org/C2776230583","wikidata":"https://www.wikidata.org/wiki/Q1322198","display_name":"Spoken language","level":2,"score":0.43180227279663086},{"id":"https://openalex.org/C9390403","wikidata":"https://www.wikidata.org/wiki/Q3966","display_name":"Computer hardware","level":1,"score":0.1315082311630249},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C111472728","wikidata":"https://www.wikidata.org/wiki/Q9471","display_name":"Epistemology","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icassp49660.2025.10890285","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp49660.2025.10890285","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2025 - 2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"score":0.6000000238418579,"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":25,"referenced_works":["https://openalex.org/W639708223","https://openalex.org/W1956340063","https://openalex.org/W2101105183","https://openalex.org/W2154652894","https://openalex.org/W2752796333","https://openalex.org/W2962862718","https://openalex.org/W2964243274","https://openalex.org/W3096216486","https://openalex.org/W3155217823","https://openalex.org/W3161053832","https://openalex.org/W3174311593","https://openalex.org/W4307323391","https://openalex.org/W4385245566","https://openalex.org/W4392904292","https://openalex.org/W6610006008","https://openalex.org/W6678262379","https://openalex.org/W6757817989","https://openalex.org/W6770596778","https://openalex.org/W6790019176","https://openalex.org/W6811013733","https://openalex.org/W6848735303","https://openalex.org/W6850334629","https://openalex.org/W6853096648","https://openalex.org/W6853611000","https://openalex.org/W6857968694"],"related_works":["https://openalex.org/W2151749779","https://openalex.org/W2964213236","https://openalex.org/W2163719598","https://openalex.org/W3161919736","https://openalex.org/W2387018512","https://openalex.org/W2107680156","https://openalex.org/W3179968364","https://openalex.org/W4301184752","https://openalex.org/W2288771647","https://openalex.org/W4404782863"],"abstract_inverted_index":{"Neural":[0],"audio":[1,9,34,93],"codecs":[2,94],"provide":[3],"a":[4,26,81,125],"powerful":[5],"tool":[6],"for":[7,66,137],"compressing":[8],"signals":[10],"into":[11],"discrete":[12,17],"codec":[13],"representations.":[14],"This":[15],"compact":[16],"representation":[18],"has":[19],"made":[20],"it":[21],"possible":[22],"to":[23,32,95,113,123],"successfully":[24],"apply":[25],"natural":[27],"language":[28],"processing":[29,37,67],"(NLP)":[30],"model":[31],"various":[33],"and":[35,45,62,83],"speech":[36,129,146],"tasks,":[38],"including":[39],"text-to-speech":[40],"(e.g.,":[41,49],"VALL-E,":[42],"VALL-E":[43],"X)":[44],"multimodal":[46],"audio-text":[47],"generation":[48],"LauraGPT,":[50],"VioLA).":[51],"While":[52],"these":[53],"models":[54,90],"excel":[55],"at":[56],"handling":[57],"sequential":[58],"data":[59],"like":[60],"text":[61],"speech,":[63],"their":[64],"potential":[65],"non-sequential":[68],"data,":[69],"such":[70,144],"as":[71,145],"images,":[72],"remains":[73],"unexplored.":[74],"In":[75],"this":[76,118],"paper,":[77],"we":[78],"introduce":[79],"PixVoxLM,":[80],"simple":[82],"efficient":[84],"end-to-end":[85],"framework":[86],"that":[87,107],"combines":[88],"vision-language":[89],"with":[91],"neural":[92],"tackle":[96],"the":[97,103,121,135],"Image-to-Speech":[98],"(I2S)":[99],"problem.":[100],"Experiments":[101],"on":[102],"Flickr8k":[104],"dataset":[105],"demonstrate":[106],"PixVoxLM":[108],"delivers":[109],"promising":[110],"results":[111],"compared":[112],"existing":[114],"I2S":[115,132],"methods.":[116],"Furthermore,":[117],"research":[119],"is":[120],"first":[122],"explore":[124],"new":[126,138],"capability:":[127],"visual-guided":[128],"completion":[130],"in":[131,141],"model,":[133],"paving":[134],"way":[136],"practical":[139],"applications":[140],"everyday":[142],"communication,":[143],"prompt-based":[147],"instruction.":[148]},"counts_by_year":[{"year":2026,"cited_by_count":1}],"updated_date":"2026-02-25T08:12:03.925757","created_date":"2025-10-10T00:00:00"}
