{"id":"https://openalex.org/W4416016499","doi":"https://doi.org/10.1145/3746252.3761366","title":"Hearable Image: On-Device Image-Driven Sound Effect Generation for Hearing What You See","display_name":"Hearable Image: On-Device Image-Driven Sound Effect Generation for Hearing What You See","publication_year":2025,"publication_date":"2025-11-08","ids":{"openalex":"https://openalex.org/W4416016499","doi":"https://doi.org/10.1145/3746252.3761366"},"language":null,"primary_location":{"id":"doi:10.1145/3746252.3761366","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3746252.3761366","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 34th ACM International Conference on Information and Knowledge Management","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://doi.org/10.1145/3746252.3761366","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5092636471","display_name":"Deokjun Eom","orcid":"https://orcid.org/0009-0000-6060-2554"},"institutions":[{"id":"https://openalex.org/I2250650973","display_name":"Samsung (South Korea)","ror":"https://ror.org/04w3jy968","country_code":"KR","type":"company","lineage":["https://openalex.org/I2250650973"]}],"countries":["KR"],"is_corresponding":true,"raw_author_name":"Deokjun Eom","raw_affiliation_strings":["Samsung Research, Samsung Electronics, Seoul, Republic of Korea"],"raw_orcid":"https://orcid.org/0009-0000-6060-2554","affiliations":[{"raw_affiliation_string":"Samsung Research, Samsung Electronics, Seoul, Republic of Korea","institution_ids":["https://openalex.org/I2250650973"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5084937334","display_name":"N.H. Kim","orcid":"https://orcid.org/0009-0005-5856-767X"},"institutions":[{"id":"https://openalex.org/I2250650973","display_name":"Samsung (South Korea)","ror":"https://ror.org/04w3jy968","country_code":"KR","type":"company","lineage":["https://openalex.org/I2250650973"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Nahyun Kim","raw_affiliation_strings":["Samsung Research, Samsung Electronics, Seoul, Republic of Korea"],"raw_orcid":"https://orcid.org/0009-0005-5856-767X","affiliations":[{"raw_affiliation_string":"Samsung Research, Samsung Electronics, Seoul, Republic of Korea","institution_ids":["https://openalex.org/I2250650973"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5040047837","display_name":"Woo-Hyun Nam","orcid":"https://orcid.org/0000-0001-5328-6610"},"institutions":[{"id":"https://openalex.org/I2250650973","display_name":"Samsung (South Korea)","ror":"https://ror.org/04w3jy968","country_code":"KR","type":"company","lineage":["https://openalex.org/I2250650973"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Woohyun Nam","raw_affiliation_strings":["Samsung Research, Samsung Electronics, Seoul, Republic of Korea"],"raw_orcid":"https://orcid.org/0000-0001-5328-6610","affiliations":[{"raw_affiliation_string":"Samsung Research, Samsung Electronics, Seoul, Republic of Korea","institution_ids":["https://openalex.org/I2250650973"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5104726828","display_name":"Kyung-Rae Kim","orcid":"https://orcid.org/0009-0000-6934-9698"},"institutions":[{"id":"https://openalex.org/I2250650973","display_name":"Samsung (South Korea)","ror":"https://ror.org/04w3jy968","country_code":"KR","type":"company","lineage":["https://openalex.org/I2250650973"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Kyung-Rae Kim","raw_affiliation_strings":["Samsung Research, Samsung Electronics, Seoul, Republic of Korea"],"raw_orcid":"https://orcid.org/0009-0000-6934-9698","affiliations":[{"raw_affiliation_string":"Samsung Research, Samsung Electronics, Seoul, Republic of Korea","institution_ids":["https://openalex.org/I2250650973"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5091556710","display_name":"Choong-Youl Im","orcid":"https://orcid.org/0000-0002-7326-5646"},"institutions":[{"id":"https://openalex.org/I2250650973","display_name":"Samsung (South Korea)","ror":"https://ror.org/04w3jy968","country_code":"KR","type":"company","lineage":["https://openalex.org/I2250650973"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Chaebin Im","raw_affiliation_strings":["Samsung Research, Samsung Electronics, Seoul, Republic of Korea"],"raw_orcid":"https://orcid.org/0000-0002-7326-5646","affiliations":[{"raw_affiliation_string":"Samsung Research, Samsung Electronics, Seoul, Republic of Korea","institution_ids":["https://openalex.org/I2250650973"]}]},{"author_position":"last","author":{"id":null,"display_name":"Jungwon Park","orcid":"https://orcid.org/0009-0002-3722-0659"},"institutions":[{"id":"https://openalex.org/I2250650973","display_name":"Samsung (South Korea)","ror":"https://ror.org/04w3jy968","country_code":"KR","type":"company","lineage":["https://openalex.org/I2250650973"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Jungwon Park","raw_affiliation_strings":["Samsung Research, Samsung Electronics, Seoul, Republic of Korea"],"raw_orcid":"https://orcid.org/0009-0002-3722-0659","affiliations":[{"raw_affiliation_string":"Samsung Research, Samsung Electronics, Seoul, Republic of Korea","institution_ids":["https://openalex.org/I2250650973"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5092636471"],"corresponding_institution_ids":["https://openalex.org/I2250650973"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.31698609,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"635","last_page":"644"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.5020999908447266,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.5020999908447266,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.0925000011920929,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.08590000122785568,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/sound-quality","display_name":"Sound quality","score":0.5077999830245972},{"id":"https://openalex.org/keywords/matching","display_name":"Matching (statistics)","score":0.4936000108718872},{"id":"https://openalex.org/keywords/sound","display_name":"Sound (geography)","score":0.42579999566078186},{"id":"https://openalex.org/keywords/audio-signal-processing","display_name":"Audio signal processing","score":0.4146000146865845},{"id":"https://openalex.org/keywords/pipeline","display_name":"Pipeline (software)","score":0.38920000195503235},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.3596000075340271},{"id":"https://openalex.org/keywords/noise","display_name":"Noise (video)","score":0.3582000136375427},{"id":"https://openalex.org/keywords/audio-signal","display_name":"Audio signal","score":0.35120001435279846}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7293999791145325},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5861999988555908},{"id":"https://openalex.org/C167310288","wikidata":"https://www.wikidata.org/wiki/Q7564808","display_name":"Sound quality","level":2,"score":0.5077999830245972},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.4936000108718872},{"id":"https://openalex.org/C203718221","wikidata":"https://www.wikidata.org/wiki/Q491713","display_name":"Sound (geography)","level":2,"score":0.42579999566078186},{"id":"https://openalex.org/C127220857","wikidata":"https://www.wikidata.org/wiki/Q2719318","display_name":"Audio signal processing","level":4,"score":0.4146000146865845},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.38920000195503235},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.3596000075340271},{"id":"https://openalex.org/C99498987","wikidata":"https://www.wikidata.org/wiki/Q2210247","display_name":"Noise (video)","level":3,"score":0.3582000136375427},{"id":"https://openalex.org/C64922751","wikidata":"https://www.wikidata.org/wiki/Q4650799","display_name":"Audio signal","level":3,"score":0.35120001435279846},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3314000070095062},{"id":"https://openalex.org/C128422554","wikidata":"https://www.wikidata.org/wiki/Q20077126","display_name":"Sound recording and reproduction","level":2,"score":0.3255000114440918},{"id":"https://openalex.org/C93240960","wikidata":"https://www.wikidata.org/wiki/Q217270","display_name":"Acoustic source localization","level":3,"score":0.2915000021457672},{"id":"https://openalex.org/C2778755073","wikidata":"https://www.wikidata.org/wiki/Q10858537","display_name":"Scale (ratio)","level":2,"score":0.2833000123500824},{"id":"https://openalex.org/C113364801","wikidata":"https://www.wikidata.org/wiki/Q26674","display_name":"High fidelity","level":2,"score":0.2815999984741211},{"id":"https://openalex.org/C155635449","wikidata":"https://www.wikidata.org/wiki/Q4674699","display_name":"Acoustic model","level":3,"score":0.2800999879837036},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.2784000039100647},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.2773999869823456},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.27379998564720154},{"id":"https://openalex.org/C160372630","wikidata":"https://www.wikidata.org/wiki/Q4819855","display_name":"Audio analyzer","level":5,"score":0.25099998712539673},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.25029999017715454}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3746252.3761366","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3746252.3761366","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 34th ACM International Conference on Information and Knowledge Management","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/3746252.3761366","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3746252.3761366","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 34th ACM International Conference on Information and Knowledge Management","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":18,"referenced_works":["https://openalex.org/W2108598243","https://openalex.org/W2593116425","https://openalex.org/W2972478942","https://openalex.org/W3015371781","https://openalex.org/W3094550259","https://openalex.org/W3176445421","https://openalex.org/W4312933868","https://openalex.org/W4372260261","https://openalex.org/W4386071707","https://openalex.org/W4387969125","https://openalex.org/W4392903114","https://openalex.org/W4392908114","https://openalex.org/W4393147260","https://openalex.org/W4398226295","https://openalex.org/W4403780831","https://openalex.org/W4408352247","https://openalex.org/W4413145987","https://openalex.org/W4415796728"],"related_works":[],"abstract_inverted_index":{"There":[0],"have":[1,17],"been":[2],"various":[3,57],"studies":[4],"in":[5,67,76,208],"audio":[6,23,85,117,159,184],"generation":[7,24,48,101,106,118,125,241,245],"from":[8,89,139,158],"image,":[9,193],"text,":[10],"or":[11],"video.":[12],"However,":[13],"the":[14,163,203,214,235],"existing":[15],"approaches":[16],"not":[18],"consider":[19],"on-device":[20,98,239],"environment":[21],"because":[22,50,211],"models":[25],"are":[26,64,74,87],"computationally":[27],"expensive":[28],"and":[29,69,115,131,171,178,188,221,247],"require":[30],"heavy":[31],"storage":[32],"capacity":[33],"to":[34,45,112,135,148,162,175,252],"save":[35],"large":[36,253],"number":[37,223],"of":[38,191,199,224],"weights.":[39],"In":[40,60,91],"addition,":[41],"it":[42,80],"is":[43,81,104,258],"difficult":[44],"get":[46],"stable":[47,123],"outputs":[49],"unexpected":[51],"results":[52],"may":[53],"occur":[54],"depending":[55,182],"on":[56,108,183],"model":[58,216,225],"inputs.":[59],"image-to-audio":[61,105],"generation,":[62],"there":[63],"diverse":[65,179],"images":[66],"smartphones,":[68],"too":[70],"many":[71],"visual":[72],"contexts":[73],"contained":[75],"image":[77,146],"features.":[78],"Therefore,":[79],"sometimes":[82],"unpredictable":[83,116],"which":[84],"categories":[86,198],"generated":[88],"images.":[90],"this":[92],"paper,":[93],"we":[94,120,167,194,212],"propose":[95,121,168],"a":[96,122],"robust":[97,177],"sound":[99,124,137,141,149,156,180,200,240],"effect":[100,142,150],"framework":[102,126,153,204,237],"that":[103],"based":[107],"latent":[109,229],"diffusion.":[110],"First,":[111],"avoid":[113],"unstable":[114],"results,":[119],"with":[127,217,243],"Audio":[128],"Feature":[129],"Dictionary":[130],"Audio-Image":[132],"Matching":[133],"Pipeline":[134],"generate":[136,155,176],"effects":[138,157,181],"predefined":[140],"categories.":[143,165,185],"If":[144],"an":[145,192],"matches":[147],"categories,":[151],"proposed":[152,215,236],"directly":[154],"features":[160,190],"corresponding":[161],"matched":[164],"Second,":[166],"Multi-Category":[169],"Generation":[170,172],"Flow":[173],"Map":[174],"Using":[186],"global":[187],"local":[189],"can":[195,205],"select":[196],"multiple":[197],"effects.":[201],"Third,":[202],"be":[206],"implemented":[207],"smartphone":[209],"devices":[210],"train":[213],"low":[218],"computational":[219],"cost":[220],"small":[222],"weights":[226],"under":[227],"4-step":[228],"diffusion":[230],"inference.":[231],"Various":[232],"experiments":[233],"show":[234],"solves":[238],"problem":[242],"maintaining":[244],"quality":[246],"audio-image":[248],"matching":[249],"performances":[250],"compared":[251],"scale":[254],"models.":[255],"Our":[256],"demo":[257],"available":[259],"at:":[260],"https://youtu.be/Y5HTr8wwqOA.":[261]},"counts_by_year":[],"updated_date":"2026-03-27T05:58:40.876381","created_date":"2025-11-08T00:00:00"}
