{"id":"https://openalex.org/W4392909638","doi":"https://doi.org/10.1109/icassp48485.2024.10447384","title":"Enhancing Audio Generation Diversity with Visual Information","display_name":"Enhancing Audio Generation Diversity with Visual Information","publication_year":2024,"publication_date":"2024-03-18","ids":{"openalex":"https://openalex.org/W4392909638","doi":"https://doi.org/10.1109/icassp48485.2024.10447384"},"language":"en","primary_location":{"id":"doi:10.1109/icassp48485.2024.10447384","is_oa":false,"landing_page_url":"http://dx.doi.org/10.1109/icassp48485.2024.10447384","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5000223918","display_name":"Zeyu Xie","orcid":"https://orcid.org/0009-0001-9546-3301"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Zeyu Xie","raw_affiliation_strings":["Shanghai Jiao Tong University,MoE Key Lab of Artificial Intelligence X-LANCE Lab,Department of Computer Science and Engineering AI Institute,Shanghai,China","Department of Computer Science and Engineering AI Institute, MoE Key Lab of Artificial Intelligence X-LANCE Lab, Shanghai Jiao Tong University, Shanghai, China"],"affiliations":[{"raw_affiliation_string":"Shanghai Jiao Tong University,MoE Key Lab of Artificial Intelligence X-LANCE Lab,Department of Computer Science and Engineering AI Institute,Shanghai,China","institution_ids":["https://openalex.org/I183067930"]},{"raw_affiliation_string":"Department of Computer Science and Engineering AI Institute, MoE Key Lab of Artificial Intelligence X-LANCE Lab, Shanghai Jiao Tong University, Shanghai, China","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5111141215","display_name":"Baihan Li","orcid":null},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Baihan Li","raw_affiliation_strings":["Shanghai Jiao Tong University,MoE Key Lab of Artificial Intelligence X-LANCE Lab,Department of Computer Science and Engineering AI Institute,Shanghai,China","Department of Computer Science and Engineering AI Institute, MoE Key Lab of Artificial Intelligence X-LANCE Lab, Shanghai Jiao Tong University, Shanghai, China"],"affiliations":[{"raw_affiliation_string":"Shanghai Jiao Tong University,MoE Key Lab of Artificial Intelligence X-LANCE Lab,Department of Computer Science and Engineering AI Institute,Shanghai,China","institution_ids":["https://openalex.org/I183067930"]},{"raw_affiliation_string":"Department of Computer Science and Engineering AI Institute, MoE Key Lab of Artificial Intelligence X-LANCE Lab, Shanghai Jiao Tong University, Shanghai, China","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5025827045","display_name":"Xuenan Xu","orcid":"https://orcid.org/0000-0001-8718-1278"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xuenan Xu","raw_affiliation_strings":["Shanghai Jiao Tong University,MoE Key Lab of Artificial Intelligence X-LANCE Lab,Department of Computer Science and Engineering AI Institute,Shanghai,China","Department of Computer Science and Engineering AI Institute, MoE Key Lab of Artificial Intelligence X-LANCE Lab, Shanghai Jiao Tong University, Shanghai, China"],"affiliations":[{"raw_affiliation_string":"Shanghai Jiao Tong University,MoE Key Lab of Artificial Intelligence X-LANCE Lab,Department of Computer Science and Engineering AI Institute,Shanghai,China","institution_ids":["https://openalex.org/I183067930"]},{"raw_affiliation_string":"Department of Computer Science and Engineering AI Institute, MoE Key Lab of Artificial Intelligence X-LANCE Lab, Shanghai Jiao Tong University, Shanghai, China","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5081865665","display_name":"Mengyue Wu","orcid":"https://orcid.org/0000-0002-5599-8707"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Mengyue Wu","raw_affiliation_strings":["Shanghai Jiao Tong University,MoE Key Lab of Artificial Intelligence X-LANCE Lab,Department of Computer Science and Engineering AI Institute,Shanghai,China","Department of Computer Science and Engineering AI Institute, MoE Key Lab of Artificial Intelligence X-LANCE Lab, Shanghai Jiao Tong University, Shanghai, China"],"affiliations":[{"raw_affiliation_string":"Shanghai Jiao Tong University,MoE Key Lab of Artificial Intelligence X-LANCE Lab,Department of Computer Science and Engineering AI Institute,Shanghai,China","institution_ids":["https://openalex.org/I183067930"]},{"raw_affiliation_string":"Department of Computer Science and Engineering AI Institute, MoE Key Lab of Artificial Intelligence X-LANCE Lab, Shanghai Jiao Tong University, Shanghai, China","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5043098653","display_name":"Kai Yu","orcid":"https://orcid.org/0000-0002-7102-9826"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Kai Yu","raw_affiliation_strings":["Shanghai Jiao Tong University,MoE Key Lab of Artificial Intelligence X-LANCE Lab,Department of Computer Science and Engineering AI Institute,Shanghai,China","Department of Computer Science and Engineering AI Institute, MoE Key Lab of Artificial Intelligence X-LANCE Lab, Shanghai Jiao Tong University, Shanghai, China"],"affiliations":[{"raw_affiliation_string":"Shanghai Jiao Tong University,MoE Key Lab of Artificial Intelligence X-LANCE Lab,Department of Computer Science and Engineering AI Institute,Shanghai,China","institution_ids":["https://openalex.org/I183067930"]},{"raw_affiliation_string":"Department of Computer Science and Engineering AI Institute, MoE Key Lab of Artificial Intelligence X-LANCE Lab, Shanghai Jiao Tong University, Shanghai, China","institution_ids":["https://openalex.org/I183067930"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5000223918"],"corresponding_institution_ids":["https://openalex.org/I183067930"],"apc_list":null,"apc_paid":null,"fwci":0.7076,"has_fulltext":false,"cited_by_count":2,"citation_normalized_percentile":{"value":0.64401655,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":91,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"866","last_page":"870"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9997000098228455,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9997000098228455,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9955000281333923,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11349","display_name":"Music Technology and Sound Studies","score":0.9945999979972839,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/audio-visual","display_name":"Audio visual","score":0.8386228680610657},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7476315498352051},{"id":"https://openalex.org/keywords/diversity","display_name":"Diversity (politics)","score":0.6297519207000732},{"id":"https://openalex.org/keywords/focus","display_name":"Focus (optics)","score":0.561306357383728},{"id":"https://openalex.org/keywords/homogeneous","display_name":"Homogeneous","score":0.510052502155304},{"id":"https://openalex.org/keywords/multimedia","display_name":"Multimedia","score":0.4825567603111267},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.3790719509124756}],"concepts":[{"id":"https://openalex.org/C3017588708","wikidata":"https://www.wikidata.org/wiki/Q758901","display_name":"Audio visual","level":2,"score":0.8386228680610657},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7476315498352051},{"id":"https://openalex.org/C2781316041","wikidata":"https://www.wikidata.org/wiki/Q1230584","display_name":"Diversity (politics)","level":2,"score":0.6297519207000732},{"id":"https://openalex.org/C192209626","wikidata":"https://www.wikidata.org/wiki/Q190909","display_name":"Focus (optics)","level":2,"score":0.561306357383728},{"id":"https://openalex.org/C66882249","wikidata":"https://www.wikidata.org/wiki/Q169336","display_name":"Homogeneous","level":2,"score":0.510052502155304},{"id":"https://openalex.org/C49774154","wikidata":"https://www.wikidata.org/wiki/Q131765","display_name":"Multimedia","level":1,"score":0.4825567603111267},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.3790719509124756},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C97355855","wikidata":"https://www.wikidata.org/wiki/Q11473","display_name":"Thermodynamics","level":1,"score":0.0},{"id":"https://openalex.org/C144024400","wikidata":"https://www.wikidata.org/wiki/Q21201","display_name":"Sociology","level":0,"score":0.0},{"id":"https://openalex.org/C19165224","wikidata":"https://www.wikidata.org/wiki/Q23404","display_name":"Anthropology","level":1,"score":0.0},{"id":"https://openalex.org/C120665830","wikidata":"https://www.wikidata.org/wiki/Q14620","display_name":"Optics","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icassp48485.2024.10447384","is_oa":false,"landing_page_url":"http://dx.doi.org/10.1109/icassp48485.2024.10447384","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"},{"id":"https://openalex.org/F4320337504","display_name":"Research and Development","ror":"https://ror.org/027s68j25"}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":26,"referenced_works":["https://openalex.org/W2935170919","https://openalex.org/W2972478942","https://openalex.org/W3092028330","https://openalex.org/W3214281017","https://openalex.org/W4226125322","https://openalex.org/W4288099666","https://openalex.org/W4300980117","https://openalex.org/W4312048190","https://openalex.org/W4318718996","https://openalex.org/W4318752004","https://openalex.org/W4367359628","https://openalex.org/W4372340947","https://openalex.org/W4385245566","https://openalex.org/W4386601626","https://openalex.org/W4386763869","https://openalex.org/W4387969125","https://openalex.org/W4390873030","https://openalex.org/W6783867762","https://openalex.org/W6791353385","https://openalex.org/W6810940779","https://openalex.org/W6840815571","https://openalex.org/W6845479124","https://openalex.org/W6848208918","https://openalex.org/W6849109464","https://openalex.org/W6849416043","https://openalex.org/W6852019497"],"related_works":["https://openalex.org/W2271369634","https://openalex.org/W3147472394","https://openalex.org/W2047100085","https://openalex.org/W2350550760","https://openalex.org/W578794879","https://openalex.org/W2625296515","https://openalex.org/W3137890128","https://openalex.org/W1984634519","https://openalex.org/W4245955731","https://openalex.org/W2393726419"],"abstract_inverted_index":{"Audio":[0,108],"and":[1],"sound":[2],"generation":[3,41,106],"has":[4,24],"garnered":[5],"significant":[6],"attention":[7],"in":[8,85],"recent":[9],"years,":[10],"with":[11,70],"a":[12,54,75],"primary":[13],"focus":[14],"on":[15,28,94],"improving":[16,64],"the":[17,30,65,83],"quality":[18],"of":[19,32,67],"generated":[20,33,68],"audios.":[21],"However,":[22],"there":[23],"been":[25],"limited":[26],"research":[27],"enhancing":[29],"diversity":[31,66],"audio,":[34],"particularly":[35],"when":[36],"it":[37],"comes":[38],"to":[39,48,59,81],"audio":[40,51,69,88,105],"within":[42,53,90],"specific":[43],"categories.":[44],"Current":[45],"models":[46],"tend":[47],"produce":[49],"homogeneous":[50],"samples":[52,109],"category.":[55,92],"This":[56],"work":[57],"aims":[58],"address":[60],"this":[61],"limitation":[62],"by":[63],"visual":[71,79,100],"information.":[72],"We":[73],"propose":[74],"clustering-based":[76],"method,":[77],"leveraging":[78],"information":[80],"guide":[82],"model":[84],"generating":[86],"distinct":[87],"content":[89],"each":[91],"Results":[93],"seven":[95],"categories":[96],"indicate":[97],"that":[98],"extra":[99],"input":[101],"can":[102],"largely":[103],"enhance":[104],"diversity.":[107],"are":[110],"available":[111],"at":[112],"DemoWeb.":[113]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":1}],"updated_date":"2026-03-26T15:22:09.906841","created_date":"2025-10-10T00:00:00"}
