{"id":"https://openalex.org/W4403780823","doi":"https://doi.org/10.1145/3664647.3681472","title":"Auto-ACD: A Large-scale Dataset for Audio-Language Representation Learning","display_name":"Auto-ACD: A Large-scale Dataset for Audio-Language Representation Learning","publication_year":2024,"publication_date":"2024-10-26","ids":{"openalex":"https://openalex.org/W4403780823","doi":"https://doi.org/10.1145/3664647.3681472"},"language":"en","primary_location":{"id":"doi:10.1145/3664647.3681472","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3664647.3681472","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 32nd ACM International Conference on Multimedia","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5104037991","display_name":"Luoyi Sun","orcid":null},"institutions":[{"id":"https://openalex.org/I4391012619","display_name":"Shanghai Artificial Intelligence Laboratory","ror":"https://ror.org/03wkvpx79","country_code":null,"type":"facility","lineage":["https://openalex.org/I4391012619"]},{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Luoyi Sun","raw_affiliation_strings":["CMIC, Shanghai Jiao Tong University &amp; Shanghai Artificial Intelligence Laboratory, Shanghai, China"],"affiliations":[{"raw_affiliation_string":"CMIC, Shanghai Jiao Tong University &amp; Shanghai Artificial Intelligence Laboratory, Shanghai, China","institution_ids":["https://openalex.org/I183067930","https://openalex.org/I4391012619"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5025827045","display_name":"Xuenan Xu","orcid":"https://orcid.org/0000-0001-8718-1278"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xuenan Xu","raw_affiliation_strings":["X-LANCE, Shanghai Jiao Tong University, Shanghai, China"],"affiliations":[{"raw_affiliation_string":"X-LANCE, Shanghai Jiao Tong University, Shanghai, China","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5081865665","display_name":"Mengyue Wu","orcid":"https://orcid.org/0000-0002-5599-8707"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Mengyue Wu","raw_affiliation_strings":["X-LANCE, Shanghai Jiao Tong University, Shanghai, China"],"affiliations":[{"raw_affiliation_string":"X-LANCE, Shanghai Jiao Tong University, Shanghai, China","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5053549250","display_name":"Weidi Xie","orcid":"https://orcid.org/0009-0002-8609-6826"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]},{"id":"https://openalex.org/I4391012619","display_name":"Shanghai Artificial Intelligence Laboratory","ror":"https://ror.org/03wkvpx79","country_code":null,"type":"facility","lineage":["https://openalex.org/I4391012619"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Weidi Xie","raw_affiliation_strings":["CMIC, Shanghai Jiao Tong University &amp; Shanghai Artificial Intelligence Laboratory, Shanghai, China"],"affiliations":[{"raw_affiliation_string":"CMIC, Shanghai Jiao Tong University &amp; Shanghai Artificial Intelligence Laboratory, Shanghai, China","institution_ids":["https://openalex.org/I183067930","https://openalex.org/I4391012619"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5104037991"],"corresponding_institution_ids":["https://openalex.org/I183067930","https://openalex.org/I4391012619"],"apc_list":null,"apc_paid":null,"fwci":5.6255,"has_fulltext":false,"cited_by_count":15,"citation_normalized_percentile":{"value":0.96857265,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":97,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"5025","last_page":"5034"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.994700014591217,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.987500011920929,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8184079527854919},{"id":"https://openalex.org/keywords/scale","display_name":"Scale (ratio)","score":0.7095221281051636},{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.6661248803138733},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4542352557182312},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.44854941964149475},{"id":"https://openalex.org/keywords/feature-learning","display_name":"Feature learning","score":0.43389034271240234},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.3523784875869751},{"id":"https://openalex.org/keywords/geography","display_name":"Geography","score":0.049873292446136475},{"id":"https://openalex.org/keywords/cartography","display_name":"Cartography","score":0.04840201139450073}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8184079527854919},{"id":"https://openalex.org/C2778755073","wikidata":"https://www.wikidata.org/wiki/Q10858537","display_name":"Scale (ratio)","level":2,"score":0.7095221281051636},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.6661248803138733},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4542352557182312},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.44854941964149475},{"id":"https://openalex.org/C59404180","wikidata":"https://www.wikidata.org/wiki/Q17013334","display_name":"Feature learning","level":2,"score":0.43389034271240234},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.3523784875869751},{"id":"https://openalex.org/C205649164","wikidata":"https://www.wikidata.org/wiki/Q1071","display_name":"Geography","level":0,"score":0.049873292446136475},{"id":"https://openalex.org/C58640448","wikidata":"https://www.wikidata.org/wiki/Q42515","display_name":"Cartography","level":1,"score":0.04840201139450073},{"id":"https://openalex.org/C199539241","wikidata":"https://www.wikidata.org/wiki/Q7748","display_name":"Law","level":1,"score":0.0},{"id":"https://openalex.org/C17744445","wikidata":"https://www.wikidata.org/wiki/Q36442","display_name":"Political science","level":0,"score":0.0},{"id":"https://openalex.org/C94625758","wikidata":"https://www.wikidata.org/wiki/Q7163","display_name":"Politics","level":2,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3664647.3681472","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3664647.3681472","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 32nd ACM International Conference on Multimedia","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":48,"referenced_works":["https://openalex.org/W2038484192","https://openalex.org/W2108598243","https://openalex.org/W2133824856","https://openalex.org/W2150824314","https://openalex.org/W2526050071","https://openalex.org/W2593116425","https://openalex.org/W2600463316","https://openalex.org/W2619697695","https://openalex.org/W2732026016","https://openalex.org/W2750779823","https://openalex.org/W2793286954","https://openalex.org/W2949376505","https://openalex.org/W2962960500","https://openalex.org/W2984008963","https://openalex.org/W3010874390","https://openalex.org/W3015371781","https://openalex.org/W3015591594","https://openalex.org/W3037309139","https://openalex.org/W3094550259","https://openalex.org/W3103022576","https://openalex.org/W3170088426","https://openalex.org/W3176445421","https://openalex.org/W3208681643","https://openalex.org/W4210913346","https://openalex.org/W4226109839","https://openalex.org/W4226442948","https://openalex.org/W4280567182","https://openalex.org/W4284898017","https://openalex.org/W4296594718","https://openalex.org/W4306820534","https://openalex.org/W4312926266","https://openalex.org/W4312933868","https://openalex.org/W4313014461","https://openalex.org/W4313123347","https://openalex.org/W4372260241","https://openalex.org/W4372260310","https://openalex.org/W4372263379","https://openalex.org/W4382237564","https://openalex.org/W4386066097","https://openalex.org/W4386066416","https://openalex.org/W4386072368","https://openalex.org/W4386075532","https://openalex.org/W4387969422","https://openalex.org/W4387969495","https://openalex.org/W4390874338","https://openalex.org/W4392904679","https://openalex.org/W4393160420","https://openalex.org/W4394625876"],"related_works":["https://openalex.org/W2062195135","https://openalex.org/W2795079307","https://openalex.org/W2793058541","https://openalex.org/W1983629434","https://openalex.org/W2055929693","https://openalex.org/W4324271173","https://openalex.org/W1967645776","https://openalex.org/W2352227742","https://openalex.org/W4390679071","https://openalex.org/W3006966347"],"abstract_inverted_index":{"Recently,":[0],"the":[1,29,117,123,126],"AI":[2],"community":[3],"has":[4],"made":[5],"significant":[6],"strides":[7],"in":[8,28],"developing":[9],"powerful":[10],"foundation":[11],"models,":[12],"driven":[13],"by":[14,116],"large-scale":[15],"multimodal":[16,55],"datasets.":[17],"However,":[18],"for":[19,100,112,145,167],"audio":[20,43,61,98,149],"representation":[21],"learning,":[22],"existing":[23],"datasets":[24],"suffer":[25],"from":[26],"limitations":[27],"following":[30],"aspects:":[31],"insufficient":[32],"volume,":[33],"simplistic":[34],"content,":[35],"and":[36,137,163],"arduous":[37],"collection":[38],"procedures.":[39],"To":[40,121],"establish":[41,156],"an":[42,50],"dataset":[44,136],"with":[45,160],"high-quality":[46],"captions,":[47,94],"we":[48,64,104,129,155],"propose":[49],"innovative,":[51],"automatic":[52],"approach":[53],"leveraging":[54],"inputs,":[56],"such":[57],"as":[58,72],"video":[59],"frames,":[60],"streams.":[62],"Specifically,":[63],"construct":[65],"a":[66,81,109,157,165],"large-scale,":[67],"high-quality,":[68],"audio-language":[69,147],"dataset,":[70,128],"named":[71],"Auto-ACD,":[73],"comprising":[74],"over":[75],"1.5M":[76],"audio-text":[77,168],"pairs.":[78],"We":[79],"exploit":[80],"series":[82],"of":[83,125],"pre-trained":[84],"models":[85,133],"or":[86,97],"APIs,":[87],"to":[88,107],"determine":[89],"audio-visual":[90],"synchronisation,":[91],"generate":[92],"image":[93],"object":[95],"detection,":[96],"tags":[99],"specific":[101],"videos.":[102],"Subsequently,":[103],"employ":[105],"LLM":[106],"paraphrase":[108],"congruent":[110],"caption":[111],"each":[113],"audio,":[114],"guided":[115],"extracted":[118],"multi-modality":[119],"clues.":[120],"demonstrate":[122],"effectiveness":[124],"proposed":[127],"train":[130],"widely":[131],"used":[132],"on":[134,141],"our":[135],"show":[138],"performance":[139],"improvement":[140],"various":[142],"downstream":[143],"tasks,":[144],"example,":[146],"retrieval,":[148],"captioning,":[150],"zero-shot":[151],"classification.":[152],"In":[153],"addition,":[154],"novel":[158],"benchmark":[159,166],"environmental":[161],"information":[162],"provide":[164],"tasks.":[169]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":10},{"year":2024,"cited_by_count":4}],"updated_date":"2025-12-27T23:08:20.325037","created_date":"2025-10-10T00:00:00"}
