{"id":"https://openalex.org/W4387969422","doi":"https://doi.org/10.1145/3581783.3613820","title":"BLAT: Bootstrapping Language-Audio Pre-training based on AudioSet Tag-guided Synthetic Data","display_name":"BLAT: Bootstrapping Language-Audio Pre-training based on AudioSet Tag-guided Synthetic Data","publication_year":2023,"publication_date":"2023-10-26","ids":{"openalex":"https://openalex.org/W4387969422","doi":"https://doi.org/10.1145/3581783.3613820"},"language":"en","primary_location":{"id":"doi:10.1145/3581783.3613820","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3581783.3613820","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 31st ACM International Conference on Multimedia","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5025827045","display_name":"Xuenan Xu","orcid":"https://orcid.org/0000-0001-8718-1278"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Xuenan Xu","raw_affiliation_strings":["Shanghai Jiao Tong University, Shanghai, China"],"affiliations":[{"raw_affiliation_string":"Shanghai Jiao Tong University, Shanghai, China","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5036931883","display_name":"Zhiling Zhang","orcid":"https://orcid.org/0000-0002-8081-704X"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhiling Zhang","raw_affiliation_strings":["Shanghai Jiao Tong University, Shanghai, China"],"affiliations":[{"raw_affiliation_string":"Shanghai Jiao Tong University, Shanghai, China","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101400743","display_name":"Zelin Zhou","orcid":"https://orcid.org/0009-0002-0624-6266"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zelin Zhou","raw_affiliation_strings":["Shanghai Jiao Tong University, Shanghai, China"],"affiliations":[{"raw_affiliation_string":"Shanghai Jiao Tong University, Shanghai, China","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5046257318","display_name":"Pingyue Zhang","orcid":"https://orcid.org/0000-0002-5884-632X"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Pingyue Zhang","raw_affiliation_strings":["Shanghai Jiao Tong University, Shanghai, China"],"affiliations":[{"raw_affiliation_string":"Shanghai Jiao Tong University, Shanghai, China","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5000223918","display_name":"Zeyu Xie","orcid":"https://orcid.org/0009-0001-9546-3301"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zeyu Xie","raw_affiliation_strings":["Shanghai Jiao Tong University, Shanghai, China"],"affiliations":[{"raw_affiliation_string":"Shanghai Jiao Tong University, Shanghai, China","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5081865665","display_name":"Mengyue Wu","orcid":"https://orcid.org/0000-0002-5599-8707"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Mengyue Wu","raw_affiliation_strings":["Shanghai Jiao Tong University, Shanghai, China"],"affiliations":[{"raw_affiliation_string":"Shanghai Jiao Tong University, Shanghai, China","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5101906756","display_name":"Kenny Q. Zhu","orcid":"https://orcid.org/0000-0003-3782-3230"},"institutions":[{"id":"https://openalex.org/I189196454","display_name":"The University of Texas at Arlington","ror":"https://ror.org/019kgqr73","country_code":"US","type":"education","lineage":["https://openalex.org/I189196454"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Kenny Q. Zhu","raw_affiliation_strings":["University of Texas at Arlington, Arlington, TX, USA"],"affiliations":[{"raw_affiliation_string":"University of Texas at Arlington, Arlington, TX, USA","institution_ids":["https://openalex.org/I189196454"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5025827045"],"corresponding_institution_ids":["https://openalex.org/I183067930"],"apc_list":null,"apc_paid":null,"fwci":1.1919,"has_fulltext":false,"cited_by_count":10,"citation_normalized_percentile":{"value":0.81532278,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":90,"max":98},"biblio":{"volume":null,"issue":null,"first_page":"2756","last_page":"2764"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9995999932289124,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9995999932289124,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9983999729156494,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11439","display_name":"Video Analysis and Summarization","score":0.9958000183105469,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8549700379371643},{"id":"https://openalex.org/keywords/closed-captioning","display_name":"Closed captioning","score":0.6059088110923767},{"id":"https://openalex.org/keywords/modality","display_name":"Modality (human\u2013computer interaction)","score":0.5758668184280396},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.5504910349845886},{"id":"https://openalex.org/keywords/encoder","display_name":"Encoder","score":0.543258011341095},{"id":"https://openalex.org/keywords/audio-visual","display_name":"Audio visual","score":0.5201807022094727},{"id":"https://openalex.org/keywords/noise","display_name":"Noise (video)","score":0.47002625465393066},{"id":"https://openalex.org/keywords/bootstrapping","display_name":"Bootstrapping (finance)","score":0.43874385952949524},{"id":"https://openalex.org/keywords/code","display_name":"Code (set theory)","score":0.43172597885131836},{"id":"https://openalex.org/keywords/audio-mining","display_name":"Audio mining","score":0.4208253026008606},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.41185683012008667},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.358803391456604},{"id":"https://openalex.org/keywords/multimedia","display_name":"Multimedia","score":0.24608278274536133},{"id":"https://openalex.org/keywords/acoustic-model","display_name":"Acoustic model","score":0.22481641173362732},{"id":"https://openalex.org/keywords/speech-processing","display_name":"Speech processing","score":0.122292160987854},{"id":"https://openalex.org/keywords/image","display_name":"Image (mathematics)","score":0.09185615181922913}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8549700379371643},{"id":"https://openalex.org/C157657479","wikidata":"https://www.wikidata.org/wiki/Q2367247","display_name":"Closed captioning","level":3,"score":0.6059088110923767},{"id":"https://openalex.org/C2780226545","wikidata":"https://www.wikidata.org/wiki/Q6888030","display_name":"Modality (human\u2013computer interaction)","level":2,"score":0.5758668184280396},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5504910349845886},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.543258011341095},{"id":"https://openalex.org/C3017588708","wikidata":"https://www.wikidata.org/wiki/Q758901","display_name":"Audio visual","level":2,"score":0.5201807022094727},{"id":"https://openalex.org/C99498987","wikidata":"https://www.wikidata.org/wiki/Q2210247","display_name":"Noise (video)","level":3,"score":0.47002625465393066},{"id":"https://openalex.org/C207609745","wikidata":"https://www.wikidata.org/wiki/Q4944086","display_name":"Bootstrapping (finance)","level":2,"score":0.43874385952949524},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.43172597885131836},{"id":"https://openalex.org/C157968479","wikidata":"https://www.wikidata.org/wiki/Q3079876","display_name":"Audio mining","level":4,"score":0.4208253026008606},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.41185683012008667},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.358803391456604},{"id":"https://openalex.org/C49774154","wikidata":"https://www.wikidata.org/wiki/Q131765","display_name":"Multimedia","level":1,"score":0.24608278274536133},{"id":"https://openalex.org/C155635449","wikidata":"https://www.wikidata.org/wiki/Q4674699","display_name":"Acoustic model","level":3,"score":0.22481641173362732},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.122292160987854},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.09185615181922913},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.0},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.0},{"id":"https://openalex.org/C162324750","wikidata":"https://www.wikidata.org/wiki/Q8134","display_name":"Economics","level":0,"score":0.0},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0},{"id":"https://openalex.org/C106159729","wikidata":"https://www.wikidata.org/wiki/Q2294553","display_name":"Financial economics","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3581783.3613820","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3581783.3613820","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 31st ACM International Conference on Multimedia","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Quality Education","score":0.7699999809265137,"id":"https://metadata.un.org/sdg/4"}],"awards":[{"id":"https://openalex.org/G1121271761","display_name":null,"funder_award_id":"Program","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G2087396116","display_name":null,"funder_award_id":"China","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G3085993365","display_name":null,"funder_award_id":"(Grant No.","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G3317480652","display_name":null,"funder_award_id":"Science","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G3472539505","display_name":null,"funder_award_id":"202205","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G391238517","display_name":null,"funder_award_id":", and","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G5994120800","display_name":null,"funder_award_id":"Natural","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G7726157001","display_name":null,"funder_award_id":"Grant No.","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G7857932966","display_name":null,"funder_award_id":"92048205","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"}],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"},{"id":"https://openalex.org/F4320321605","display_name":"Government of Jiangsu Province","ror":"https://ror.org/004svx814"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":37,"referenced_works":["https://openalex.org/W1650531274","https://openalex.org/W1933349210","https://openalex.org/W2033875152","https://openalex.org/W2277195237","https://openalex.org/W2593116425","https://openalex.org/W2886641317","https://openalex.org/W2963115613","https://openalex.org/W2964213897","https://openalex.org/W2984008963","https://openalex.org/W2994728585","https://openalex.org/W3015371781","https://openalex.org/W3015591594","https://openalex.org/W3036601975","https://openalex.org/W3037309139","https://openalex.org/W3094550259","https://openalex.org/W3154596443","https://openalex.org/W3160577380","https://openalex.org/W3162391496","https://openalex.org/W3163937874","https://openalex.org/W3164279099","https://openalex.org/W3169320628","https://openalex.org/W3176445421","https://openalex.org/W3196974791","https://openalex.org/W3198452188","https://openalex.org/W3201143670","https://openalex.org/W3204363391","https://openalex.org/W3205475937","https://openalex.org/W3205743929","https://openalex.org/W3205860970","https://openalex.org/W3209984917","https://openalex.org/W4205689591","https://openalex.org/W4226109839","https://openalex.org/W4284898017","https://openalex.org/W4287119707","https://openalex.org/W4372266552","https://openalex.org/W6600002382","https://openalex.org/W6783462664"],"related_works":["https://openalex.org/W4210416330","https://openalex.org/W3088136942","https://openalex.org/W2949362007","https://openalex.org/W2775506363","https://openalex.org/W4290852288","https://openalex.org/W4388893791","https://openalex.org/W4283207562","https://openalex.org/W2963177403","https://openalex.org/W2330246314","https://openalex.org/W2949522393"],"abstract_inverted_index":{"Compared":[0],"with":[1],"ample":[2],"visual-text":[3],"pre-training":[4],"research,":[5],"few":[6],"works":[7],"explore":[8],"audio-text":[9,19,32,96,103,136],"pre-training,":[10,33],"mostly":[11],"due":[12],"to":[13,44,48,82],"the":[14,25,55,58,76,87,108,111,157],"lack":[15],"of":[16,57,78,110,117,135,159],"sufficient":[17],"parallel":[18,95],"data.":[20,162],"Most":[21],"existing":[22],"methods":[23],"incorporate":[24],"visual":[26,59],"modality":[27,60,66],"as":[28,129,131,168],"a":[29,115],"pivot":[30],"for":[31],"which":[34,100],"inevitably":[35],"induces":[36],"data":[37,181,192],"noise.":[38],"In":[39],"this":[40],"paper,":[41],"we":[42,71,91,101],"propose":[43,72],"utilize":[45],"audio":[46,125,164],"captioning":[47],"generate":[49],"text":[50,140],"directly":[51],"from":[52,65],"audio,":[53],"without":[54],"aid":[56],"so":[61],"that":[62,145],"potential":[63],"noise":[64],"mismatch":[67],"is":[68],"eliminated.":[69],"Furthermore,":[70],"caption":[73],"generation":[74],"under":[75],"guidance":[77],"AudioSet":[79],"tags,":[80],"leading":[81],"more":[83],"accurate":[84],"captions.":[85],"With":[86],"above":[88],"two":[89],"improvements,":[90],"curate":[92],"high-quality,":[93],"large-scale":[94],"data,":[97],"based":[98],"on":[99,114,153,177],"perform":[102],"pre-training.":[104],"We":[105],"comprehensively":[106],"demonstrate":[107],"performance":[109,152],"pre-trained":[112,183],"model":[113,173],"series":[116],"downstream":[118],"audio-related":[119,178],"tasks,":[120],"including":[121],"single-modality":[122],"tasks":[123,133],"like":[124],"classification":[126,151],"and":[127,138,182,191,197],"tagging,":[128],"well":[130],"cross-modal":[132],"consisting":[134],"retrieval":[137],"audio-based":[139],"generation.":[141],"Experimental":[142],"results":[143],"indicate":[144],"our":[146,160],"approach":[147],"achieves":[148],"state-of-the-art":[149],"zero-shot":[150],"most":[154],"datasets,":[155],"suggesting":[156],"effectiveness":[158],"synthetic":[161],"The":[163,188],"encoder":[165],"also":[166],"serves":[167],"an":[169],"efficient":[170],"pattern":[171],"recognition":[172],"by":[174],"fine-tuning":[175],"it":[176],"tasks.":[179],"Synthetic":[180],"models":[184],"are":[185,193],"available":[186,194],"online1":[187],"code,":[189],"checkpoints":[190],"at":[195],"https://github.com/wsntxxn/BLAT":[196],"https://zenodo.org/record/8218696/.":[198]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":2},{"year":2024,"cited_by_count":6},{"year":2023,"cited_by_count":1}],"updated_date":"2026-04-13T07:58:08.660418","created_date":"2025-10-10T00:00:00"}
