{"id":"https://openalex.org/W4402576566","doi":"https://doi.org/10.1145/3664647.3681145","title":"Advancing Multi-grained Alignment for Contrastive Language-Audio Pre-training","display_name":"Advancing Multi-grained Alignment for Contrastive Language-Audio Pre-training","publication_year":2024,"publication_date":"2024-10-26","ids":{"openalex":"https://openalex.org/W4402576566","doi":"https://doi.org/10.1145/3664647.3681145"},"language":"en","primary_location":{"id":"doi:10.1145/3664647.3681145","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3664647.3681145","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 32nd ACM International Conference on Multimedia","raw_type":"proceedings-article"},"type":"preprint","indexed_in":["arxiv","crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://doi.org/10.1145/3664647.3681145","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Yiming Li","orcid":"https://orcid.org/0009-0004-3397-8432"},"institutions":[{"id":"https://openalex.org/I19820366","display_name":"Chinese Academy of Sciences","ror":"https://ror.org/034t30j35","country_code":"CN","type":"government","lineage":["https://openalex.org/I19820366"]},{"id":"https://openalex.org/I4210090176","display_name":"Institute of Computing Technology","ror":"https://ror.org/0090r4d87","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210090176"]},{"id":"https://openalex.org/I4210165038","display_name":"University of Chinese Academy of Sciences","ror":"https://ror.org/05qbk4x57","country_code":"CN","type":"education","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210165038"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Yiming Li","raw_affiliation_strings":["Beijing Key Laboratory of Mobile Computing and Pervasive Device, Institute of Computing Technology, Chinese Academy of Sciences, Beijing, China","Beijing Key Laboratory of Mobile Computing and Pervasive Device, Institute of Computing Technology, Chinese Academy of Sciences Beijing, China","Beijing Key Laboratory of Mobile Computing and Pervasive Device, Institute of Computing Technology, Chinese Academy of Sciences & University of Chinese Academy of Sciences Beijing, China"],"affiliations":[{"raw_affiliation_string":"Beijing Key Laboratory of Mobile Computing and Pervasive Device, Institute of Computing Technology, Chinese Academy of Sciences, Beijing, China","institution_ids":["https://openalex.org/I4210090176","https://openalex.org/I19820366"]},{"raw_affiliation_string":"Beijing Key Laboratory of Mobile Computing and Pervasive Device, Institute of Computing Technology, Chinese Academy of Sciences Beijing, China","institution_ids":["https://openalex.org/I4210090176","https://openalex.org/I19820366"]},{"raw_affiliation_string":"Beijing Key Laboratory of Mobile Computing and Pervasive Device, Institute of Computing Technology, Chinese Academy of Sciences & University of Chinese Academy of Sciences Beijing, China","institution_ids":["https://openalex.org/I4210090176","https://openalex.org/I4210165038"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102221957","display_name":"Zhifang Guo","orcid":null},"institutions":[{"id":"https://openalex.org/I19820366","display_name":"Chinese Academy of Sciences","ror":"https://ror.org/034t30j35","country_code":"CN","type":"government","lineage":["https://openalex.org/I19820366"]},{"id":"https://openalex.org/I4210090176","display_name":"Institute of Computing Technology","ror":"https://ror.org/0090r4d87","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210090176"]},{"id":"https://openalex.org/I4210165038","display_name":"University of Chinese Academy of Sciences","ror":"https://ror.org/05qbk4x57","country_code":"CN","type":"education","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210165038"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhifang Guo","raw_affiliation_strings":["Beijing Key Laboratory of Mobile Computing and Pervasive Device, Institute of Computing Technology, Chinese Academy of Sciences, Beijing, China","Beijing Key Laboratory of Mobile Computing and Pervasive Device, Institute of Computing Technology, Chinese Academy of Sciences & University of Chinese Academy of Sciences Beijing, China","Beijing Key Laboratory of Mobile Computing and Pervasive Device, Institute of Computing Technology, Chinese Academy of Sciences Beijing, China"],"affiliations":[{"raw_affiliation_string":"Beijing Key Laboratory of Mobile Computing and Pervasive Device, Institute of Computing Technology, Chinese Academy of Sciences, Beijing, China","institution_ids":["https://openalex.org/I4210090176","https://openalex.org/I19820366"]},{"raw_affiliation_string":"Beijing Key Laboratory of Mobile Computing and Pervasive Device, Institute of Computing Technology, Chinese Academy of Sciences & University of Chinese Academy of Sciences Beijing, China","institution_ids":["https://openalex.org/I4210090176","https://openalex.org/I4210165038"]},{"raw_affiliation_string":"Beijing Key Laboratory of Mobile Computing and Pervasive Device, Institute of Computing Technology, Chinese Academy of Sciences Beijing, China","institution_ids":["https://openalex.org/I4210090176","https://openalex.org/I19820366"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100402345","display_name":"Xiangdong Wang","orcid":"https://orcid.org/0000-0002-4226-3250"},"institutions":[{"id":"https://openalex.org/I19820366","display_name":"Chinese Academy of Sciences","ror":"https://ror.org/034t30j35","country_code":"CN","type":"government","lineage":["https://openalex.org/I19820366"]},{"id":"https://openalex.org/I4210090176","display_name":"Institute of Computing Technology","ror":"https://ror.org/0090r4d87","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210090176"]},{"id":"https://openalex.org/I4210165038","display_name":"University of Chinese Academy of Sciences","ror":"https://ror.org/05qbk4x57","country_code":"CN","type":"education","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210165038"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xiangdong Wang","raw_affiliation_strings":["Beijing Key Laboratory of Mobile Computing and Pervasive Device, Institute of Computing Technology, Chinese Academy of Sciences, Beijing, China","Beijing Key Laboratory of Mobile Computing and Pervasive Device, Institute of Computing Technology, Chinese Academy of Sciences & University of Chinese Academy of Sciences Beijing, China","Beijing Key Laboratory of Mobile Computing and Pervasive Device, Institute of Computing Technology, Chinese Academy of Sciences Beijing, China"],"affiliations":[{"raw_affiliation_string":"Beijing Key Laboratory of Mobile Computing and Pervasive Device, Institute of Computing Technology, Chinese Academy of Sciences, Beijing, China","institution_ids":["https://openalex.org/I4210090176","https://openalex.org/I19820366"]},{"raw_affiliation_string":"Beijing Key Laboratory of Mobile Computing and Pervasive Device, Institute of Computing Technology, Chinese Academy of Sciences & University of Chinese Academy of Sciences Beijing, China","institution_ids":["https://openalex.org/I4210090176","https://openalex.org/I4210165038"]},{"raw_affiliation_string":"Beijing Key Laboratory of Mobile Computing and Pervasive Device, Institute of Computing Technology, Chinese Academy of Sciences Beijing, China","institution_ids":["https://openalex.org/I4210090176","https://openalex.org/I19820366"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100410374","display_name":"Hong Liu","orcid":"https://orcid.org/0000-0003-4524-495X"},"institutions":[{"id":"https://openalex.org/I19820366","display_name":"Chinese Academy of Sciences","ror":"https://ror.org/034t30j35","country_code":"CN","type":"government","lineage":["https://openalex.org/I19820366"]},{"id":"https://openalex.org/I4210090176","display_name":"Institute of Computing Technology","ror":"https://ror.org/0090r4d87","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210090176"]},{"id":"https://openalex.org/I4210165038","display_name":"University of Chinese Academy of Sciences","ror":"https://ror.org/05qbk4x57","country_code":"CN","type":"education","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210165038"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Hong Liu","raw_affiliation_strings":["Beijing Key Laboratory of Mobile Computing and Pervasive Device, Institute of Computing Technology, Chinese Academy of Sciences, Beijing, China","Beijing Key Laboratory of Mobile Computing and Pervasive Device, Institute of Computing Technology, Chinese Academy of Sciences & University of Chinese Academy of Sciences Beijing, China","Beijing Key Laboratory of Mobile Computing and Pervasive Device, Institute of Computing Technology, Chinese Academy of Sciences Beijing, China"],"affiliations":[{"raw_affiliation_string":"Beijing Key Laboratory of Mobile Computing and Pervasive Device, Institute of Computing Technology, Chinese Academy of Sciences, Beijing, China","institution_ids":["https://openalex.org/I4210090176","https://openalex.org/I19820366"]},{"raw_affiliation_string":"Beijing Key Laboratory of Mobile Computing and Pervasive Device, Institute of Computing Technology, Chinese Academy of Sciences & University of Chinese Academy of Sciences Beijing, China","institution_ids":["https://openalex.org/I4210090176","https://openalex.org/I4210165038"]},{"raw_affiliation_string":"Beijing Key Laboratory of Mobile Computing and Pervasive Device, Institute of Computing Technology, Chinese Academy of Sciences Beijing, China","institution_ids":["https://openalex.org/I4210090176","https://openalex.org/I19820366"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":[],"corresponding_institution_ids":["https://openalex.org/I19820366","https://openalex.org/I4210090176","https://openalex.org/I4210165038"],"apc_list":null,"apc_paid":null,"fwci":0.6772,"has_fulltext":true,"cited_by_count":2,"citation_normalized_percentile":{"value":0.68047682,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":95,"max":96},"biblio":{"volume":null,"issue":null,"first_page":"7356","last_page":"7365"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9983000159263611,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.991100013256073,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8462329506874084},{"id":"https://openalex.org/keywords/modality","display_name":"Modality (human\u2013computer interaction)","score":0.48699837923049927},{"id":"https://openalex.org/keywords/modal","display_name":"Modal","score":0.480101615190506},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.47842270135879517},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4623471796512604},{"id":"https://openalex.org/keywords/codebook","display_name":"Codebook","score":0.44924965500831604},{"id":"https://openalex.org/keywords/bridging","display_name":"Bridging (networking)","score":0.4485543668270111},{"id":"https://openalex.org/keywords/granularity","display_name":"Granularity","score":0.441353440284729},{"id":"https://openalex.org/keywords/sign-language","display_name":"Sign language","score":0.42828992009162903},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.37930765748023987},{"id":"https://openalex.org/keywords/linguistics","display_name":"Linguistics","score":0.10691657662391663}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8462329506874084},{"id":"https://openalex.org/C2780226545","wikidata":"https://www.wikidata.org/wiki/Q6888030","display_name":"Modality (human\u2013computer interaction)","level":2,"score":0.48699837923049927},{"id":"https://openalex.org/C71139939","wikidata":"https://www.wikidata.org/wiki/Q910194","display_name":"Modal","level":2,"score":0.480101615190506},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.47842270135879517},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4623471796512604},{"id":"https://openalex.org/C127759330","wikidata":"https://www.wikidata.org/wiki/Q637416","display_name":"Codebook","level":2,"score":0.44924965500831604},{"id":"https://openalex.org/C174348530","wikidata":"https://www.wikidata.org/wiki/Q188635","display_name":"Bridging (networking)","level":2,"score":0.4485543668270111},{"id":"https://openalex.org/C177774035","wikidata":"https://www.wikidata.org/wiki/Q1246948","display_name":"Granularity","level":2,"score":0.441353440284729},{"id":"https://openalex.org/C522192633","wikidata":"https://www.wikidata.org/wiki/Q34228","display_name":"Sign language","level":2,"score":0.42828992009162903},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.37930765748023987},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.10691657662391663},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.0},{"id":"https://openalex.org/C188027245","wikidata":"https://www.wikidata.org/wiki/Q750446","display_name":"Polymer chemistry","level":1,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1145/3664647.3681145","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3664647.3681145","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 32nd ACM International Conference on Multimedia","raw_type":"proceedings-article"},{"id":"pmh:oai:arXiv.org:2408.07919","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2408.07919","pdf_url":"https://arxiv.org/pdf/2408.07919","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"}],"best_oa_location":{"id":"doi:10.1145/3664647.3681145","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3664647.3681145","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 32nd ACM International Conference on Multimedia","raw_type":"proceedings-article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","score":0.7099999785423279,"display_name":"Quality Education"}],"awards":[{"id":"https://openalex.org/G3317480652","display_name":null,"funder_award_id":"Science","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G3744313044","display_name":null,"funder_award_id":"Social","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G5994120800","display_name":null,"funder_award_id":"Natural","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G8955107213","display_name":null,"funder_award_id":"Major","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"}],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"},{"id":"https://openalex.org/F4320335869","display_name":"National Social Science Fund of China","ror":null}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":41,"referenced_works":["https://openalex.org/W1530404542","https://openalex.org/W2038484192","https://openalex.org/W2052666245","https://openalex.org/W2354870669","https://openalex.org/W2509065397","https://openalex.org/W2593116425","https://openalex.org/W2771361008","https://openalex.org/W2781528640","https://openalex.org/W2981852735","https://openalex.org/W2999905431","https://openalex.org/W3006275583","https://openalex.org/W3015371781","https://openalex.org/W3015591594","https://openalex.org/W3017521796","https://openalex.org/W3094550259","https://openalex.org/W3098357269","https://openalex.org/W3103314642","https://openalex.org/W3162999565","https://openalex.org/W3163843406","https://openalex.org/W3176445421","https://openalex.org/W3204267711","https://openalex.org/W3206996142","https://openalex.org/W3215615641","https://openalex.org/W4205689591","https://openalex.org/W4210913346","https://openalex.org/W4224920041","https://openalex.org/W4226442948","https://openalex.org/W4284898017","https://openalex.org/W4285483774","https://openalex.org/W4288089799","https://openalex.org/W4312868939","https://openalex.org/W4372260310","https://openalex.org/W4372260330","https://openalex.org/W4372266552","https://openalex.org/W4385488967","https://openalex.org/W4386076046","https://openalex.org/W4390738640","https://openalex.org/W4390872177","https://openalex.org/W4391021645","https://openalex.org/W4392908945","https://openalex.org/W6840200333"],"related_works":["https://openalex.org/W2293149949","https://openalex.org/W2026099691","https://openalex.org/W4284672201","https://openalex.org/W2377486419","https://openalex.org/W2943202426","https://openalex.org/W2137816434","https://openalex.org/W2163679795","https://openalex.org/W2736714427","https://openalex.org/W2017956276","https://openalex.org/W2048606991"],"abstract_inverted_index":{"Recent":[0],"advances":[1],"have":[2],"been":[3],"witnessed":[4],"in":[5,16,84],"audio-language":[6,82],"joint":[7],"learning,":[8],"such":[9],"as":[10],"CLAP,":[11],"that":[12,159],"shows":[13],"much":[14],"success":[15],"multi-modal":[17,105],"understanding":[18],"tasks.":[19,70],"These":[20],"models":[21],"usually":[22],"aggregate":[23],"uni-modal":[24],"local":[25,138],"representations,":[26],"namely":[27],"frame":[28,124],"or":[29,173],"word":[30,126],"features,":[31],"into":[32],"global":[33,106],"ones,":[34],"on":[35,58,68,129,151],"which":[36,63],"the":[37,90,121,165],"contrastive":[38,86],"loss":[39,144],"is":[40,101,114,134,145],"employed":[41],"to":[42,76,103,116,136,147,177],"reach":[43],"coarse-grained":[44,69],"cross-modal":[45],"alignment.":[46,149],"However,":[47],"frame-level":[48],"correspondence":[49],"with":[50,108],"texts":[51],"may":[52,64],"be":[53],"ignored,":[54],"making":[55],"it":[56],"ill-posed":[57],"explainability":[59],"and":[60,80,92,111,125,140,155],"fine-grained":[61,81,156],"challenges":[62],"also":[65,170],"undermine":[66],"performances":[67],"In":[71],"this":[72],"work,":[73],"we":[74],"aim":[75],"improve":[77],"both":[78],"coarse-":[79,154],"alignment":[83],"large-scale":[85],"pre-training.":[87],"To":[88],"unify":[89],"granularity":[91],"latent":[93],"distribution":[94],"of":[95],"two":[96],"modalities,":[97],"a":[98,131,141],"shared":[99],"codebook":[100],"adopted":[102],"represent":[104],"features":[107],"common":[109],"bases,":[110],"each":[112],"codeword":[113],"regularized":[115],"encode":[117],"modality-shared":[118],"semantics,":[119],"bridging":[120],"gap":[122],"between":[123],"features.":[127],"Based":[128],"it,":[130],"locality-aware":[132],"block":[133],"involved":[135],"purify":[137],"patterns,":[139],"hard-negative":[142],"guided":[143],"devised":[146],"boost":[148],"Experiments":[150],"eleven":[152],"zero-shot":[153],"tasks":[157],"suggest":[158],"our":[160],"model":[161],"not":[162],"only":[163],"surpasses":[164],"baseline":[166],"CLAP":[167],"significantly":[168],"but":[169],"yields":[171],"superior":[172],"competitive":[174],"results":[175],"compared":[176],"current":[178],"SOTA":[179],"works.":[180]},"counts_by_year":[{"year":2025,"cited_by_count":2}],"updated_date":"2026-04-16T08:26:57.006410","created_date":"2025-10-10T00:00:00"}
