{"id":"https://openalex.org/W4392903242","doi":"https://doi.org/10.1109/icassp48485.2024.10446302","title":"Multiscale Matching Driven by Cross-Modal Similarity Consistency for Audio-Text Retrieval","display_name":"Multiscale Matching Driven by Cross-Modal Similarity Consistency for Audio-Text Retrieval","publication_year":2024,"publication_date":"2024-03-18","ids":{"openalex":"https://openalex.org/W4392903242","doi":"https://doi.org/10.1109/icassp48485.2024.10446302"},"language":"en","primary_location":{"id":"doi:10.1109/icassp48485.2024.10446302","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp48485.2024.10446302","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100391126","display_name":"Qian Wang","orcid":"https://orcid.org/0000-0002-9908-4317"},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Qian Wang","raw_affiliation_strings":["University of Science and Technology of China,National Engineering Research Center of Speech and Language Information Processing,Hefei,P.R.China","National Engineering Research Center of Speech and Language Information Processing, University of Science and Technology of China, Hefei, P.R.China"],"affiliations":[{"raw_affiliation_string":"University of Science and Technology of China,National Engineering Research Center of Speech and Language Information Processing,Hefei,P.R.China","institution_ids":["https://openalex.org/I126520041"]},{"raw_affiliation_string":"National Engineering Research Center of Speech and Language Information Processing, University of Science and Technology of China, Hefei, P.R.China","institution_ids":["https://openalex.org/I126520041"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5005572782","display_name":"Jia-Chen Gu","orcid":"https://orcid.org/0000-0002-8801-1438"},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jia-Chen Gu","raw_affiliation_strings":["University of Science and Technology of China,National Engineering Research Center of Speech and Language Information Processing,Hefei,P.R.China","National Engineering Research Center of Speech and Language Information Processing, University of Science and Technology of China, Hefei, P.R.China"],"affiliations":[{"raw_affiliation_string":"University of Science and Technology of China,National Engineering Research Center of Speech and Language Information Processing,Hefei,P.R.China","institution_ids":["https://openalex.org/I126520041"]},{"raw_affiliation_string":"National Engineering Research Center of Speech and Language Information Processing, University of Science and Technology of China, Hefei, P.R.China","institution_ids":["https://openalex.org/I126520041"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5059767940","display_name":"Zhen-Hua Ling","orcid":"https://orcid.org/0000-0001-7853-5273"},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhen-Hua Ling","raw_affiliation_strings":["University of Science and Technology of China,National Engineering Research Center of Speech and Language Information Processing,Hefei,P.R.China","National Engineering Research Center of Speech and Language Information Processing, University of Science and Technology of China, Hefei, P.R.China"],"affiliations":[{"raw_affiliation_string":"University of Science and Technology of China,National Engineering Research Center of Speech and Language Information Processing,Hefei,P.R.China","institution_ids":["https://openalex.org/I126520041"]},{"raw_affiliation_string":"National Engineering Research Center of Speech and Language Information Processing, University of Science and Technology of China, Hefei, P.R.China","institution_ids":["https://openalex.org/I126520041"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5100391126"],"corresponding_institution_ids":["https://openalex.org/I126520041"],"apc_list":null,"apc_paid":null,"fwci":0.7536,"has_fulltext":false,"cited_by_count":2,"citation_normalized_percentile":{"value":0.65288858,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":95,"max":96},"biblio":{"volume":null,"issue":null,"first_page":"11581","last_page":"11585"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11439","display_name":"Video Analysis and Summarization","score":0.9901000261306763,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.98580002784729,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.777948260307312},{"id":"https://openalex.org/keywords/consistency","display_name":"Consistency (knowledge bases)","score":0.7276573181152344},{"id":"https://openalex.org/keywords/matching","display_name":"Matching (statistics)","score":0.7134659290313721},{"id":"https://openalex.org/keywords/similarity","display_name":"Similarity (geometry)","score":0.6676544547080994},{"id":"https://openalex.org/keywords/modal","display_name":"Modal","score":0.6515958309173584},{"id":"https://openalex.org/keywords/modality","display_name":"Modality (human\u2013computer interaction)","score":0.5840204954147339},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.5185645222663879},{"id":"https://openalex.org/keywords/modalities","display_name":"Modalities","score":0.5121902823448181},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.3760118782520294},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.3341570794582367},{"id":"https://openalex.org/keywords/image","display_name":"Image (mathematics)","score":0.16920161247253418},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.09367913007736206}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.777948260307312},{"id":"https://openalex.org/C2776436953","wikidata":"https://www.wikidata.org/wiki/Q5163215","display_name":"Consistency (knowledge bases)","level":2,"score":0.7276573181152344},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.7134659290313721},{"id":"https://openalex.org/C103278499","wikidata":"https://www.wikidata.org/wiki/Q254465","display_name":"Similarity (geometry)","level":3,"score":0.6676544547080994},{"id":"https://openalex.org/C71139939","wikidata":"https://www.wikidata.org/wiki/Q910194","display_name":"Modal","level":2,"score":0.6515958309173584},{"id":"https://openalex.org/C2780226545","wikidata":"https://www.wikidata.org/wiki/Q6888030","display_name":"Modality (human\u2013computer interaction)","level":2,"score":0.5840204954147339},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5185645222663879},{"id":"https://openalex.org/C2779903281","wikidata":"https://www.wikidata.org/wiki/Q6888026","display_name":"Modalities","level":2,"score":0.5121902823448181},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.3760118782520294},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.3341570794582367},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.16920161247253418},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.09367913007736206},{"id":"https://openalex.org/C144024400","wikidata":"https://www.wikidata.org/wiki/Q21201","display_name":"Sociology","level":0,"score":0.0},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.0},{"id":"https://openalex.org/C105795698","wikidata":"https://www.wikidata.org/wiki/Q12483","display_name":"Statistics","level":1,"score":0.0},{"id":"https://openalex.org/C36289849","wikidata":"https://www.wikidata.org/wiki/Q34749","display_name":"Social science","level":1,"score":0.0},{"id":"https://openalex.org/C188027245","wikidata":"https://www.wikidata.org/wiki/Q750446","display_name":"Polymer chemistry","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icassp48485.2024.10446302","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp48485.2024.10446302","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"score":0.4300000071525574,"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":29,"referenced_works":["https://openalex.org/W1522301498","https://openalex.org/W2593116425","https://openalex.org/W2896457183","https://openalex.org/W2962964995","https://openalex.org/W2970231061","https://openalex.org/W2988823324","https://openalex.org/W3015591594","https://openalex.org/W3021397474","https://openalex.org/W3176821361","https://openalex.org/W3212456749","https://openalex.org/W4210913346","https://openalex.org/W4221157007","https://openalex.org/W4224933373","https://openalex.org/W4226442948","https://openalex.org/W4372260310","https://openalex.org/W4372260330","https://openalex.org/W4372266552","https://openalex.org/W4377164418","https://openalex.org/W4384661029","https://openalex.org/W4385245566","https://openalex.org/W4385822467","https://openalex.org/W6631190155","https://openalex.org/W6682889407","https://openalex.org/W6774314701","https://openalex.org/W6791353385","https://openalex.org/W6803953248","https://openalex.org/W6840200333","https://openalex.org/W6843342451","https://openalex.org/W6852934099"],"related_works":["https://openalex.org/W73545470","https://openalex.org/W4224266612","https://openalex.org/W2383394264","https://openalex.org/W4320153225","https://openalex.org/W4293261942","https://openalex.org/W3125968744","https://openalex.org/W203959209","https://openalex.org/W2110287964","https://openalex.org/W2167701463","https://openalex.org/W4307407935"],"abstract_inverted_index":{"Audio-text":[0],"retrieval":[1],"(ATR),":[2],"which":[3],"retrieves":[4],"a":[5,32,81,101,108,113],"relevant":[6],"caption":[7],"given":[8],"an":[9],"audio":[10],"clip":[11],"(A2T)":[12,169,180],"and":[13,42,49,60,97,175],"vice":[14],"versa":[15],"(T2A),":[16],"has":[17],"recently":[18],"attracted":[19],"much":[20],"research":[21],"attention.":[22],"Existing":[23],"methods":[24,158],"typically":[25],"aggregate":[26],"information":[27,93],"from":[28,94,116],"each":[29],"modality":[30],"into":[31],"single":[33],"vector":[34],"for":[35],"matching,":[36],"but":[37],"this":[38],"sacrifices":[39],"local":[40,117],"details":[41],"can":[43],"hardly":[44],"capture":[45,122],"intricate":[46,146],"relationships":[47,90,139],"within":[48],"between":[50,73],"modalities.":[51],"Furthermore,":[52],"current":[53],"ATR":[54,83],"datasets":[55],"lack":[56],"comprehensive":[57],"alignment":[58,103],"information,":[59],"simple":[61],"binary":[62],"contrastive":[63],"learning":[64],"labels":[65],"overlook":[66],"the":[67,88,130,151,172,183],"measurement":[68],"of":[69,91,132,153,162],"fine-grained":[70,102],"semantic":[71],"differences":[72],"samples.":[74],"To":[75],"counter":[76],"these":[77],"challenges,":[78],"we":[79,128],"present":[80],"novel":[82],"framework":[84],"that":[85],"comprehensively":[86],"captures":[87],"matching":[89,111],"multimodal":[92],"different":[95],"perspectives":[96],"finer":[98],"granularities.":[99],"Specifically,":[100],"method":[104],"is":[105],"introduced,":[106],"achieving":[107],"more":[109,145],"detail-oriented":[110],"through":[112],"multiscale":[114],"process":[115],"to":[118,121,143],"global":[119],"levels":[120],"meticulous":[123],"cross-modal":[124,133],"relationships.":[125],"In":[126],"addition,":[127],"pioneer":[129],"application":[131],"similarity":[134,138],"consistency,":[135],"leveraging":[136],"intra-modal":[137],"as":[140],"soft":[141],"supervision":[142],"boost":[144],"alignment.":[147],"Extensive":[148],"experiments":[149],"validate":[150],"effectiveness":[152],"our":[154],"approach,":[155],"outperforming":[156],"previous":[157],"by":[159],"significant":[160],"margins":[161],"at":[163],"least":[164],"3.9%":[165],"(T2A)":[166,177],"/":[167,178],"6.9%":[168],"R@1":[170,181],"on":[171,182],"AudioCaps":[173],"dataset":[174],"2.9%":[176],"5.4%":[179],"Clotho":[184],"dataset.":[185]},"counts_by_year":[{"year":2025,"cited_by_count":2}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
