{"id":"https://openalex.org/W4392903757","doi":"https://doi.org/10.1109/icassp48485.2024.10448111","title":"DialCLIP: Empowering Clip As Multi-Modal Dialog Retriever","display_name":"DialCLIP: Empowering Clip As Multi-Modal Dialog Retriever","publication_year":2024,"publication_date":"2024-03-18","ids":{"openalex":"https://openalex.org/W4392903757","doi":"https://doi.org/10.1109/icassp48485.2024.10448111"},"language":"en","primary_location":{"id":"doi:10.1109/icassp48485.2024.10448111","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp48485.2024.10448111","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5101495496","display_name":"Zhichao Yin","orcid":"https://orcid.org/0000-0002-3458-7909"},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Zhichao Yin","raw_affiliation_strings":["University of Science and Technology of China"],"affiliations":[{"raw_affiliation_string":"University of Science and Technology of China","institution_ids":["https://openalex.org/I126520041"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5080628250","display_name":"Binyuan Hui","orcid":"https://orcid.org/0000-0002-2160-7595"},"institutions":[{"id":"https://openalex.org/I4210086143","display_name":"Alibaba Group (Cayman Islands)","ror":"https://ror.org/00mnrxf72","country_code":"KY","type":"company","lineage":["https://openalex.org/I4210086143","https://openalex.org/I45928872"]}],"countries":["KY"],"is_corresponding":false,"raw_author_name":"Binyuan Hui","raw_affiliation_strings":["DAMO Academy,Alibaba Group","Alibaba Group, DAMO Academy"],"affiliations":[{"raw_affiliation_string":"DAMO Academy,Alibaba Group","institution_ids":["https://openalex.org/I4210086143"]},{"raw_affiliation_string":"Alibaba Group, DAMO Academy","institution_ids":["https://openalex.org/I4210086143"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5054067088","display_name":"Min Yang","orcid":"https://orcid.org/0000-0003-3814-2728"},"institutions":[{"id":"https://openalex.org/I4210145761","display_name":"Shenzhen Institutes of Advanced Technology","ror":"https://ror.org/04gh4er46","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210145761"]},{"id":"https://openalex.org/I19820366","display_name":"Chinese Academy of Sciences","ror":"https://ror.org/034t30j35","country_code":"CN","type":"funder","lineage":["https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Min Yang","raw_affiliation_strings":["Shenzhen Institute of Advanced Technology,Chinese Academy of Sciences","Chinese Academy of Sciences, Shenzhen Institute of Advanced Technology"],"affiliations":[{"raw_affiliation_string":"Shenzhen Institute of Advanced Technology,Chinese Academy of Sciences","institution_ids":["https://openalex.org/I4210145761","https://openalex.org/I19820366"]},{"raw_affiliation_string":"Chinese Academy of Sciences, Shenzhen Institute of Advanced Technology","institution_ids":["https://openalex.org/I4210145761","https://openalex.org/I19820366"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100620486","display_name":"Fei Huang","orcid":"https://orcid.org/0000-0001-9665-6642"},"institutions":[{"id":"https://openalex.org/I4210086143","display_name":"Alibaba Group (Cayman Islands)","ror":"https://ror.org/00mnrxf72","country_code":"KY","type":"company","lineage":["https://openalex.org/I4210086143","https://openalex.org/I45928872"]}],"countries":["KY"],"is_corresponding":false,"raw_author_name":"Fei Huang","raw_affiliation_strings":["DAMO Academy,Alibaba Group","Alibaba Group, DAMO Academy"],"affiliations":[{"raw_affiliation_string":"DAMO Academy,Alibaba Group","institution_ids":["https://openalex.org/I4210086143"]},{"raw_affiliation_string":"Alibaba Group, DAMO Academy","institution_ids":["https://openalex.org/I4210086143"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100644428","display_name":"Yongbin Li","orcid":"https://orcid.org/0009-0008-4504-2163"},"institutions":[{"id":"https://openalex.org/I4210086143","display_name":"Alibaba Group (Cayman Islands)","ror":"https://ror.org/00mnrxf72","country_code":"KY","type":"company","lineage":["https://openalex.org/I4210086143","https://openalex.org/I45928872"]}],"countries":["KY"],"is_corresponding":false,"raw_author_name":"Yongbin Li","raw_affiliation_strings":["DAMO Academy,Alibaba Group","Alibaba Group, DAMO Academy"],"affiliations":[{"raw_affiliation_string":"DAMO Academy,Alibaba Group","institution_ids":["https://openalex.org/I4210086143"]},{"raw_affiliation_string":"Alibaba Group, DAMO Academy","institution_ids":["https://openalex.org/I4210086143"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5101495496"],"corresponding_institution_ids":["https://openalex.org/I126520041"],"apc_list":null,"apc_paid":null,"fwci":0.2446,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.45963597,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":91,"max":95},"biblio":{"volume":null,"issue":null,"first_page":"12421","last_page":"12425"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9973999857902527,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12031","display_name":"Speech and dialogue systems","score":0.9879999756813049,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8270329236984253},{"id":"https://openalex.org/keywords/dialog-box","display_name":"Dialog box","score":0.8109692931175232},{"id":"https://openalex.org/keywords/context","display_name":"Context (archaeology)","score":0.6110515594482422},{"id":"https://openalex.org/keywords/modal","display_name":"Modal","score":0.6032271981239319},{"id":"https://openalex.org/keywords/focus","display_name":"Focus (optics)","score":0.5806439518928528},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.5344656705856323},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.5324211120605469},{"id":"https://openalex.org/keywords/dialog-system","display_name":"Dialog system","score":0.5307946801185608},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.49151191115379333},{"id":"https://openalex.org/keywords/modalities","display_name":"Modalities","score":0.4798985719680786},{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.4585241675376892},{"id":"https://openalex.org/keywords/machine-learning","display_name":"Machine learning","score":0.40346840023994446},{"id":"https://openalex.org/keywords/human\u2013computer-interaction","display_name":"Human\u2013computer interaction","score":0.3408641219139099},{"id":"https://openalex.org/keywords/world-wide-web","display_name":"World Wide Web","score":0.07496494054794312}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8270329236984253},{"id":"https://openalex.org/C173853756","wikidata":"https://www.wikidata.org/wiki/Q86915","display_name":"Dialog box","level":2,"score":0.8109692931175232},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.6110515594482422},{"id":"https://openalex.org/C71139939","wikidata":"https://www.wikidata.org/wiki/Q910194","display_name":"Modal","level":2,"score":0.6032271981239319},{"id":"https://openalex.org/C192209626","wikidata":"https://www.wikidata.org/wiki/Q190909","display_name":"Focus (optics)","level":2,"score":0.5806439518928528},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.5344656705856323},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5324211120605469},{"id":"https://openalex.org/C190954187","wikidata":"https://www.wikidata.org/wiki/Q5270587","display_name":"Dialog system","level":3,"score":0.5307946801185608},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.49151191115379333},{"id":"https://openalex.org/C2779903281","wikidata":"https://www.wikidata.org/wiki/Q6888026","display_name":"Modalities","level":2,"score":0.4798985719680786},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.4585241675376892},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.40346840023994446},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.3408641219139099},{"id":"https://openalex.org/C136764020","wikidata":"https://www.wikidata.org/wiki/Q466","display_name":"World Wide Web","level":1,"score":0.07496494054794312},{"id":"https://openalex.org/C188027245","wikidata":"https://www.wikidata.org/wiki/Q750446","display_name":"Polymer chemistry","level":1,"score":0.0},{"id":"https://openalex.org/C17744445","wikidata":"https://www.wikidata.org/wiki/Q36442","display_name":"Political science","level":0,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C144024400","wikidata":"https://www.wikidata.org/wiki/Q21201","display_name":"Sociology","level":0,"score":0.0},{"id":"https://openalex.org/C205649164","wikidata":"https://www.wikidata.org/wiki/Q1071","display_name":"Geography","level":0,"score":0.0},{"id":"https://openalex.org/C86803240","wikidata":"https://www.wikidata.org/wiki/Q420","display_name":"Biology","level":0,"score":0.0},{"id":"https://openalex.org/C151730666","wikidata":"https://www.wikidata.org/wiki/Q7205","display_name":"Paleontology","level":1,"score":0.0},{"id":"https://openalex.org/C13280743","wikidata":"https://www.wikidata.org/wiki/Q131089","display_name":"Geodesy","level":1,"score":0.0},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.0},{"id":"https://openalex.org/C199539241","wikidata":"https://www.wikidata.org/wiki/Q7748","display_name":"Law","level":1,"score":0.0},{"id":"https://openalex.org/C94625758","wikidata":"https://www.wikidata.org/wiki/Q7163","display_name":"Politics","level":2,"score":0.0},{"id":"https://openalex.org/C36289849","wikidata":"https://www.wikidata.org/wiki/Q34749","display_name":"Social science","level":1,"score":0.0},{"id":"https://openalex.org/C120665830","wikidata":"https://www.wikidata.org/wiki/Q14620","display_name":"Optics","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icassp48485.2024.10448111","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp48485.2024.10448111","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G1121271761","display_name":null,"funder_award_id":"Program","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G2087396116","display_name":null,"funder_award_id":"China","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G2981938667","display_name":null,"funder_award_id":"Shenzhen","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G3317480652","display_name":null,"funder_award_id":"Science","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G3757194791","display_name":null,"funder_award_id":"JCYJ20","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G5197416087","display_name":null,"funder_award_id":"2022YFF0902100","funder_id":"https://openalex.org/F4320335777","funder_display_name":"National Key Research and Development Program of China"},{"id":"https://openalex.org/G5881942141","display_name":null,"funder_award_id":"202103","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G5939423041","display_name":null,"funder_award_id":"Technology","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G5994120800","display_name":null,"funder_award_id":"Natural","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G6163091765","display_name":null,"funder_award_id":"2020010","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G6437360502","display_name":null,"funder_award_id":"2021032","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G7174558747","display_name":null,"funder_award_id":"Group","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G7310247692","display_name":null,"funder_award_id":"62376262","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G764773819","display_name":null,"funder_award_id":"KQTD20190929172835662","funder_id":"https://openalex.org/F4320336569","funder_display_name":"Shenzhen Science and Technology Innovation Program"}],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"},{"id":"https://openalex.org/F4320335777","display_name":"National Key Research and Development Program of China","ror":null},{"id":"https://openalex.org/F4320336569","display_name":"Shenzhen Science and Technology Innovation Program","ror":null},{"id":"https://openalex.org/F4320337504","display_name":"Research and Development","ror":"https://ror.org/027s68j25"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":24,"referenced_works":["https://openalex.org/W2908510526","https://openalex.org/W3035448310","https://openalex.org/W3174119180","https://openalex.org/W3174770825","https://openalex.org/W3205638321","https://openalex.org/W3208314443","https://openalex.org/W4205119860","https://openalex.org/W4205991051","https://openalex.org/W4225323055","https://openalex.org/W4292779060","https://openalex.org/W4307079201","https://openalex.org/W4380609152","https://openalex.org/W4385571466","https://openalex.org/W4385571861","https://openalex.org/W4385574214","https://openalex.org/W4386071687","https://openalex.org/W4386187806","https://openalex.org/W6757817989","https://openalex.org/W6778883912","https://openalex.org/W6791353385","https://openalex.org/W6810334672","https://openalex.org/W6811013733","https://openalex.org/W6839015040","https://openalex.org/W6847076894"],"related_works":["https://openalex.org/W48079147","https://openalex.org/W2394821827","https://openalex.org/W326836678","https://openalex.org/W2563921006","https://openalex.org/W1963944933","https://openalex.org/W2111550420","https://openalex.org/W1977846844","https://openalex.org/W2549666521","https://openalex.org/W3133893348","https://openalex.org/W2901575119"],"abstract_inverted_index":{"Recently,":[0],"substantial":[1],"advancements":[2],"in":[3],"pre-trained":[4,30,88],"visionlanguage":[5],"models":[6,17,31],"have":[7,18],"greatly":[8],"enhanced":[9],"the":[10,28,37,46,87,99,103,162,168,181],"capabilities":[11],"of":[12,49,111,161,172,183],"multi-modal":[13,64,72,125,184],"dialog":[14,50,65,105,185],"systems.":[15],"These":[16,165],"demonstrated":[19],"significant":[20],"improvements":[21],"by":[22,156],"fine-tuning":[23],"on":[24,34,146],"downstream":[25,104],"tasks.":[26],"However,":[27],"existing":[29],"primarily":[32],"focus":[33],"effectively":[35],"capturing":[36],"alignment":[38],"between":[39],"vision":[40],"and":[41,154,170],"language":[42],"modalities,":[43],"often":[44],"ignoring":[45],"intricate":[47],"nature":[48],"context.":[51],"In":[52],"this":[53],"paper,":[54],"we":[55,93,113],"propose":[56],"a":[57,71,158],"parameter-efficient":[58],"prompt-tuning":[59],"method":[60],"named":[61],"DialCLIP":[62,142],"for":[63],"retrieval.":[66,186],"Specifically,":[67],"our":[68,173],"approach":[69],"introduces":[70],"context":[73,78],"prompt":[74,96],"generator":[75],"to":[76,97,118,124,133,179],"learn":[77,119],"features":[79],"which":[80],"are":[81],"subsequently":[82],"distilled":[83],"into":[84],"prompts":[85],"within":[86],"vision-language":[89],"model":[90],"CLIP.":[91],"Besides,":[92],"introduce":[94],"domain":[95],"mitigate":[98],"disc":[100],"repancy":[101],"from":[102,121],"data.":[106],"To":[107],"facilitate":[108],"various":[109],"types":[110],"retrieval,":[112],"also":[114],"design":[115],"multiple":[116],"experts":[117],"mappings":[120],"CLIP":[122],"outputs":[123],"representation":[126],"space,":[127],"with":[128],"each":[129],"expert":[130],"being":[131],"responsible":[132],"one":[134],"specific":[135],"retrieval":[136],"type.":[137],"Extensive":[138],"experiments":[139],"show":[140],"that":[141],"achieves":[143],"stateof-the-art":[144],"performance":[145],"two":[147],"widely":[148],"recognized":[149],"benchmark":[150],"datasets":[151],"(i.e.,":[152],"PhotoChat":[153],"MMDialog)":[155],"tuning":[157],"mere":[159],"0.04%":[160],"total":[163],"parameters.":[164],"results":[166],"highlight":[167],"efficacy":[169],"efficiency":[171],"proposed":[174],"approach,":[175],"underscoring":[176],"its":[177],"potential":[178],"advance":[180],"field":[182]},"counts_by_year":[{"year":2025,"cited_by_count":1}],"updated_date":"2026-04-10T15:06:20.359241","created_date":"2025-10-10T00:00:00"}
