{"id":"https://openalex.org/W3201980333","doi":"https://doi.org/10.1145/3474085.3475637","title":"Dense Contrastive Visual-Linguistic Pretraining","display_name":"Dense Contrastive Visual-Linguistic Pretraining","publication_year":2021,"publication_date":"2021-10-17","ids":{"openalex":"https://openalex.org/W3201980333","doi":"https://doi.org/10.1145/3474085.3475637","mag":"3201980333"},"language":"en","primary_location":{"id":"doi:10.1145/3474085.3475637","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3474085.3475637","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 29th ACM International Conference on Multimedia","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5052255801","display_name":"Lei Shi","orcid":"https://orcid.org/0000-0002-5570-7818"},"institutions":[{"id":"https://openalex.org/I139759216","display_name":"Beijing University of Posts and Telecommunications","ror":"https://ror.org/04w9fbh59","country_code":"CN","type":"education","lineage":["https://openalex.org/I139759216"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Lei Shi","raw_affiliation_strings":["Beijing University of Posts and Telecommunications, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Beijing University of Posts and Telecommunications, Beijing, China","institution_ids":["https://openalex.org/I139759216"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5012633404","display_name":"Kai Shuang","orcid":"https://orcid.org/0000-0003-0917-3541"},"institutions":[{"id":"https://openalex.org/I139759216","display_name":"Beijing University of Posts and Telecommunications","ror":"https://ror.org/04w9fbh59","country_code":"CN","type":"education","lineage":["https://openalex.org/I139759216"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Kai Shuang","raw_affiliation_strings":["Beijing University of Posts and Telecommunications, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Beijing University of Posts and Telecommunications, Beijing, China","institution_ids":["https://openalex.org/I139759216"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5080973846","display_name":"Shijie Geng","orcid":null},"institutions":[{"id":"https://openalex.org/I102322142","display_name":"Rutgers, The State University of New Jersey","ror":"https://ror.org/05vt9qd57","country_code":"US","type":"education","lineage":["https://openalex.org/I102322142"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Shijie Geng","raw_affiliation_strings":["Rutgers University, Piscataway, NJ, USA"],"affiliations":[{"raw_affiliation_string":"Rutgers University, Piscataway, NJ, USA","institution_ids":["https://openalex.org/I102322142"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101986762","display_name":"Peng Gao","orcid":"https://orcid.org/0000-0002-7908-6075"},"institutions":[{"id":"https://openalex.org/I4391012619","display_name":"Shanghai Artificial Intelligence Laboratory","ror":"https://ror.org/03wkvpx79","country_code":null,"type":"facility","lineage":["https://openalex.org/I4391012619"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Peng Gao","raw_affiliation_strings":["Shanghai AI Laboratory, Shanghai, China"],"affiliations":[{"raw_affiliation_string":"Shanghai AI Laboratory, Shanghai, China","institution_ids":["https://openalex.org/I4391012619"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5017387871","display_name":"Zuohui Fu","orcid":"https://orcid.org/0000-0002-3881-7935"},"institutions":[{"id":"https://openalex.org/I102322142","display_name":"Rutgers, The State University of New Jersey","ror":"https://ror.org/05vt9qd57","country_code":"US","type":"education","lineage":["https://openalex.org/I102322142"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Zuohui Fu","raw_affiliation_strings":["Rutgers University, Piscataway, NJ, USA"],"affiliations":[{"raw_affiliation_string":"Rutgers University, Piscataway, NJ, USA","institution_ids":["https://openalex.org/I102322142"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5085818578","display_name":"Gerard de Melo","orcid":"https://orcid.org/0000-0002-2930-2059"},"institutions":[{"id":"https://openalex.org/I176453806","display_name":"University of Potsdam","ror":"https://ror.org/03bnmw459","country_code":"DE","type":"education","lineage":["https://openalex.org/I176453806"]},{"id":"https://openalex.org/I143288331","display_name":"Hasso Plattner Institute","ror":"https://ror.org/058rn5r42","country_code":"DE","type":"facility","lineage":["https://openalex.org/I143288331","https://openalex.org/I176453806"]}],"countries":["DE"],"is_corresponding":false,"raw_author_name":"Gerard de Melo","raw_affiliation_strings":["Hasso Plattner Institute, University of Potsdam, Potsdam, Germany"],"affiliations":[{"raw_affiliation_string":"Hasso Plattner Institute, University of Potsdam, Potsdam, Germany","institution_ids":["https://openalex.org/I143288331","https://openalex.org/I176453806"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5029978692","display_name":"Yunpeng Chen","orcid":"https://orcid.org/0000-0002-9830-8980"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yunpeng Chen","raw_affiliation_strings":["YITU Technology, Beijing, China"],"affiliations":[{"raw_affiliation_string":"YITU Technology, Beijing, China","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5036865453","display_name":"Sen Su","orcid":"https://orcid.org/0000-0003-4266-7527"},"institutions":[{"id":"https://openalex.org/I139759216","display_name":"Beijing University of Posts and Telecommunications","ror":"https://ror.org/04w9fbh59","country_code":"CN","type":"education","lineage":["https://openalex.org/I139759216"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Sen Su","raw_affiliation_strings":["Beijing University of Posts and Telecommunications, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Beijing University of Posts and Telecommunications, Beijing, China","institution_ids":["https://openalex.org/I139759216"]}]}],"institutions":[],"countries_distinct_count":3,"institutions_distinct_count":8,"corresponding_author_ids":["https://openalex.org/A5052255801"],"corresponding_institution_ids":["https://openalex.org/I139759216"],"apc_list":null,"apc_paid":null,"fwci":0.8646,"has_fulltext":false,"cited_by_count":11,"citation_normalized_percentile":{"value":0.75870915,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":95,"max":97},"biblio":{"volume":null,"issue":null,"first_page":"5203","last_page":"5212"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.9997000098228455,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.9987000226974487,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.752812385559082},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.666662335395813},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.5158945322036743},{"id":"https://openalex.org/keywords/feature-learning","display_name":"Feature learning","score":0.48044440150260925},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.4152790606021881},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.3380669057369232},{"id":"https://openalex.org/keywords/machine-learning","display_name":"Machine learning","score":0.32656899094581604},{"id":"https://openalex.org/keywords/linguistics","display_name":"Linguistics","score":0.18461874127388}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.752812385559082},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.666662335395813},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.5158945322036743},{"id":"https://openalex.org/C59404180","wikidata":"https://www.wikidata.org/wiki/Q17013334","display_name":"Feature learning","level":2,"score":0.48044440150260925},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.4152790606021881},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.3380669057369232},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.32656899094581604},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.18461874127388},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3474085.3475637","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3474085.3475637","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 29th ACM International Conference on Multimedia","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"score":0.5600000023841858,"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":24,"referenced_works":["https://openalex.org/W1861492603","https://openalex.org/W2108598243","https://openalex.org/W2138621090","https://openalex.org/W2163605009","https://openalex.org/W2277195237","https://openalex.org/W2321533354","https://openalex.org/W2525579820","https://openalex.org/W2543927648","https://openalex.org/W2613718673","https://openalex.org/W2741631785","https://openalex.org/W2745461083","https://openalex.org/W2798991696","https://openalex.org/W2886641317","https://openalex.org/W2913939497","https://openalex.org/W2962964995","https://openalex.org/W2963115613","https://openalex.org/W2963403868","https://openalex.org/W2963420272","https://openalex.org/W2964067226","https://openalex.org/W2964284374","https://openalex.org/W2966683369","https://openalex.org/W3004349648","https://openalex.org/W3006647218","https://openalex.org/W3021735172"],"related_works":["https://openalex.org/W2382607599","https://openalex.org/W4287995534","https://openalex.org/W2998168123","https://openalex.org/W2592385986","https://openalex.org/W2944661354","https://openalex.org/W2573334707","https://openalex.org/W2905846897","https://openalex.org/W2546942002","https://openalex.org/W2743674619","https://openalex.org/W2970216048"],"abstract_inverted_index":{"Inspired":[0],"by":[1,25],"the":[2,56,67,97,124,161],"success":[3],"of":[4,58,126,146,163],"BERT,":[5],"several":[6],"multimodal":[7,32,168],"representation":[8,169],"learning":[9,106,140,166],"approaches":[10,21],"have":[11],"been":[12,71],"proposed":[13],"that":[14,107],"jointly":[15],"represent":[16],"image":[17],"and":[18,37,44,61,79,100,117],"text.":[19],"These":[20],"achieve":[22],"superior":[23],"performance":[24],"capturing":[26],"high-level":[27],"semantic":[28,63,81],"information":[29],"from":[30,55],"large-scale":[31],"pretraining.":[33],"In":[34],"particular,":[35],"LXMERT":[36],"UNITER":[38],"adopt":[39],"visual":[40,68],"region":[41,98,104,138],"feature":[42],"regression":[43,99],"label":[45],"classification":[46,101],"as":[47],"pretext":[48],"tasks.":[49],"However,":[50],"they":[51],"tend":[52],"to":[53,122,159],"suffer":[54],"problems":[57],"noisy":[59],"labels":[60],"sparse":[62],"annotations,":[64],"based":[65],"on":[66,73,167],"features":[69],"having":[70],"pretrained":[72],"a":[74,142],"crowdsourced":[75],"dataset":[76],"with":[77,102],"limited":[78],"inconsistent":[80],"labeling.":[82],"To":[83],"overcome":[84],"these":[85],"issues,":[86],"we":[87],"propose":[88],"unbiased":[89],"Dense":[90],"Contrastive":[91],"Visual-Linguistic":[92],"Pretraining":[93],"(DCVLP),":[94],"which":[95],"replaces":[96],"cross-modality":[103,136],"contrastive":[105,131,139,165],"requires":[108],"no":[109],"annotations.":[110,149],"Two":[111],"data":[112],"augmentation":[113],"strategies":[114],"(Mask":[115],"Perturbation":[116],"Intra-Inter-Adversarial":[118],"Perturbation)":[119],"are":[120],"developed":[121],"improve":[123],"quality":[125],"negative":[127],"samples":[128],"used":[129],"in":[130,141],"learning.":[132,170],"Overall,":[133],"DCVLP":[134],"allows":[135],"dense":[137,164],"self-supervised":[143],"setting":[144],"independent":[145],"any":[147],"object":[148],"We":[150],"compare":[151],"our":[152],"method":[153],"against":[154],"prior":[155],"visual-linguistic":[156],"pretraining":[157],"frameworks":[158],"validate":[160],"superiority":[162]},"counts_by_year":[{"year":2025,"cited_by_count":2},{"year":2024,"cited_by_count":3},{"year":2023,"cited_by_count":3},{"year":2022,"cited_by_count":3}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
