{"id":"https://openalex.org/W2963920257","doi":"https://doi.org/10.1145/3323873.3325035","title":"Self-Supervised Visual Representations for Cross-Modal Retrieval","display_name":"Self-Supervised Visual Representations for Cross-Modal Retrieval","publication_year":2019,"publication_date":"2019-06-05","ids":{"openalex":"https://openalex.org/W2963920257","doi":"https://doi.org/10.1145/3323873.3325035","mag":"2963920257"},"language":"en","primary_location":{"id":"doi:10.1145/3323873.3325035","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3323873.3325035","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3323873.3325035","source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2019 on International Conference on Multimedia Retrieval","raw_type":"proceedings-article"},"type":"preprint","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://dl.acm.org/doi/pdf/10.1145/3323873.3325035","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5034992443","display_name":"Yash Patel","orcid":"https://orcid.org/0000-0001-9373-529X"},"institutions":[{"id":"https://openalex.org/I74973139","display_name":"Carnegie Mellon University","ror":"https://ror.org/05x2bcf33","country_code":"US","type":"education","lineage":["https://openalex.org/I74973139"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Yash Patel","raw_affiliation_strings":["Carnegie Mellon University, Pittsburgh, PA, USA"],"affiliations":[{"raw_affiliation_string":"Carnegie Mellon University, Pittsburgh, PA, USA","institution_ids":["https://openalex.org/I74973139"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5008562740","display_name":"Llu\u00eds G\u00f3mez","orcid":"https://orcid.org/0000-0003-1408-9803"},"institutions":[{"id":"https://openalex.org/I123044942","display_name":"Universitat Aut\u00f2noma de Barcelona","ror":"https://ror.org/052g8jq94","country_code":"ES","type":"education","lineage":["https://openalex.org/I123044942"]}],"countries":["ES"],"is_corresponding":false,"raw_author_name":"Lluis Gomez","raw_affiliation_strings":["Universitat Autonoma de Barcelona, Barcelona, Spain"],"affiliations":[{"raw_affiliation_string":"Universitat Autonoma de Barcelona, Barcelona, Spain","institution_ids":["https://openalex.org/I123044942"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5026710474","display_name":"Mar\u00e7al Rusi\u00f1ol","orcid":"https://orcid.org/0000-0002-1734-2205"},"institutions":[{"id":"https://openalex.org/I123044942","display_name":"Universitat Aut\u00f2noma de Barcelona","ror":"https://ror.org/052g8jq94","country_code":"ES","type":"education","lineage":["https://openalex.org/I123044942"]}],"countries":["ES"],"is_corresponding":false,"raw_author_name":"Mar\u00e7al Rusi\u00f1ol","raw_affiliation_strings":["Universitat Autonoma de Barcelona, Barcelona, Spain"],"affiliations":[{"raw_affiliation_string":"Universitat Autonoma de Barcelona, Barcelona, Spain","institution_ids":["https://openalex.org/I123044942"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5029375723","display_name":"D\u00ecmosthenis Karatzas","orcid":"https://orcid.org/0000-0001-8762-4454"},"institutions":[{"id":"https://openalex.org/I123044942","display_name":"Universitat Aut\u00f2noma de Barcelona","ror":"https://ror.org/052g8jq94","country_code":"ES","type":"education","lineage":["https://openalex.org/I123044942"]}],"countries":["ES"],"is_corresponding":false,"raw_author_name":"Dimosthenis Karatzas","raw_affiliation_strings":["Universitat Autonoma de Barcelona, Barcelona, Spain"],"affiliations":[{"raw_affiliation_string":"Universitat Autonoma de Barcelona, Barcelona, Spain","institution_ids":["https://openalex.org/I123044942"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5053112307","display_name":"C. V. Jawahar","orcid":"https://orcid.org/0000-0001-6767-7057"},"institutions":[{"id":"https://openalex.org/I65181880","display_name":"Indian Institute of Technology Hyderabad","ror":"https://ror.org/01j4v3x97","country_code":"IN","type":"education","lineage":["https://openalex.org/I65181880"]}],"countries":["IN"],"is_corresponding":false,"raw_author_name":"C.V. Jawahar","raw_affiliation_strings":["CVIT, KCIS, IIIT, Hyderabad, India, Hyderabad, India"],"affiliations":[{"raw_affiliation_string":"CVIT, KCIS, IIIT, Hyderabad, India, Hyderabad, India","institution_ids":["https://openalex.org/I65181880"]}]}],"institutions":[],"countries_distinct_count":3,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5034992443"],"corresponding_institution_ids":["https://openalex.org/I74973139"],"apc_list":null,"apc_paid":null,"fwci":0.53442879,"has_fulltext":false,"cited_by_count":11,"citation_normalized_percentile":{"value":0.68906879,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":89,"max":97},"biblio":{"volume":null,"issue":null,"first_page":"182","last_page":"186"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.9997000098228455,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.9997000098228455,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9997000098228455,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.9811000227928162,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8464778661727905},{"id":"https://openalex.org/keywords/modal","display_name":"Modal","score":0.7241295576095581},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.690686821937561},{"id":"https://openalex.org/keywords/discriminative-model","display_name":"Discriminative model","score":0.6681574583053589},{"id":"https://openalex.org/keywords/semantics","display_name":"Semantics (computer science)","score":0.6541019678115845},{"id":"https://openalex.org/keywords/context","display_name":"Context (archaeology)","score":0.585724949836731},{"id":"https://openalex.org/keywords/set","display_name":"Set (abstract data type)","score":0.5435024499893188},{"id":"https://openalex.org/keywords/training-set","display_name":"Training set","score":0.4975419342517853},{"id":"https://openalex.org/keywords/image-retrieval","display_name":"Image retrieval","score":0.4719718396663666},{"id":"https://openalex.org/keywords/machine-learning","display_name":"Machine learning","score":0.448807030916214},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.4285297393798828},{"id":"https://openalex.org/keywords/convolutional-neural-network","display_name":"Convolutional neural network","score":0.4144606590270996},{"id":"https://openalex.org/keywords/visual-word","display_name":"Visual Word","score":0.41373205184936523},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.3969869613647461},{"id":"https://openalex.org/keywords/image","display_name":"Image (mathematics)","score":0.30566293001174927}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8464778661727905},{"id":"https://openalex.org/C71139939","wikidata":"https://www.wikidata.org/wiki/Q910194","display_name":"Modal","level":2,"score":0.7241295576095581},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.690686821937561},{"id":"https://openalex.org/C97931131","wikidata":"https://www.wikidata.org/wiki/Q5282087","display_name":"Discriminative model","level":2,"score":0.6681574583053589},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.6541019678115845},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.585724949836731},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.5435024499893188},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.4975419342517853},{"id":"https://openalex.org/C1667742","wikidata":"https://www.wikidata.org/wiki/Q10927554","display_name":"Image retrieval","level":3,"score":0.4719718396663666},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.448807030916214},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.4285297393798828},{"id":"https://openalex.org/C81363708","wikidata":"https://www.wikidata.org/wiki/Q17084460","display_name":"Convolutional neural network","level":2,"score":0.4144606590270996},{"id":"https://openalex.org/C189391414","wikidata":"https://www.wikidata.org/wiki/Q7936579","display_name":"Visual Word","level":4,"score":0.41373205184936523},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.3969869613647461},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.30566293001174927},{"id":"https://openalex.org/C188027245","wikidata":"https://www.wikidata.org/wiki/Q750446","display_name":"Polymer chemistry","level":1,"score":0.0},{"id":"https://openalex.org/C151730666","wikidata":"https://www.wikidata.org/wiki/Q7205","display_name":"Paleontology","level":1,"score":0.0},{"id":"https://openalex.org/C86803240","wikidata":"https://www.wikidata.org/wiki/Q420","display_name":"Biology","level":0,"score":0.0},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.0},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3323873.3325035","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3323873.3325035","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3323873.3325035","source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2019 on International Conference on Multimedia Retrieval","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/3323873.3325035","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3323873.3325035","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3323873.3325035","source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2019 on International Conference on Multimedia Retrieval","raw_type":"proceedings-article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/10","score":0.7300000190734863,"display_name":"Reduced inequalities"}],"awards":[],"funders":[],"has_content":{"pdf":true,"grobid_xml":true},"content_urls":{"pdf":"https://content.openalex.org/works/W2963920257.pdf","grobid_xml":"https://content.openalex.org/works/W2963920257.grobid-xml"},"referenced_works_count":42,"referenced_works":["https://openalex.org/W219040644","https://openalex.org/W343636949","https://openalex.org/W630242894","https://openalex.org/W1520997877","https://openalex.org/W1523385540","https://openalex.org/W1686810756","https://openalex.org/W1861492603","https://openalex.org/W1880262756","https://openalex.org/W1897761818","https://openalex.org/W1949478088","https://openalex.org/W1964073652","https://openalex.org/W2013535308","https://openalex.org/W2031489346","https://openalex.org/W2049993534","https://openalex.org/W2053667957","https://openalex.org/W2066852389","https://openalex.org/W2070753207","https://openalex.org/W2071207147","https://openalex.org/W2100235303","https://openalex.org/W2106277773","https://openalex.org/W2108598243","https://openalex.org/W2117539524","https://openalex.org/W2134670479","https://openalex.org/W2137225583","https://openalex.org/W2159373756","https://openalex.org/W2164530430","https://openalex.org/W2164587673","https://openalex.org/W2184188583","https://openalex.org/W2211092169","https://openalex.org/W2287334441","https://openalex.org/W2476034201","https://openalex.org/W2511428026","https://openalex.org/W2520774189","https://openalex.org/W2522148122","https://openalex.org/W2574447816","https://openalex.org/W2591669147","https://openalex.org/W2618530766","https://openalex.org/W2765440071","https://openalex.org/W2816000491","https://openalex.org/W2963420272","https://openalex.org/W2963765995","https://openalex.org/W2963785020"],"related_works":["https://openalex.org/W2063218608","https://openalex.org/W4386105885","https://openalex.org/W2184288218","https://openalex.org/W2947282851","https://openalex.org/W2374066281","https://openalex.org/W4387423606","https://openalex.org/W2071180033","https://openalex.org/W2036058638","https://openalex.org/W2528082075","https://openalex.org/W155590726"],"abstract_inverted_index":{"Cross-modal":[0],"retrieval":[1,65,75,162],"methods":[2],"have":[3],"been":[4],"significantly":[5],"improved":[6],"in":[7,98,111],"last":[8],"years":[9],"with":[10],"the":[11,58,82,89,105,109,125,135,155,169,172],"use":[12],"of":[13,36,48,57,92,108,128,142,168],"deep":[14],"neural":[15],"networks":[16],"and":[17,24,28,86,123],"large-scale":[18,63],"annotated":[19],"datasets":[20,31],"such":[21,30],"as":[22,79,120],"ImageNet":[23,173],"Places.":[25],"However,":[26],"collecting":[27],"annotating":[29],"requires":[32],"a":[33,72,100],"tremendous":[34],"amount":[35],"human":[37],"effort":[38],"and,":[39],"besides,":[40],"their":[41],"annotations":[42],"are":[43,158],"limited":[44],"to":[45,102,118,165],"discrete":[46],"sets":[47],"popular":[49],"visual":[50,145],"classes":[51],"that":[52,77,134,154],"may":[53],"not":[54,139],"be":[55],"representative":[56],"richer":[59],"semantics":[60],"found":[61],"on":[62,88,171],"cross-modal":[64,74,161],"datasets.":[66],"In":[67],"this":[68],"paper,":[69],"we":[70],"present":[71],"self-supervised":[73],"framework":[76],"leverages":[78],"training":[80,99],"data":[81],"correlations":[83],"between":[84],"images":[85],"text":[87],"entire":[90],"set":[91],"Wikipedia":[93],"articles.":[94],"Our":[95,131],"method":[96,137],"consists":[97],"CNN":[101],"predict:":[103],"(1)":[104],"semantic":[106,126],"context":[107,127],"article":[110],"which":[112],"an":[113,121],"image":[114],"is":[115,138],"more":[116],"probable":[117],"appear":[119],"illustration,":[122],"(2)":[124],"its":[129],"caption.":[130],"experiments":[132],"demonstrate":[133],"proposed":[136],"only":[140],"capable":[141],"learning":[143],"discriminative":[144],"representations":[146,157],"for":[147,160],"solving":[148],"vision":[149],"tasks":[150],"like":[151],"classification,":[152],"but":[153],"learned":[156],"better":[159],"when":[163],"compared":[164],"supervised":[166],"pre-training":[167],"network":[170],"dataset.":[174]},"counts_by_year":[{"year":2025,"cited_by_count":3},{"year":2024,"cited_by_count":3},{"year":2022,"cited_by_count":1},{"year":2021,"cited_by_count":1},{"year":2020,"cited_by_count":3}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
