{"id":"https://openalex.org/W4401455192","doi":"https://doi.org/10.1145/3686805","title":"VITR: Augmenting Vision Transformers with Relation-Focused Learning for Cross-modal Information Retrieval","display_name":"VITR: Augmenting Vision Transformers with Relation-Focused Learning for Cross-modal Information Retrieval","publication_year":2024,"publication_date":"2024-08-09","ids":{"openalex":"https://openalex.org/W4401455192","doi":"https://doi.org/10.1145/3686805"},"language":"en","primary_location":{"id":"doi:10.1145/3686805","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3686805","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3686805","source":{"id":"https://openalex.org/S41523882","display_name":"ACM Transactions on Knowledge Discovery from Data","issn_l":"1556-4681","issn":["1556-4681","1556-472X"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ACM Transactions on Knowledge Discovery from Data","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"hybrid","oa_url":"https://dl.acm.org/doi/pdf/10.1145/3686805","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5074385870","display_name":"Yan Gong","orcid":"https://orcid.org/0000-0003-2853-2108"},"institutions":[{"id":"https://openalex.org/I9300472","display_name":"Bournemouth University","ror":"https://ror.org/05wwcw481","country_code":"GB","type":"education","lineage":["https://openalex.org/I9300472"]}],"countries":["GB"],"is_corresponding":true,"raw_author_name":"Yan Gong","raw_affiliation_strings":["Department of Computing and Informatics, Bournemouth University, Bournemouth, UK"],"affiliations":[{"raw_affiliation_string":"Department of Computing and Informatics, Bournemouth University, Bournemouth, UK","institution_ids":["https://openalex.org/I9300472"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5022173527","display_name":"Georgina Cosma","orcid":"https://orcid.org/0000-0002-4663-6907"},"institutions":[{"id":"https://openalex.org/I143804889","display_name":"Loughborough University","ror":"https://ror.org/04vg4w365","country_code":"GB","type":"education","lineage":["https://openalex.org/I143804889"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Georgina Cosma","raw_affiliation_strings":["Department of Computer Science, Loughborough University, Loughborough, UK"],"affiliations":[{"raw_affiliation_string":"Department of Computer Science, Loughborough University, Loughborough, UK","institution_ids":["https://openalex.org/I143804889"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5005424936","display_name":"Axel Finke","orcid":"https://orcid.org/0000-0002-8379-3012"},"institutions":[{"id":"https://openalex.org/I143804889","display_name":"Loughborough University","ror":"https://ror.org/04vg4w365","country_code":"GB","type":"education","lineage":["https://openalex.org/I143804889"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Axel Finke","raw_affiliation_strings":["Department of Mathematical Sciences, Loughborough University, Loughborough, UK"],"affiliations":[{"raw_affiliation_string":"Department of Mathematical Sciences, Loughborough University, Loughborough, UK","institution_ids":["https://openalex.org/I143804889"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5074385870"],"corresponding_institution_ids":["https://openalex.org/I9300472"],"apc_list":null,"apc_paid":null,"fwci":0.4935,"has_fulltext":true,"cited_by_count":4,"citation_normalized_percentile":{"value":0.6374591,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":91,"max":99},"biblio":{"volume":"18","issue":"9","first_page":"1","last_page":"21"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.9993000030517578,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.9976999759674072,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.787078857421875},{"id":"https://openalex.org/keywords/modal","display_name":"Modal","score":0.5951476693153381},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.5760657787322998},{"id":"https://openalex.org/keywords/relation","display_name":"Relation (database)","score":0.5160769820213318},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.5123961567878723},{"id":"https://openalex.org/keywords/encoder","display_name":"Encoder","score":0.502100944519043},{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.43682006001472473},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.3589828908443451},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.35212913155555725},{"id":"https://openalex.org/keywords/data-mining","display_name":"Data mining","score":0.3169844448566437}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.787078857421875},{"id":"https://openalex.org/C71139939","wikidata":"https://www.wikidata.org/wiki/Q910194","display_name":"Modal","level":2,"score":0.5951476693153381},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.5760657787322998},{"id":"https://openalex.org/C25343380","wikidata":"https://www.wikidata.org/wiki/Q277521","display_name":"Relation (database)","level":2,"score":0.5160769820213318},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5123961567878723},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.502100944519043},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.43682006001472473},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.3589828908443451},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.35212913155555725},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.3169844448566437},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.0},{"id":"https://openalex.org/C165801399","wikidata":"https://www.wikidata.org/wiki/Q25428","display_name":"Voltage","level":2,"score":0.0},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C188027245","wikidata":"https://www.wikidata.org/wiki/Q750446","display_name":"Polymer chemistry","level":1,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1145/3686805","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3686805","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3686805","source":{"id":"https://openalex.org/S41523882","display_name":"ACM Transactions on Knowledge Discovery from Data","issn_l":"1556-4681","issn":["1556-4681","1556-472X"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ACM Transactions on Knowledge Discovery from Data","raw_type":"journal-article"},{"id":"pmh:oai:eprints.bournemouth.ac.uk:40585","is_oa":true,"landing_page_url":null,"pdf_url":"https://eprints.bournemouth.ac.uk/40585/1/VITR%20Augmenting.pdf","source":{"id":"https://openalex.org/S4306400187","display_name":"Bournemouth University Research Online (Bournemouth University)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I9300472","host_organization_name":"Bournemouth University","host_organization_lineage":["https://openalex.org/I9300472"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"PeerReviewed"}],"best_oa_location":{"id":"doi:10.1145/3686805","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3686805","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3686805","source":{"id":"https://openalex.org/S41523882","display_name":"ACM Transactions on Knowledge Discovery from Data","issn_l":"1556-4681","issn":["1556-4681","1556-472X"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ACM Transactions on Knowledge Discovery from Data","raw_type":"journal-article"},"sustainable_development_goals":[{"score":0.6700000166893005,"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4401455192.pdf"},"referenced_works_count":51,"referenced_works":["https://openalex.org/W639708223","https://openalex.org/W1861492603","https://openalex.org/W2157331557","https://openalex.org/W2185175083","https://openalex.org/W2482105843","https://openalex.org/W2524324335","https://openalex.org/W2561715562","https://openalex.org/W2596164567","https://openalex.org/W2889811054","https://openalex.org/W2962964995","https://openalex.org/W2963109634","https://openalex.org/W2985331920","https://openalex.org/W2987119394","https://openalex.org/W2988823324","https://openalex.org/W2998356391","https://openalex.org/W3035688398","https://openalex.org/W3088460296","https://openalex.org/W3090449556","https://openalex.org/W3115656231","https://openalex.org/W3118694826","https://openalex.org/W3135367836","https://openalex.org/W3154992183","https://openalex.org/W3161944108","https://openalex.org/W3166396011","https://openalex.org/W3168114581","https://openalex.org/W3173909648","https://openalex.org/W3175888430","https://openalex.org/W3181758331","https://openalex.org/W3184735396","https://openalex.org/W3186513731","https://openalex.org/W3186617021","https://openalex.org/W3198064418","https://openalex.org/W3198555355","https://openalex.org/W4210894218","https://openalex.org/W4220897773","https://openalex.org/W4220974559","https://openalex.org/W4221166573","https://openalex.org/W4286611278","https://openalex.org/W4288046368","https://openalex.org/W4304014690","https://openalex.org/W4304080498","https://openalex.org/W4312055654","https://openalex.org/W4312749754","https://openalex.org/W4313178921","https://openalex.org/W4313181088","https://openalex.org/W4313591130","https://openalex.org/W4317935377","https://openalex.org/W4320341494","https://openalex.org/W4323927080","https://openalex.org/W4386065752","https://openalex.org/W6796617330"],"related_works":["https://openalex.org/W4234874385","https://openalex.org/W2323648130","https://openalex.org/W2157140558","https://openalex.org/W2378782423","https://openalex.org/W4390516098","https://openalex.org/W2181948922","https://openalex.org/W2388988621","https://openalex.org/W2357797405","https://openalex.org/W2384362569","https://openalex.org/W2366623913"],"abstract_inverted_index":{"The":[0,170,186],"relations":[1,113,144],"expressed":[2],"in":[3,48,59,64,146,210],"user":[4],"queries":[5],"are":[6],"vital":[7],"for":[8,44],"cross-modal":[9,13,50,133,183],"information":[10,18,184],"retrieval.":[11,185,214],"Relation-focused":[12],"retrieval":[14,26],"aims":[15],"to":[16,21,67,77,138,162],"retrieve":[17],"that":[19,103,154,201],"corresponds":[20],"these":[22,60],"relations,":[23],"enabling":[24,136],"effective":[25],"across":[27],"different":[28],"modalities.":[29],"Pre-trained":[30],"networks,":[31,37],"such":[32],"as":[33],"Contrastive":[34],"Language-Image":[35],"Pre-training":[36],"have":[38],"gained":[39],"significant":[40],"attention":[41],"and":[42,94,108,140,168,197,212],"acclaim":[43],"their":[45],"exceptional":[46],"performance":[47],"various":[49],"learning":[51],"tasks.":[52],"However,":[53],"the":[54,84,89,129,156,179,190,193,202],"Vision":[55],"Transformer":[56],"(ViT)":[57],"used":[58],"networks":[61,134,209],"is":[62,75,120],"limited":[63],"its":[65],"ability":[66],"focus":[68],"on":[69,115,178],"image":[70,92,111],"region":[71,112,143],"relations.":[72],"Specifically,":[73],"ViT":[74,105],"trained":[76],"match":[78],"images":[79,167],"with":[80,142,159],"relevant":[81],"descriptions":[82],"at":[83],"global":[85,160],"level,":[86],"without":[87],"considering":[88],"alignment":[90],"between":[91,166],"regions":[93],"descriptions.":[95,169],"This":[96],"article":[97],"introduces":[98],"VITR,":[99],"a":[100,116,151],"novel":[101],"network":[102,173,205],"enhances":[104],"by":[106,135],"extracting":[107],"reasoning":[109],"about":[110],"based":[114],"local":[117],"encoder.":[118],"VITR":[119,149,172,204],"comprised":[121],"of":[122,131,181,192],"two":[123],"key":[124],"components.":[125],"Firstly,":[126],"it":[127],"extends":[128],"capabilities":[130],"ViT-based":[132],"them":[137],"extract":[139],"reason":[141],"present":[145],"images.":[147],"Secondly,":[148],"incorporates":[150],"fusion":[152],"module":[153],"combines":[155],"reasoned":[157],"results":[158,187],"knowledge":[161],"predict":[163],"similarity":[164],"scores":[165],"proposed":[171,203],"was":[174],"evaluated":[175],"through":[176],"experiments":[177],"tasks":[180],"relation-focused":[182],"derived":[188],"from":[189],"analysis":[191],"Flickr30K,":[194],"MS-COCO,":[195],"RefCOCOg,":[196],"CLEVR":[198],"datasets":[199],"demonstrated":[200],"consistently":[206],"outperforms":[207],"state-of-the-art":[208],"image-to-text":[211],"text-to-image":[213]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":1},{"year":2023,"cited_by_count":2}],"updated_date":"2026-04-05T17:49:38.594831","created_date":"2025-10-10T00:00:00"}
