{"id":"https://openalex.org/W3206628126","doi":"https://doi.org/10.1145/3462244.3479913","title":"Cross Lingual Video and Text Retrieval: A New Benchmark Dataset and Algorithm","display_name":"Cross Lingual Video and Text Retrieval: A New Benchmark Dataset and Algorithm","publication_year":2021,"publication_date":"2021-10-15","ids":{"openalex":"https://openalex.org/W3206628126","doi":"https://doi.org/10.1145/3462244.3479913","mag":"3206628126"},"language":"en","primary_location":{"id":"doi:10.1145/3462244.3479913","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3462244.3479913","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2021 International Conference on Multimodal Interaction","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5064447033","display_name":"Jayaprakash Akula","orcid":"https://orcid.org/0000-0002-1612-1064"},"institutions":[{"id":"https://openalex.org/I162827531","display_name":"Indian Institute of Technology Bombay","ror":"https://ror.org/02qyf5152","country_code":"IN","type":"education","lineage":["https://openalex.org/I162827531"]}],"countries":["IN"],"is_corresponding":true,"raw_author_name":"Jayaprakash Akula","raw_affiliation_strings":["Indian Institute Of Technology Bombay, India"],"affiliations":[{"raw_affiliation_string":"Indian Institute Of Technology Bombay, India","institution_ids":["https://openalex.org/I162827531"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100728747","display_name":"Abhishek Singh","orcid":"https://orcid.org/0009-0004-2885-9850"},"institutions":[{"id":"https://openalex.org/I162827531","display_name":"Indian Institute of Technology Bombay","ror":"https://ror.org/02qyf5152","country_code":"IN","type":"education","lineage":["https://openalex.org/I162827531"]}],"countries":["IN"],"is_corresponding":false,"raw_author_name":"Abhishek","raw_affiliation_strings":["Indian Institute of Technology, Bombay, India"],"affiliations":[{"raw_affiliation_string":"Indian Institute of Technology, Bombay, India","institution_ids":["https://openalex.org/I162827531"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5089712643","display_name":"Rishabh Dabral","orcid":"https://orcid.org/0009-0004-1245-4146"},"institutions":[{"id":"https://openalex.org/I162827531","display_name":"Indian Institute of Technology Bombay","ror":"https://ror.org/02qyf5152","country_code":"IN","type":"education","lineage":["https://openalex.org/I162827531"]}],"countries":["IN"],"is_corresponding":false,"raw_author_name":"Rishabh Dabral","raw_affiliation_strings":["IIT Bombay, India"],"affiliations":[{"raw_affiliation_string":"IIT Bombay, India","institution_ids":["https://openalex.org/I162827531"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5036738038","display_name":"Preethi Jyothi","orcid":null},"institutions":[{"id":"https://openalex.org/I162827531","display_name":"Indian Institute of Technology Bombay","ror":"https://ror.org/02qyf5152","country_code":"IN","type":"education","lineage":["https://openalex.org/I162827531"]}],"countries":["IN"],"is_corresponding":false,"raw_author_name":"Preethi Jyothi","raw_affiliation_strings":["IIT Bombay, India"],"affiliations":[{"raw_affiliation_string":"IIT Bombay, India","institution_ids":["https://openalex.org/I162827531"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5089606464","display_name":"Ganesh Ramakrishnan","orcid":"https://orcid.org/0000-0003-4533-2490"},"institutions":[{"id":"https://openalex.org/I162827531","display_name":"Indian Institute of Technology Bombay","ror":"https://ror.org/02qyf5152","country_code":"IN","type":"education","lineage":["https://openalex.org/I162827531"]}],"countries":["IN"],"is_corresponding":false,"raw_author_name":"Ganesh Ramakrishnan","raw_affiliation_strings":["IIT Bombay, India"],"affiliations":[{"raw_affiliation_string":"IIT Bombay, India","institution_ids":["https://openalex.org/I162827531"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5064447033"],"corresponding_institution_ids":["https://openalex.org/I162827531"],"apc_list":null,"apc_paid":null,"fwci":0.0961,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.40513072,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":89,"max":94},"biblio":{"volume":null,"issue":null,"first_page":"595","last_page":"603"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11439","display_name":"Video Analysis and Summarization","score":0.9995999932289124,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":0.9970999956130981,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.7836748361587524},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7834619879722595},{"id":"https://openalex.org/keywords/video-retrieval","display_name":"Video retrieval","score":0.5373664498329163},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.5305524468421936},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.4267769455909729},{"id":"https://openalex.org/keywords/algorithm","display_name":"Algorithm","score":0.42291608452796936},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.3214927017688751}],"concepts":[{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.7836748361587524},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7834619879722595},{"id":"https://openalex.org/C2983174267","wikidata":"https://www.wikidata.org/wiki/Q3775098","display_name":"Video retrieval","level":2,"score":0.5373664498329163},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5305524468421936},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.4267769455909729},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.42291608452796936},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.3214927017688751},{"id":"https://openalex.org/C13280743","wikidata":"https://www.wikidata.org/wiki/Q131089","display_name":"Geodesy","level":1,"score":0.0},{"id":"https://openalex.org/C205649164","wikidata":"https://www.wikidata.org/wiki/Q1071","display_name":"Geography","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3462244.3479913","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3462244.3479913","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2021 International Conference on Multimodal Interaction","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Quality Education","score":0.6600000262260437,"id":"https://metadata.un.org/sdg/4"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":25,"referenced_works":["https://openalex.org/W1491389626","https://openalex.org/W1532499126","https://openalex.org/W2096733369","https://openalex.org/W2100799972","https://openalex.org/W2425121537","https://openalex.org/W2526050071","https://openalex.org/W2620629206","https://openalex.org/W2753311918","https://openalex.org/W2796207103","https://openalex.org/W2891456603","https://openalex.org/W2897439619","https://openalex.org/W2908138876","https://openalex.org/W2910905530","https://openalex.org/W2951019013","https://openalex.org/W2963155035","https://openalex.org/W2963350250","https://openalex.org/W2963420686","https://openalex.org/W2963524571","https://openalex.org/W2964241990","https://openalex.org/W2964268240","https://openalex.org/W2965458216","https://openalex.org/W2970045865","https://openalex.org/W3015783745","https://openalex.org/W3099206234","https://openalex.org/W3102887392"],"related_works":["https://openalex.org/W2378211422","https://openalex.org/W4321353415","https://openalex.org/W2745001401","https://openalex.org/W2130974462","https://openalex.org/W2051487156","https://openalex.org/W2028665553","https://openalex.org/W2086519370","https://openalex.org/W972276598","https://openalex.org/W4246352526","https://openalex.org/W2399947890"],"abstract_inverted_index":{"Video":[0],"retrieval":[1,180,218],"using":[2,25,81,207],"natural":[3],"language":[4,216],"queries":[5],"requires":[6],"learning":[7],"semantically":[8,126],"meaningful":[9],"joint":[10,21],"embeddings":[11,22],"between":[12,132],"the":[13,16,51,58,64,91,129,133,136,148,156,208,215,225,229],"text":[14],"and":[15,100,135,151,154,160,170,185,194],"audio-visual":[17,67],"input.":[18],"Often,":[19],"such":[20,123],"are":[23,120,220],"learnt":[24],"pairwise":[26],"(or":[27],"triplet)":[28],"contrastive":[29],"loss":[30],"objectives":[31],"which":[32],"cannot":[33],"give":[34],"enough":[35],"attention":[36],"to":[37,62,75,84,90,111],"\u2018difficult-to-retrieve\u2019":[38],"samples":[39,94],"during":[40],"training.":[41],"This":[42],"problem":[43],"is":[44,53],"especially":[45,213],"pronounced":[46],"in":[47,88,128,140,168,188],"data-scarce":[48,169],"settings":[49],"where":[50],"data":[52,78],"relatively":[54],"small":[55],"(10%":[56],"of":[57,95],"large":[59],"scale":[60],"MSR-VTT)":[61],"cover":[63],"rather":[65],"complex":[66],"embedding":[68,142],"space.":[69],"In":[70],"this":[71],"context,":[72],"we":[73,102,173],"propose":[74],"compensate":[76],"for":[77],"scarcity":[79],"by":[80,223],"domain":[82],"knowledge":[83],"augment":[85],"supervision.":[86],"Specifically,":[87],"addition":[89],"conventional":[92,149],"three":[93],"a":[96,104,108,113,177],"triplet":[97,152],"(anchor,":[98],"positive,":[99],"negative),":[101],"introduce":[103,174],"fourth":[105],"term":[106],"-":[107,110,176],"partial":[109,210],"define":[112],"margin":[114],"based":[115],"partial-order":[116],"loss.":[117],"The":[118],"partials":[119],"heuristically":[121],"sampled":[122],"that":[124,182],"they":[125],"lie":[127],"overlap":[130],"zone":[131],"positives":[134],"negatives,":[137],"thereby":[138],"resulting":[139],"broader":[141],"coverage.":[143],"Our":[144],"proposals":[145],"consistently":[146],"outperform":[147],"max-margin":[150],"losses":[153],"improve":[155],"state-of-the-art":[157],"on":[158,200],"MSR-VTT":[159],"DiDeMO":[161],"datasets.":[162],"To":[163],"further":[164],"evaluate":[165],"our":[166],"method":[167],"low-resource":[171],"setting,":[172],"Rudder":[175,201],"multilingual":[178],"video-text":[179],"dataset":[181],"includes":[183],"audio":[184],"textual":[186],"captions":[187],"Marathi,":[189],"Hindi,":[190],"Tamil,":[191],"Kannada,":[192],"Malayalam":[193],"Telugu.":[195],"We":[196],"report":[197],"benchmark":[198],"results":[199],"while":[202],"also":[203],"observing":[204],"significant":[205],"gains":[206],"proposed":[209],"order":[211],"loss,":[212],"when":[214],"specific":[217],"models":[219],"jointly":[221],"trained":[222],"availing":[224],"cross-lingual":[226],"alignment":[227],"across":[228],"language-specific":[230],"datasets.1":[231]},"counts_by_year":[{"year":2023,"cited_by_count":1}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
