{"id":"https://openalex.org/W4400525916","doi":"https://doi.org/10.1145/3626772.3657833","title":"M <sup>2</sup> -RAAP: A Multi-Modal Recipe for Advancing Adaptation-based Pre-training towards Effective and Efficient Zero-shot Video-text Retrieval","display_name":"M <sup>2</sup> -RAAP: A Multi-Modal Recipe for Advancing Adaptation-based Pre-training towards Effective and Efficient Zero-shot Video-text Retrieval","publication_year":2024,"publication_date":"2024-07-10","ids":{"openalex":"https://openalex.org/W4400525916","doi":"https://doi.org/10.1145/3626772.3657833"},"language":"en","primary_location":{"id":"doi:10.1145/3626772.3657833","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3626772.3657833","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 47th International ACM SIGIR Conference on Research and Development in Information Retrieval","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5036171993","display_name":"Xingning Dong","orcid":"https://orcid.org/0000-0002-0245-9064"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Xingning Dong","raw_affiliation_strings":["Ant Group, Hangzhou, China"],"raw_orcid":"https://orcid.org/0000-0002-0245-9064","affiliations":[{"raw_affiliation_string":"Ant Group, Hangzhou, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5081941920","display_name":"Zipeng Feng","orcid":"https://orcid.org/0009-0002-0908-9200"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zipeng Feng","raw_affiliation_strings":["Ant Group, Beijing, China"],"raw_orcid":"https://orcid.org/0009-0002-0908-9200","affiliations":[{"raw_affiliation_string":"Ant Group, Beijing, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5063248074","display_name":"Chunluan Zhou","orcid":"https://orcid.org/0000-0003-0284-6256"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chunluan Zhou","raw_affiliation_strings":["Ant Group, Beijing, China"],"raw_orcid":"https://orcid.org/0000-0003-0284-6256","affiliations":[{"raw_affiliation_string":"Ant Group, Beijing, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5031500761","display_name":"Xuzheng Yu","orcid":"https://orcid.org/0009-0000-9752-799X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xuzheng Yu","raw_affiliation_strings":["Ant Group, Hangzhou, China"],"raw_orcid":"https://orcid.org/0009-0000-9752-799X","affiliations":[{"raw_affiliation_string":"Ant Group, Hangzhou, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5019934281","display_name":"Ming Yang","orcid":"https://orcid.org/0000-0003-1691-6817"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ming Yang","raw_affiliation_strings":["Ant Group, Hangzhou, CA, China"],"raw_orcid":"https://orcid.org/0000-0003-1691-6817","affiliations":[{"raw_affiliation_string":"Ant Group, Hangzhou, CA, China","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5086923590","display_name":"Qingpei Guo","orcid":"https://orcid.org/0009-0001-0521-9664"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Qingpei Guo","raw_affiliation_strings":["Ant Group, Beijing, China"],"raw_orcid":"https://orcid.org/0009-0001-0521-9664","affiliations":[{"raw_affiliation_string":"Ant Group, Beijing, China","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5036171993"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.2381,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.49057617,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":91,"max":95},"biblio":{"volume":null,"issue":null,"first_page":"2156","last_page":"2166"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11439","display_name":"Video Analysis and Summarization","score":0.996399998664856,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.9915000200271606,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/recipe","display_name":"Recipe","score":0.8545670509338379},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7460222244262695},{"id":"https://openalex.org/keywords/training","display_name":"Training (meteorology)","score":0.6778704524040222},{"id":"https://openalex.org/keywords/shot","display_name":"Shot (pellet)","score":0.6404865384101868},{"id":"https://openalex.org/keywords/zero","display_name":"Zero (linguistics)","score":0.6391212940216064},{"id":"https://openalex.org/keywords/adaptation","display_name":"Adaptation (eye)","score":0.5914151668548584},{"id":"https://openalex.org/keywords/modal","display_name":"Modal","score":0.5479124784469604},{"id":"https://openalex.org/keywords/training-set","display_name":"Training set","score":0.432059645652771},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.4225784242153168},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4188433885574341},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.33176493644714355},{"id":"https://openalex.org/keywords/multimedia","display_name":"Multimedia","score":0.32984036207199097},{"id":"https://openalex.org/keywords/linguistics","display_name":"Linguistics","score":0.08619093894958496},{"id":"https://openalex.org/keywords/physics","display_name":"Physics","score":0.06609031558036804}],"concepts":[{"id":"https://openalex.org/C2778671685","wikidata":"https://www.wikidata.org/wiki/Q219239","display_name":"Recipe","level":2,"score":0.8545670509338379},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7460222244262695},{"id":"https://openalex.org/C2777211547","wikidata":"https://www.wikidata.org/wiki/Q17141490","display_name":"Training (meteorology)","level":2,"score":0.6778704524040222},{"id":"https://openalex.org/C2778344882","wikidata":"https://www.wikidata.org/wiki/Q278938","display_name":"Shot (pellet)","level":2,"score":0.6404865384101868},{"id":"https://openalex.org/C2780813799","wikidata":"https://www.wikidata.org/wiki/Q3274237","display_name":"Zero (linguistics)","level":2,"score":0.6391212940216064},{"id":"https://openalex.org/C139807058","wikidata":"https://www.wikidata.org/wiki/Q352374","display_name":"Adaptation (eye)","level":2,"score":0.5914151668548584},{"id":"https://openalex.org/C71139939","wikidata":"https://www.wikidata.org/wiki/Q910194","display_name":"Modal","level":2,"score":0.5479124784469604},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.432059645652771},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.4225784242153168},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4188433885574341},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.33176493644714355},{"id":"https://openalex.org/C49774154","wikidata":"https://www.wikidata.org/wiki/Q131765","display_name":"Multimedia","level":1,"score":0.32984036207199097},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.08619093894958496},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.06609031558036804},{"id":"https://openalex.org/C153294291","wikidata":"https://www.wikidata.org/wiki/Q25261","display_name":"Meteorology","level":1,"score":0.0},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.0},{"id":"https://openalex.org/C31903555","wikidata":"https://www.wikidata.org/wiki/Q1637030","display_name":"Food science","level":1,"score":0.0},{"id":"https://openalex.org/C188027245","wikidata":"https://www.wikidata.org/wiki/Q750446","display_name":"Polymer chemistry","level":1,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C120665830","wikidata":"https://www.wikidata.org/wiki/Q14620","display_name":"Optics","level":1,"score":0.0},{"id":"https://openalex.org/C178790620","wikidata":"https://www.wikidata.org/wiki/Q11351","display_name":"Organic chemistry","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3626772.3657833","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3626772.3657833","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 47th International ACM SIGIR Conference on Research and Development in Information Retrieval","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[{"id":"https://openalex.org/F4320323817","display_name":"Universitas Brawijaya","ror":"https://ror.org/01wk3d929"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":38,"referenced_works":["https://openalex.org/W1861492603","https://openalex.org/W2425121537","https://openalex.org/W2886641317","https://openalex.org/W2914699769","https://openalex.org/W2963017553","https://openalex.org/W2963916161","https://openalex.org/W2981851019","https://openalex.org/W2984008963","https://openalex.org/W2989322838","https://openalex.org/W2999905431","https://openalex.org/W3011411500","https://openalex.org/W3035265375","https://openalex.org/W3121142147","https://openalex.org/W3121304688","https://openalex.org/W3135751354","https://openalex.org/W3168640669","https://openalex.org/W3197457832","https://openalex.org/W3204588463","https://openalex.org/W3204670646","https://openalex.org/W4221142658","https://openalex.org/W4225868495","https://openalex.org/W4285294723","https://openalex.org/W4304092645","https://openalex.org/W4306820534","https://openalex.org/W4312784228","https://openalex.org/W4377021900","https://openalex.org/W4385849199","https://openalex.org/W4386065892","https://openalex.org/W4386076010","https://openalex.org/W4386076424","https://openalex.org/W4386942867","https://openalex.org/W4387968589","https://openalex.org/W4390871765","https://openalex.org/W4390873441","https://openalex.org/W6600234944","https://openalex.org/W6600339963","https://openalex.org/W6600376255","https://openalex.org/W6600504320"],"related_works":["https://openalex.org/W258429745","https://openalex.org/W3161239248","https://openalex.org/W1584543623","https://openalex.org/W2561508161","https://openalex.org/W3195543079","https://openalex.org/W2098178683","https://openalex.org/W2740680361","https://openalex.org/W3207562294","https://openalex.org/W3187068967","https://openalex.org/W2604742737"],"abstract_inverted_index":{"We":[0,77,129],"present":[1],"a":[2,46,174],"Recipe":[3],"for":[4,154],"Effective":[5],"and":[6,37,62,72,97,119,150,169,183,188],"Efficient":[7],"zero-shot":[8,180],"video-text":[9,22,54,106,142],"Retrieval,":[10],"dubbed":[11],"M2-RAAP.":[12],"Upon":[13],"popular":[14],"image-text":[15,136],"models":[16,138],"like":[17],"CLIP,":[18],"most":[19],"current":[20],"adaptation-based":[21,155],"pre-training":[23],"methods":[24],"are":[25,193],"confronted":[26],"by":[27,133],"three":[28,135],"major":[29],"issues,":[30],"i.e.,":[31],"noisy":[32],"data":[33,60,95,167,191],"corpus,":[34],"time-consuming":[35],"pre-training,":[36,118],"limited":[38],"performance":[39,163],"gain.":[40],"Towards":[41],"this":[42,80],"end,":[43],"we":[44,57],"conduct":[45,130],"comprehensive":[47],"study":[48,82],"including":[49],"four":[50,178],"critical":[51],"steps":[52],"in":[53,92,102],"pre-training.":[55,156],"Specifically,":[56],"investigate":[58],"1)":[59,93],"filtering":[61,96],"refinement,":[63],"2)":[64,108],"video":[65,74,112,127],"input":[66],"type":[67],"selection,":[68],"3)":[69,120],"temporal":[70],"modeling,":[71],"4)":[73],"feature":[75],"enhancement.":[76],"then":[78],"summarize":[79],"empirical":[81],"into":[83],"the":[84,94,109,121,148],"M2-RAAP":[85,153,160],"recipe,":[86],"where":[87],"our":[88],"technical":[89],"contributions":[90],"lie":[91],"text":[98],"re-writing":[99],"pipeline":[100],"resulting":[101],"1M":[103],"high-quality":[104],"bilingual":[105,190],"pairs,":[107],"promotion":[110],"of":[111,152],"inputs":[113],"with":[114,164],"key-frames":[115],"to":[116,125],"accelerate":[117],"Auxiliary-Caption-Guided":[122],"(ACG)":[123],"strategy":[124],"enhance":[126],"features.":[128],"extensive":[131],"experiments":[132],"adapting":[134],"foundation":[137],"on":[139,177],"two":[140,184],"refined":[141,189],"datasets":[143,182],"from":[144],"different":[145],"languages,":[146],"validating":[147],"robustness":[149],"reproducibility":[151],"Results":[157],"demonstrate":[158],"that":[159],"yields":[161],"superior":[162],"significantly":[165],"less":[166],"(-90%)":[168],"time":[170],"consumption":[171],"(-95%),":[172],"establishing":[173],"new":[175],"SOTA":[176],"English":[179],"retrieval":[181],"Chinese":[185],"ones.":[186],"Codebase":[187],"annotations":[192],"available":[194],"at":[195],"https://github.com/alipay/Ant-Multi-Modal-Framework/tree/main/prj/M2_RAAP.":[196]},"counts_by_year":[{"year":2025,"cited_by_count":1}],"updated_date":"2026-03-27T05:58:40.876381","created_date":"2025-10-10T00:00:00"}
