{"id":"https://openalex.org/W3197980617","doi":"https://doi.org/10.1145/3463945.3468169","title":"Cross-modal Pretraining and Matching for Video Understanding","display_name":"Cross-modal Pretraining and Matching for Video Understanding","publication_year":2021,"publication_date":"2021-08-21","ids":{"openalex":"https://openalex.org/W3197980617","doi":"https://doi.org/10.1145/3463945.3468169","mag":"3197980617"},"language":"en","primary_location":{"id":"doi:10.1145/3463945.3468169","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3463945.3468169","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2021 Workshop on Multi-Modal Pre-Training for Multimedia Understanding","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100436505","display_name":"Limin Wang","orcid":"https://orcid.org/0000-0002-3674-7718"},"institutions":[{"id":"https://openalex.org/I881766915","display_name":"Nanjing University","ror":"https://ror.org/01rxvg760","country_code":"CN","type":"education","lineage":["https://openalex.org/I881766915"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Limin Wang","raw_affiliation_strings":["Nanjing University, Nanjing, China"],"affiliations":[{"raw_affiliation_string":"Nanjing University, Nanjing, China","institution_ids":["https://openalex.org/I881766915"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":1,"corresponding_author_ids":["https://openalex.org/A5100436505"],"corresponding_institution_ids":["https://openalex.org/I881766915"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.10857843,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"2"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":0.9990000128746033,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":0.9990000128746033,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9962000250816345,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10515","display_name":"Cancer-related molecular mechanisms research","score":0.9829000234603882,"subfield":{"id":"https://openalex.org/subfields/1306","display_name":"Cancer Research"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8308007121086121},{"id":"https://openalex.org/keywords/modal","display_name":"Modal","score":0.5873274803161621},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.5337753295898438},{"id":"https://openalex.org/keywords/focus","display_name":"Focus (optics)","score":0.5074130892753601},{"id":"https://openalex.org/keywords/action-recognition","display_name":"Action recognition","score":0.4745158553123474},{"id":"https://openalex.org/keywords/initialization","display_name":"Initialization","score":0.47114959359169006},{"id":"https://openalex.org/keywords/internet-video","display_name":"Internet video","score":0.46410664916038513},{"id":"https://openalex.org/keywords/matching","display_name":"Matching (statistics)","score":0.44133296608924866},{"id":"https://openalex.org/keywords/similarity","display_name":"Similarity (geometry)","score":0.41741248965263367},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.36087566614151},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.32462838292121887},{"id":"https://openalex.org/keywords/multimedia","display_name":"Multimedia","score":0.32296502590179443},{"id":"https://openalex.org/keywords/the-internet","display_name":"The Internet","score":0.2031797468662262},{"id":"https://openalex.org/keywords/world-wide-web","display_name":"World Wide Web","score":0.09602528810501099},{"id":"https://openalex.org/keywords/image","display_name":"Image (mathematics)","score":0.09432056546211243}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8308007121086121},{"id":"https://openalex.org/C71139939","wikidata":"https://www.wikidata.org/wiki/Q910194","display_name":"Modal","level":2,"score":0.5873274803161621},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5337753295898438},{"id":"https://openalex.org/C192209626","wikidata":"https://www.wikidata.org/wiki/Q190909","display_name":"Focus (optics)","level":2,"score":0.5074130892753601},{"id":"https://openalex.org/C2987834672","wikidata":"https://www.wikidata.org/wiki/Q4677630","display_name":"Action recognition","level":3,"score":0.4745158553123474},{"id":"https://openalex.org/C114466953","wikidata":"https://www.wikidata.org/wiki/Q6034165","display_name":"Initialization","level":2,"score":0.47114959359169006},{"id":"https://openalex.org/C2779789524","wikidata":"https://www.wikidata.org/wiki/Q16885149","display_name":"Internet video","level":3,"score":0.46410664916038513},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.44133296608924866},{"id":"https://openalex.org/C103278499","wikidata":"https://www.wikidata.org/wiki/Q254465","display_name":"Similarity (geometry)","level":3,"score":0.41741248965263367},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.36087566614151},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.32462838292121887},{"id":"https://openalex.org/C49774154","wikidata":"https://www.wikidata.org/wiki/Q131765","display_name":"Multimedia","level":1,"score":0.32296502590179443},{"id":"https://openalex.org/C110875604","wikidata":"https://www.wikidata.org/wiki/Q75","display_name":"The Internet","level":2,"score":0.2031797468662262},{"id":"https://openalex.org/C136764020","wikidata":"https://www.wikidata.org/wiki/Q466","display_name":"World Wide Web","level":1,"score":0.09602528810501099},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.09432056546211243},{"id":"https://openalex.org/C105795698","wikidata":"https://www.wikidata.org/wiki/Q12483","display_name":"Statistics","level":1,"score":0.0},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.0},{"id":"https://openalex.org/C120665830","wikidata":"https://www.wikidata.org/wiki/Q14620","display_name":"Optics","level":1,"score":0.0},{"id":"https://openalex.org/C188027245","wikidata":"https://www.wikidata.org/wiki/Q750446","display_name":"Polymer chemistry","level":1,"score":0.0},{"id":"https://openalex.org/C2777212361","wikidata":"https://www.wikidata.org/wiki/Q5127848","display_name":"Class (philosophy)","level":2,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.0},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3463945.3468169","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3463945.3468169","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2021 Workshop on Multi-Modal Pre-Training for Multimedia Understanding","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Peace, Justice and strong institutions","score":0.5400000214576721,"id":"https://metadata.un.org/sdg/16"},{"display_name":"Reduced inequalities","score":0.4300000071525574,"id":"https://metadata.un.org/sdg/10"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W3204184292","https://openalex.org/W3176564347","https://openalex.org/W1985458517","https://openalex.org/W2355833770","https://openalex.org/W3031039437","https://openalex.org/W183202219","https://openalex.org/W3095877357","https://openalex.org/W2072565696","https://openalex.org/W2050451745","https://openalex.org/W2080282247"],"abstract_inverted_index":{"Videos":[0],"are":[1,85],"generally":[2],"accompanied":[3,87],"with":[4,267],"multi-modal":[5,14,115],"information":[6,15,91,103,109],"such":[7,43,92],"as":[8,44,93,106,121,134],"audio,":[9],"text,":[10],"and":[11,51,69,98,163,178,231,262,276,289,318,365,384],"motion.":[12],"The":[13],"is":[16,34,243,260],"becoming":[17],"an":[18,36,212],"important":[19],"cue":[20],"for":[21,141,197,226,264,354],"understanding":[22,41,63],"video":[23,40,45,48,52,62,124,142,162,175,181,290],"content.":[24],"How":[25],"to":[26,110,126,138,156,184,215,245,250,281,341,348],"model":[27,210,283],"the":[28,74,130,191,203,265,284,331,361],"correlation":[29,159],"between":[30,160,286,363],"multi-modalities":[31],"in":[32,39,292],"videos":[33,84],"still":[35],"unsolved":[37],"problem":[38],"tasks":[42,64],"action":[46,198,227],"recognition,":[47],"temporal":[49,70,356],"grounding,":[50,357],"description.":[53],"In":[54,79,346],"this":[55,158],"talk,":[56],"we":[57,81,146,272,358],"focus":[58],"on":[59,129,172,200,217,229,238,378],"two":[60,307,336],"specific":[61],"(i.e.,":[65],"cross-modal":[66,135,151],"self-supervised":[67],"pretraining":[68],"grounding)":[71],"by":[72,88,367],"exploiting":[73,303],"video-text":[75],"cross":[76],"modal":[77],"information.":[78,345],"particular,":[80],"notice":[82],"that":[83,236],"naturally":[86],"abundant":[89],"text":[90],"YouTube":[94],"titles,":[95],"Instagram":[96],"captions,":[97],"Movie":[99],"scripts.":[100],"This":[101,297],"textual":[102],"could":[104,118,329],"serve":[105],"a":[107,114,122,149,161,222,239,247,274,293,314],"general":[108,123,150],"guide":[111],"us":[112],"train":[113,168],"network,":[116],"which":[117,220,259],"be":[119,127,139],"used":[120,140],"representation":[125,333],"fine-tuned":[128],"downstream":[131,218],"tasks,":[132,219],"or":[133],"matching":[136,316],"similarity":[137],"segment":[143],"retrieval.":[144],"Specifically,":[145],"first":[147],"present":[148,273],"pair":[152],"discrimination":[153],"(CPD)":[154],"framework":[155,300],"capture":[157],"its":[164,186],"associated":[165],"text.":[166],"We":[167],"our":[169,208],"CPD":[170,234],"models":[171,193],"both":[173],"standard":[174],"dataset":[176,182,242],"(Kinetics-210k)":[177],"uncurated":[179],"web":[180],"(Instagram-300k)":[183],"demonstrate":[185],"effectiveness.":[187],"Without":[188],"further":[189],"fine-tuning,":[190],"learnt":[192],"obtain":[194],"competitive":[195],"results":[196],"classification":[199,205],"Kinetics":[201],"under":[202],"linear":[204],"protocol.":[206],"Moreover,":[207],"visual":[209],"provides":[211],"effective":[213],"initialization":[214],"fine-tune":[216],"yields":[221,375],"remarkable":[223],"performance":[224,249,377],"gain":[225],"recognition":[228],"UCF101":[230],"HMDB51.":[232],"Our":[233,373],"demonstrates":[235],"pre-training":[237],"relatively":[240,351],"small":[241],"able":[244],"yield":[246],"comparable":[248],"those":[251],"methods":[252],"of":[253,335,381],"using":[254],"order":[255],"magnitude":[256],"more":[257],"data,":[258],"meaningful":[261],"practicable":[263],"scenarios":[266],"limited":[268],"computational":[269],"facilities.":[270],"Second,":[271],"Contrastive":[275],"Compatible":[277],"Matching":[278],"Network":[279],"(C2M-Net),":[280],"directly":[282,368],"relations":[285],"language":[287],"queries":[288,364],"moments":[291,366],"joint":[294,332],"embedding":[295],"space.":[296],"new":[298,308,326],"metric-learning":[299],"enables":[301],"fully":[302],"negative":[304,311,320,327],"samples":[305,328],"from":[306,313],"aspects:":[309],"constructing":[310],"pairs":[312,321,353],"dual":[315],"scheme":[317],"mining":[319],"across":[322],"different":[323],"videos.":[324],"These":[325],"enhance":[330],"learning":[334,340],"modalities":[337],"via":[338],"contrastive":[339],"maximize":[342],"their":[343,370],"mutual":[344],"addition,":[347],"precisely":[349],"rank":[350],"positive":[352],"accurate":[355],"also":[359],"learn":[360],"compatibility":[362],"regressing":[369],"IoU-based":[371],"similarity.":[372],"C2M-Net":[374],"state-of-the-art":[376],"three":[379],"benchmarks":[380],"CharadesSTA,":[382],"TACoS,":[383],"ActivityNet-Captions.":[385]},"counts_by_year":[],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
