{"id":"https://openalex.org/W4402351150","doi":"https://doi.org/10.1109/tpami.2024.3410329","title":"Cap4Video++: Enhancing Video Understanding With Auxiliary Captions","display_name":"Cap4Video++: Enhancing Video Understanding With Auxiliary Captions","publication_year":2024,"publication_date":"2024-09-09","ids":{"openalex":"https://openalex.org/W4402351150","doi":"https://doi.org/10.1109/tpami.2024.3410329","pmid":"https://pubmed.ncbi.nlm.nih.gov/39250359"},"language":"en","primary_location":{"id":"doi:10.1109/tpami.2024.3410329","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tpami.2024.3410329","pdf_url":null,"source":{"id":"https://openalex.org/S199944782","display_name":"IEEE Transactions on Pattern Analysis and Machine Intelligence","issn_l":"0162-8828","issn":["0162-8828","1939-3539","2160-9292"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310320439","host_organization_name":"IEEE Computer Society","host_organization_lineage":["https://openalex.org/P4310320439","https://openalex.org/P4310319808"],"host_organization_lineage_names":["IEEE Computer Society","Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Pattern Analysis and Machine Intelligence","raw_type":"journal-article"},"type":"article","indexed_in":["crossref","pubmed"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5015819673","display_name":"Wenhao Wu","orcid":"https://orcid.org/0000-0002-8511-743X"},"institutions":[{"id":"https://openalex.org/I129604602","display_name":"The University of Sydney","ror":"https://ror.org/0384j8v12","country_code":"AU","type":"education","lineage":["https://openalex.org/I129604602"]}],"countries":["AU"],"is_corresponding":true,"raw_author_name":"Wenhao Wu","raw_affiliation_strings":["School of Computer Science, The University of Sydney, Camperdown, NSW, Australia","School of Computer Science, The University of Sydney, NSW, Australia"],"raw_orcid":"https://orcid.org/0000-0002-8511-743X","affiliations":[{"raw_affiliation_string":"School of Computer Science, The University of Sydney, Camperdown, NSW, Australia","institution_ids":["https://openalex.org/I129604602"]},{"raw_affiliation_string":"School of Computer Science, The University of Sydney, NSW, Australia","institution_ids":["https://openalex.org/I129604602"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100354377","display_name":"Xiaohan Wang","orcid":"https://orcid.org/0000-0001-6206-7911"},"institutions":[{"id":"https://openalex.org/I97018004","display_name":"Stanford University","ror":"https://ror.org/00f54p054","country_code":"US","type":"education","lineage":["https://openalex.org/I97018004"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Xiaohan Wang","raw_affiliation_strings":["Stanford University, Stanford, CA, USA","Stanford University, CA, USA"],"raw_orcid":"https://orcid.org/0000-0001-6206-7911","affiliations":[{"raw_affiliation_string":"Stanford University, Stanford, CA, USA","institution_ids":["https://openalex.org/I97018004"]},{"raw_affiliation_string":"Stanford University, CA, USA","institution_ids":["https://openalex.org/I97018004"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Haipeng Luo","orcid":"https://orcid.org/0009-0006-9625-2588"},"institutions":[{"id":"https://openalex.org/I4210165038","display_name":"University of Chinese Academy of Sciences","ror":"https://ror.org/05qbk4x57","country_code":"CN","type":"education","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210165038"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Haipeng Luo","raw_affiliation_strings":["University of Chinese Academy of Sciences, Beijing, China"],"raw_orcid":"https://orcid.org/0009-0006-9625-2588","affiliations":[{"raw_affiliation_string":"University of Chinese Academy of Sciences, Beijing, China","institution_ids":["https://openalex.org/I4210165038"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5075880303","display_name":"Jingdong Wang","orcid":"https://orcid.org/0000-0002-4888-4445"},"institutions":[{"id":"https://openalex.org/I98301712","display_name":"Baidu (China)","ror":"https://ror.org/03vs3wt56","country_code":"CN","type":"company","lineage":["https://openalex.org/I98301712"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jingdong Wang","raw_affiliation_strings":["Department of Computer Vision Technology, Baidu Inc., Beijing, China"],"raw_orcid":"https://orcid.org/0000-0002-4888-4445","affiliations":[{"raw_affiliation_string":"Department of Computer Vision Technology, Baidu Inc., Beijing, China","institution_ids":["https://openalex.org/I98301712"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5005421447","display_name":"Yi Yang","orcid":"https://orcid.org/0000-0002-0512-880X"},"institutions":[{"id":"https://openalex.org/I168879160","display_name":"Zhejiang University of Science and Technology","ror":"https://ror.org/05mx0wr29","country_code":"CN","type":"education","lineage":["https://openalex.org/I168879160"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yi Yang","raw_affiliation_strings":["College of Computer Science and Technology, Zhejiang University, Zhejiang, China"],"raw_orcid":"https://orcid.org/0000-0002-0512-880X","affiliations":[{"raw_affiliation_string":"College of Computer Science and Technology, Zhejiang University, Zhejiang, China","institution_ids":["https://openalex.org/I168879160"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5087818121","display_name":"Wanli Ouyang","orcid":"https://orcid.org/0000-0002-9163-2761"},"institutions":[{"id":"https://openalex.org/I4210100255","display_name":"Beijing Academy of Artificial Intelligence","ror":"https://ror.org/016a74861","country_code":"CN","type":"other","lineage":["https://openalex.org/I4210100255"]},{"id":"https://openalex.org/I4391012619","display_name":"Shanghai Artificial Intelligence Laboratory","ror":"https://ror.org/03wkvpx79","country_code":null,"type":"facility","lineage":["https://openalex.org/I4391012619"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Wanli Ouyang","raw_affiliation_strings":["Shanghai Artificial Intelligence Laboratory, Shanghai, China"],"raw_orcid":"https://orcid.org/0000-0002-9163-2761","affiliations":[{"raw_affiliation_string":"Shanghai Artificial Intelligence Laboratory, Shanghai, China","institution_ids":["https://openalex.org/I4210100255","https://openalex.org/I4391012619"]}]}],"institutions":[],"countries_distinct_count":3,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5015819673"],"corresponding_institution_ids":["https://openalex.org/I129604602"],"apc_list":null,"apc_paid":null,"fwci":8.0043,"has_fulltext":false,"cited_by_count":3,"citation_normalized_percentile":{"value":0.97296438,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":95,"max":98},"biblio":{"volume":"47","issue":"7","first_page":"5223","last_page":"5237"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T13310","display_name":"Subtitles and Audiovisual Media","score":0.9970999956130981,"subfield":{"id":"https://openalex.org/subfields/1203","display_name":"Language and Linguistics"},"field":{"id":"https://openalex.org/fields/12","display_name":"Arts and Humanities"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T13310","display_name":"Subtitles and Audiovisual Media","score":0.9970999956130981,"subfield":{"id":"https://openalex.org/subfields/1203","display_name":"Language and Linguistics"},"field":{"id":"https://openalex.org/fields/12","display_name":"Arts and Humanities"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T11439","display_name":"Video Analysis and Summarization","score":0.9871000051498413,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12720","display_name":"Multimedia Communication and Technology","score":0.954200029373169,"subfield":{"id":"https://openalex.org/subfields/3312","display_name":"Sociology and Political Science"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.736961305141449},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.5845194458961487},{"id":"https://openalex.org/keywords/computer-vision","display_name":"Computer vision","score":0.4679614007472992},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.39814338088035583},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.36640506982803345},{"id":"https://openalex.org/keywords/multimedia","display_name":"Multimedia","score":0.35003921389579773}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.736961305141449},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5845194458961487},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.4679614007472992},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.39814338088035583},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.36640506982803345},{"id":"https://openalex.org/C49774154","wikidata":"https://www.wikidata.org/wiki/Q131765","display_name":"Multimedia","level":1,"score":0.35003921389579773}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1109/tpami.2024.3410329","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tpami.2024.3410329","pdf_url":null,"source":{"id":"https://openalex.org/S199944782","display_name":"IEEE Transactions on Pattern Analysis and Machine Intelligence","issn_l":"0162-8828","issn":["0162-8828","1939-3539","2160-9292"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310320439","host_organization_name":"IEEE Computer Society","host_organization_lineage":["https://openalex.org/P4310320439","https://openalex.org/P4310319808"],"host_organization_lineage_names":["IEEE Computer Society","Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Pattern Analysis and Machine Intelligence","raw_type":"journal-article"},{"id":"pmid:39250359","is_oa":false,"landing_page_url":"https://pubmed.ncbi.nlm.nih.gov/39250359","pdf_url":null,"source":{"id":"https://openalex.org/S4306525036","display_name":"PubMed","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I1299303238","host_organization_name":"National Institutes of Health","host_organization_lineage":["https://openalex.org/I1299303238"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE transactions on pattern analysis and machine intelligence","raw_type":null}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G6370828123","display_name":null,"funder_award_id":"DP200103223","funder_id":"https://openalex.org/F4320334704","funder_display_name":"Australian Research Council"}],"funders":[{"id":"https://openalex.org/F4320334704","display_name":"Australian Research Council","ror":"https://ror.org/05mmh0f86"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":96,"referenced_works":["https://openalex.org/W1927052826","https://openalex.org/W2126579184","https://openalex.org/W2425121537","https://openalex.org/W2507009361","https://openalex.org/W2526417872","https://openalex.org/W2883429621","https://openalex.org/W2904378456","https://openalex.org/W2962843773","https://openalex.org/W2963017553","https://openalex.org/W2963091558","https://openalex.org/W2963155035","https://openalex.org/W2963689837","https://openalex.org/W2963820951","https://openalex.org/W2984008963","https://openalex.org/W2989322838","https://openalex.org/W2990503944","https://openalex.org/W2996901793","https://openalex.org/W3010010212","https://openalex.org/W3021397474","https://openalex.org/W3034572008","https://openalex.org/W3035141718","https://openalex.org/W3035254087","https://openalex.org/W3035303837","https://openalex.org/W3035356601","https://openalex.org/W3035635319","https://openalex.org/W3043840704","https://openalex.org/W3126721948","https://openalex.org/W3131500599","https://openalex.org/W3138516171","https://openalex.org/W3168640669","https://openalex.org/W3172942063","https://openalex.org/W3174568846","https://openalex.org/W3174873881","https://openalex.org/W3175528717","https://openalex.org/W3176125528","https://openalex.org/W3190981030","https://openalex.org/W3204588463","https://openalex.org/W3207340843","https://openalex.org/W4214612132","https://openalex.org/W4214614183","https://openalex.org/W4214661601","https://openalex.org/W4214727094","https://openalex.org/W4214746887","https://openalex.org/W4221142658","https://openalex.org/W4225414521","https://openalex.org/W4285606530","https://openalex.org/W4304014690","https://openalex.org/W4312254032","https://openalex.org/W4312266966","https://openalex.org/W4312299780","https://openalex.org/W4312302951","https://openalex.org/W4312372711","https://openalex.org/W4312480274","https://openalex.org/W4312558481","https://openalex.org/W4312560592","https://openalex.org/W4312614039","https://openalex.org/W4312658081","https://openalex.org/W4312938727","https://openalex.org/W4312999114","https://openalex.org/W4382467086","https://openalex.org/W4385245566","https://openalex.org/W4386065852","https://openalex.org/W4386072365","https://openalex.org/W4386526950","https://openalex.org/W4390871944","https://openalex.org/W4390872434","https://openalex.org/W4402671548","https://openalex.org/W6600983433","https://openalex.org/W6631190155","https://openalex.org/W6682864246","https://openalex.org/W6729814214","https://openalex.org/W6739622702","https://openalex.org/W6754337694","https://openalex.org/W6766582784","https://openalex.org/W6766904570","https://openalex.org/W6778883912","https://openalex.org/W6784184991","https://openalex.org/W6784333009","https://openalex.org/W6790019176","https://openalex.org/W6790690058","https://openalex.org/W6791353385","https://openalex.org/W6796716886","https://openalex.org/W6797148833","https://openalex.org/W6801013943","https://openalex.org/W6801567822","https://openalex.org/W6803545775","https://openalex.org/W6803872405","https://openalex.org/W6803953248","https://openalex.org/W6810042059","https://openalex.org/W6810090734","https://openalex.org/W6810471921","https://openalex.org/W6811072154","https://openalex.org/W6839276459","https://openalex.org/W6843148291","https://openalex.org/W6849177959","https://openalex.org/W6955071965"],"related_works":["https://openalex.org/W2058170566","https://openalex.org/W2755342338","https://openalex.org/W2772917594","https://openalex.org/W2775347418","https://openalex.org/W2166024367","https://openalex.org/W3116076068","https://openalex.org/W2229312674","https://openalex.org/W2951359407","https://openalex.org/W2079911747","https://openalex.org/W1969923398"],"abstract_inverted_index":{"Understanding":[0],"videos,":[1],"especially":[2],"aligning":[3],"them":[4],"with":[5],"textual":[6],"data,":[7],"presents":[8],"a":[9,60,145],"significant":[10],"challenge":[11],"in":[12,26,37,102,182],"computer":[13],"vision.":[14],"The":[15],"advent":[16],"of":[17,77,88],"vision-language":[18,89],"models":[19,80,90,95],"(VLMs)":[20,91],"like":[21,82],"CLIP":[22],"has":[23],"sparked":[24],"interest":[25],"leveraging":[27],"their":[28],"capabilities":[29],"for":[30],"enhanced":[31],"video":[32,52,69,99,137,153,166,189],"understanding,":[33],"showing":[34],"marked":[35],"advancements":[36],"both":[38],"performance":[39],"and":[40,92,129,138,165],"efficiency.":[41],"However,":[42],"current":[43],"methods":[44],"often":[45],"neglect":[46],"vital":[47],"user-generated":[48],"metadata":[49],"such":[50],"as":[51],"titles.":[53],"In":[54],"this":[55],"paper,":[56],"we":[57,73],"present":[58],"Cap4Video++,":[59],"universal":[61],"framework":[62],"that":[63],"leverages":[64],"auxiliary":[65],"captions":[66,186],"to":[67,97,113,135,187],"enrich":[68],"understanding.":[70,190],"More":[71],"recently,":[72],"witness":[74],"the":[75,86,151],"flourishing":[76],"large":[78,93],"language":[79,94],"(LLMs)":[81,96],"ChatGPT.":[83],"Cap4Video++":[84],"harnesses":[85],"synergy":[87],"generate":[98],"captions,":[100,118],"utilized":[101],"three":[103],"key":[104],"phases:":[105],"(i)":[106],"Input":[107],"stage":[108,124,143],"employs":[109],"Semantic":[110],"Pair":[111],"Sampling":[112],"extract":[114],"beneficial":[115],"samples":[116],"from":[117],"aiding":[119],"contrastive":[120],"learning.":[121],"(ii)":[122],"Intermediate":[123],"sees":[125],"Video-Caption":[126],"Cross-modal":[127],"Interaction":[128],"Adaptive":[130],"Caption":[131],"Selection":[132],"work":[133],"together":[134],"bolster":[136],"caption":[139],"representations.":[140],"(iii)":[141],"Output":[142],"introduces":[144],"Complementary":[146],"Caption-Text":[147],"Matching":[148],"branch,":[149],"enhancing":[150],"primary":[152],"branch":[154],"by":[155],"improving":[156],"similarity":[157],"calculations.":[158],"Our":[159],"comprehensive":[160],"experiments":[161],"on":[162],"text-video":[163],"retrieval":[164],"action":[167],"recognition":[168],"across":[169],"nine":[170],"benchmarks":[171],"clearly":[172],"demonstrate":[173],"Cap4Video++'s":[174],"superiority":[175],"over":[176],"existing":[177],"models,":[178],"highlighting":[179],"its":[180],"effectiveness":[181],"utilizing":[183],"automatically":[184],"generated":[185],"advance":[188]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":2}],"updated_date":"2026-03-27T05:58:40.876381","created_date":"2025-10-10T00:00:00"}
