{"id":"https://openalex.org/W4404562751","doi":"https://doi.org/10.1109/tcsvt.2024.3502621","title":"Learning Comprehensive Visual Grounding for Video Captioning","display_name":"Learning Comprehensive Visual Grounding for Video Captioning","publication_year":2024,"publication_date":"2024-11-20","ids":{"openalex":"https://openalex.org/W4404562751","doi":"https://doi.org/10.1109/tcsvt.2024.3502621"},"language":"en","primary_location":{"id":"doi:10.1109/tcsvt.2024.3502621","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tcsvt.2024.3502621","pdf_url":null,"source":{"id":"https://openalex.org/S115173108","display_name":"IEEE Transactions on Circuits and Systems for Video Technology","issn_l":"1051-8215","issn":["1051-8215","1558-2205"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Circuits and Systems for Video Technology","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5030344445","display_name":"Wenhui Jiang","orcid":"https://orcid.org/0000-0002-4144-6725"},"institutions":[{"id":"https://openalex.org/I59649739","display_name":"Jiangxi University of Finance and Economics","ror":"https://ror.org/03efmyj29","country_code":"CN","type":"education","lineage":["https://openalex.org/I59649739"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Wenhui Jiang","raw_affiliation_strings":["School of Computing and Artificial Intelligence, Jiangxi University of Finance and Economics, Nanchang, China"],"affiliations":[{"raw_affiliation_string":"School of Computing and Artificial Intelligence, Jiangxi University of Finance and Economics, Nanchang, China","institution_ids":["https://openalex.org/I59649739"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5113151771","display_name":"Linxin Liu","orcid":null},"institutions":[{"id":"https://openalex.org/I59649739","display_name":"Jiangxi University of Finance and Economics","ror":"https://ror.org/03efmyj29","country_code":"CN","type":"education","lineage":["https://openalex.org/I59649739"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Linxin Liu","raw_affiliation_strings":["School of Computing and Artificial Intelligence, Jiangxi University of Finance and Economics, Nanchang, China"],"affiliations":[{"raw_affiliation_string":"School of Computing and Artificial Intelligence, Jiangxi University of Finance and Economics, Nanchang, China","institution_ids":["https://openalex.org/I59649739"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5063013411","display_name":"Yuming Fang","orcid":"https://orcid.org/0000-0002-6946-3586"},"institutions":[{"id":"https://openalex.org/I59649739","display_name":"Jiangxi University of Finance and Economics","ror":"https://ror.org/03efmyj29","country_code":"CN","type":"education","lineage":["https://openalex.org/I59649739"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yuming Fang","raw_affiliation_strings":["School of Computing and Artificial Intelligence, Jiangxi University of Finance and Economics, Nanchang, China"],"affiliations":[{"raw_affiliation_string":"School of Computing and Artificial Intelligence, Jiangxi University of Finance and Economics, Nanchang, China","institution_ids":["https://openalex.org/I59649739"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101323065","display_name":"Yibo Cheng","orcid":null},"institutions":[{"id":"https://openalex.org/I59649739","display_name":"Jiangxi University of Finance and Economics","ror":"https://ror.org/03efmyj29","country_code":"CN","type":"education","lineage":["https://openalex.org/I59649739"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yibo Cheng","raw_affiliation_strings":["School of Computing and Artificial Intelligence, Jiangxi University of Finance and Economics, Nanchang, China"],"affiliations":[{"raw_affiliation_string":"School of Computing and Artificial Intelligence, Jiangxi University of Finance and Economics, Nanchang, China","institution_ids":["https://openalex.org/I59649739"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5047811387","display_name":"Yuxin Peng","orcid":"https://orcid.org/0000-0001-7658-3845"},"institutions":[{"id":"https://openalex.org/I20231570","display_name":"Peking University","ror":"https://ror.org/02v51f717","country_code":"CN","type":"education","lineage":["https://openalex.org/I20231570"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yuxin Peng","raw_affiliation_strings":["Wangxuan Institute of Computer Technology, Peking University, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Wangxuan Institute of Computer Technology, Peking University, Beijing, China","institution_ids":["https://openalex.org/I20231570"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100355762","display_name":"Yang Liu","orcid":"https://orcid.org/0000-0001-9982-9887"},"institutions":[{"id":"https://openalex.org/I4210124264","display_name":"Sany (China)","ror":"https://ror.org/023jrwe36","country_code":"CN","type":"company","lineage":["https://openalex.org/I4210124264"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yang Liu","raw_affiliation_strings":["SANY Heavy Industry Company Ltd., Beijing, China","SANY Heavy Industry Co., Ltd, Beijing, China"],"affiliations":[{"raw_affiliation_string":"SANY Heavy Industry Company Ltd., Beijing, China","institution_ids":["https://openalex.org/I4210124264"]},{"raw_affiliation_string":"SANY Heavy Industry Co., Ltd, Beijing, China","institution_ids":["https://openalex.org/I4210124264"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5030344445"],"corresponding_institution_ids":["https://openalex.org/I59649739"],"apc_list":null,"apc_paid":null,"fwci":0.98,"has_fulltext":false,"cited_by_count":4,"citation_normalized_percentile":{"value":0.77779541,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":96,"max":98},"biblio":{"volume":"35","issue":"4","first_page":"3355","last_page":"3367"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9972000122070312,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9972000122070312,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11439","display_name":"Video Analysis and Summarization","score":0.9905999898910522,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13310","display_name":"Subtitles and Audiovisual Media","score":0.9864000082015991,"subfield":{"id":"https://openalex.org/subfields/1203","display_name":"Language and Linguistics"},"field":{"id":"https://openalex.org/fields/12","display_name":"Arts and Humanities"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/closed-captioning","display_name":"Closed captioning","score":0.9225108623504639},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7851575613021851},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.5274959802627563},{"id":"https://openalex.org/keywords/computer-vision","display_name":"Computer vision","score":0.4881508946418762},{"id":"https://openalex.org/keywords/visualization","display_name":"Visualization","score":0.46748846769332886},{"id":"https://openalex.org/keywords/multimedia","display_name":"Multimedia","score":0.46464821696281433},{"id":"https://openalex.org/keywords/computer-graphics","display_name":"Computer graphics (images)","score":0.365379273891449},{"id":"https://openalex.org/keywords/human\u2013computer-interaction","display_name":"Human\u2013computer interaction","score":0.3322393298149109},{"id":"https://openalex.org/keywords/image","display_name":"Image (mathematics)","score":0.27608269453048706}],"concepts":[{"id":"https://openalex.org/C157657479","wikidata":"https://www.wikidata.org/wiki/Q2367247","display_name":"Closed captioning","level":3,"score":0.9225108623504639},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7851575613021851},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5274959802627563},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.4881508946418762},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.46748846769332886},{"id":"https://openalex.org/C49774154","wikidata":"https://www.wikidata.org/wiki/Q131765","display_name":"Multimedia","level":1,"score":0.46464821696281433},{"id":"https://openalex.org/C121684516","wikidata":"https://www.wikidata.org/wiki/Q7600677","display_name":"Computer graphics (images)","level":1,"score":0.365379273891449},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.3322393298149109},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.27608269453048706}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/tcsvt.2024.3502621","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tcsvt.2024.3502621","pdf_url":null,"source":{"id":"https://openalex.org/S115173108","display_name":"IEEE Transactions on Circuits and Systems for Video Technology","issn_l":"1051-8215","issn":["1051-8215","1558-2205"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Circuits and Systems for Video Technology","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[{"score":0.5699999928474426,"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4"}],"awards":[{"id":"https://openalex.org/G1583650166","display_name":null,"funder_award_id":"62132006","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G206987818","display_name":null,"funder_award_id":"62441203","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G2389567385","display_name":null,"funder_award_id":"20242BAB23012","funder_id":"https://openalex.org/F4320322665","funder_display_name":"Natural Science Foundation of Jiangxi Province"},{"id":"https://openalex.org/G3944859999","display_name":null,"funder_award_id":"62311530101","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G4341765762","display_name":null,"funder_award_id":"62461028","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G4364626826","display_name":null,"funder_award_id":"jxsq2023101092","funder_id":"https://openalex.org/F3692415409","funder_display_name":"Double Thousand Plan of Jiangxi Province"},{"id":"https://openalex.org/G5883946564","display_name":null,"funder_award_id":"62161013","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"}],"funders":[{"id":"https://openalex.org/F3692415409","display_name":"Double Thousand Plan of Jiangxi Province","ror":null},{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"},{"id":"https://openalex.org/F4320322665","display_name":"Natural Science Foundation of Jiangxi Province","ror":null}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":55,"referenced_works":["https://openalex.org/W1586939924","https://openalex.org/W1956340063","https://openalex.org/W2101105183","https://openalex.org/W2133459682","https://openalex.org/W2425121537","https://openalex.org/W2506483933","https://openalex.org/W2765658575","https://openalex.org/W2905145027","https://openalex.org/W2953461088","https://openalex.org/W2963351113","https://openalex.org/W2963524571","https://openalex.org/W2963630207","https://openalex.org/W2968101724","https://openalex.org/W2969557970","https://openalex.org/W3034221024","https://openalex.org/W3034655362","https://openalex.org/W3035323998","https://openalex.org/W3035763695","https://openalex.org/W3134875898","https://openalex.org/W3163971663","https://openalex.org/W3174441232","https://openalex.org/W3193767255","https://openalex.org/W3217340782","https://openalex.org/W4214692497","https://openalex.org/W4220790454","https://openalex.org/W4221166385","https://openalex.org/W4224920290","https://openalex.org/W4226396383","https://openalex.org/W4285118488","https://openalex.org/W4285265382","https://openalex.org/W4312321660","https://openalex.org/W4312560592","https://openalex.org/W4377235499","https://openalex.org/W4382464395","https://openalex.org/W4382467086","https://openalex.org/W4384161812","https://openalex.org/W4385768107","https://openalex.org/W4385825476","https://openalex.org/W4386066129","https://openalex.org/W4386075721","https://openalex.org/W4386076176","https://openalex.org/W4386234463","https://openalex.org/W4387385612","https://openalex.org/W4387934928","https://openalex.org/W4390871765","https://openalex.org/W4390874374","https://openalex.org/W4393158567","https://openalex.org/W4393159389","https://openalex.org/W4394597470","https://openalex.org/W4402727018","https://openalex.org/W4402780073","https://openalex.org/W6682631176","https://openalex.org/W6684090549","https://openalex.org/W6839745749","https://openalex.org/W6864544085"],"related_works":["https://openalex.org/W4210416330","https://openalex.org/W2775506363","https://openalex.org/W3088136942","https://openalex.org/W4290852288","https://openalex.org/W2949362007","https://openalex.org/W4283207562","https://openalex.org/W2963177403","https://openalex.org/W2330246314","https://openalex.org/W2949522393","https://openalex.org/W4289422896"],"abstract_inverted_index":{"The":[0,12,114,131],"grounding":[1,32,81,110,117,133,157,160,171,176,187,204],"accuracy":[2,45],"of":[3,14,100,107,220],"existing":[4,15],"video":[5,19,61,85,129],"captioners":[6],"is":[7,178],"still":[8],"behind":[9],"the":[10,43,53,68,94,104,119,136,142,175,199],"expectation.":[11],"majority":[13],"methods":[16],"perform":[17],"grounded":[18,26],"captioning":[20,27,44],"on":[21,30,52,124,211,223,227,232],"sparse":[22],"entity":[23,109,116,156],"annotations.":[24,102],"However,":[25],"models":[28,65],"rely":[29],"deliberate":[31],"annotations":[33,182],"as":[34,57,163],"supervision,":[35],"which":[36,145],"are":[37,161],"relatively":[38],"hard":[39],"to":[40,83,96,122,138,198,235],"obtain.":[41],"Moreover,":[42],"often":[46],"suffers":[47],"from":[48],"degenerated":[49],"object":[50],"appearances":[51],"annotated":[54],"area":[55],"such":[56],"motion":[58],"blur":[59],"and":[60,63,111,141,149,158,215,229],"defocus,":[62],"these":[64],"seldom":[66],"consider":[67],"complex":[69],"interactions":[70],"among":[71],"entities.":[72],"In":[73],"this":[74],"paper,":[75],"we":[76],"propose":[77],"a":[78,164,169,186],"comprehensive":[79],"visual":[80],"network":[82,105],"improve":[84],"captioning,":[86],"by":[87,168,180,185],"using":[88],"inexpensive":[89],"pseudo":[90,181],"annotation":[91,188,205],"while":[92],"avoiding":[93],"need":[95],"collect":[97],"large":[98],"amounts":[99],"manual":[101],"Specifically,":[103],"consists":[106],"spatial-temporal":[108],"action":[112,132,153,159],"grounding.":[113],"proposed":[115],"encourages":[118],"attention":[120],"mechanism":[121],"focus":[123],"informative":[125],"spatial":[126,148],"areas":[127],"across":[128],"frames.":[130],"dynamically":[134],"associates":[135],"verbs":[137],"related":[139],"subjects":[140],"corresponding":[143],"context,":[144],"keeps":[146],"fine-grained":[147],"temporal":[150],"details":[151],"for":[152],"prediction.":[154],"Both":[155],"formulated":[162],"unified":[165],"task":[166],"guided":[167],"soft":[170],"supervision.":[172],"More":[173],"importantly,":[174],"objective":[177],"supervised":[179],"automatically":[183],"produced":[184],"generation":[189],"module,":[190],"thus":[191],"our":[192],"model":[193],"can":[194],"be":[195],"easily":[196],"applied":[197],"challenging":[200],"dataset":[201],"without":[202],"any":[203],"provided.":[206],"We":[207],"conduct":[208],"extensive":[209],"experiments":[210],"three":[212],"benchmark":[213],"datasets":[214],"demonstrate":[216],"significant":[217],"performance":[218],"improvements":[219],"+2.4":[221],"CIDEr":[222,226,231],"MSR-VTT,":[224],"+4.7":[225],"MSVD,":[228],"+5.1":[230],"ActivityNet-Entities":[233],"compared":[234],"state-of-the-arts.":[236]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":3}],"updated_date":"2026-04-09T08:11:56.329763","created_date":"2025-10-10T00:00:00"}
