{"id":"https://openalex.org/W4391092623","doi":"https://doi.org/10.1109/tase.2024.3352903","title":"End-to-End Video Captioning Based on Multiview Semantic Alignment for Human\u2013Machine Fusion","display_name":"End-to-End Video Captioning Based on Multiview Semantic Alignment for Human\u2013Machine Fusion","publication_year":2024,"publication_date":"2024-01-22","ids":{"openalex":"https://openalex.org/W4391092623","doi":"https://doi.org/10.1109/tase.2024.3352903"},"language":"en","primary_location":{"id":"doi:10.1109/tase.2024.3352903","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tase.2024.3352903","pdf_url":null,"source":{"id":"https://openalex.org/S34881539","display_name":"IEEE Transactions on Automation Science and Engineering","issn_l":"1545-5955","issn":["1545-5955","1558-3783"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Automation Science and Engineering","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5066900102","display_name":"Shuai Wu","orcid":"https://orcid.org/0000-0001-7713-1808"},"institutions":[{"id":"https://openalex.org/I24943067","display_name":"Fudan University","ror":"https://ror.org/013q1eq08","country_code":"CN","type":"education","lineage":["https://openalex.org/I24943067"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Shuai Wu","raw_affiliation_strings":["School of Computer Science, Fudan University, Shanghai, China"],"affiliations":[{"raw_affiliation_string":"School of Computer Science, Fudan University, Shanghai, China","institution_ids":["https://openalex.org/I24943067"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5004359902","display_name":"Yubing Gao","orcid":null},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yubing Gao","raw_affiliation_strings":["Department of Automation, Shanghai Jiao Tong University, Shanghai, China"],"affiliations":[{"raw_affiliation_string":"Department of Automation, Shanghai Jiao Tong University, Shanghai, China","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101865874","display_name":"Weidong Yang","orcid":"https://orcid.org/0000-0002-6473-9272"},"institutions":[{"id":"https://openalex.org/I24943067","display_name":"Fudan University","ror":"https://ror.org/013q1eq08","country_code":"CN","type":"education","lineage":["https://openalex.org/I24943067"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Weidong Yang","raw_affiliation_strings":["School of Computer Science, Fudan University, Shanghai, China"],"affiliations":[{"raw_affiliation_string":"School of Computer Science, Fudan University, Shanghai, China","institution_ids":["https://openalex.org/I24943067"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101456272","display_name":"Hongkai Li","orcid":"https://orcid.org/0009-0003-2043-5415"},"institutions":[{"id":"https://openalex.org/I9842412","display_name":"Nanjing University of Aeronautics and Astronautics","ror":"https://ror.org/01scyh794","country_code":"CN","type":"education","lineage":["https://openalex.org/I9842412"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Hongkai Li","raw_affiliation_strings":["State Key Laboratory of Mechanics and Control for Aerospace Structures, Nanjing University of Aeronautics and Astronautics, Nanjing, China"],"affiliations":[{"raw_affiliation_string":"State Key Laboratory of Mechanics and Control for Aerospace Structures, Nanjing University of Aeronautics and Astronautics, Nanjing, China","institution_ids":["https://openalex.org/I9842412"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5074542180","display_name":"Guangyu Zhu","orcid":"https://orcid.org/0000-0001-9591-7564"},"institutions":[{"id":"https://openalex.org/I21193070","display_name":"Beijing Jiaotong University","ror":"https://ror.org/01yj56c84","country_code":"CN","type":"education","lineage":["https://openalex.org/I21193070"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Guangyu Zhu","raw_affiliation_strings":["Beijing Research Center of Urban Traffic Information Sensing and Service Technologies, Beijing Jiaotong University, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Beijing Research Center of Urban Traffic Information Sensing and Service Technologies, Beijing Jiaotong University, Beijing, China","institution_ids":["https://openalex.org/I21193070"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5066900102"],"corresponding_institution_ids":["https://openalex.org/I24943067"],"apc_list":null,"apc_paid":null,"fwci":1.3158,"has_fulltext":false,"cited_by_count":5,"citation_normalized_percentile":{"value":0.79702798,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":97,"max":98},"biblio":{"volume":"22","issue":null,"first_page":"4682","last_page":"4690"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9995999932289124,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9995999932289124,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11439","display_name":"Video Analysis and Summarization","score":0.9972000122070312,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":0.996999979019165,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/closed-captioning","display_name":"Closed captioning","score":0.9105796217918396},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6997487545013428},{"id":"https://openalex.org/keywords/computer-vision","display_name":"Computer vision","score":0.6347317099571228},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.6031189560890198},{"id":"https://openalex.org/keywords/fusion","display_name":"Fusion","score":0.5267072319984436},{"id":"https://openalex.org/keywords/sensor-fusion","display_name":"Sensor fusion","score":0.46673712134361267},{"id":"https://openalex.org/keywords/end-to-end-principle","display_name":"End-to-end principle","score":0.45221248269081116},{"id":"https://openalex.org/keywords/image-fusion","display_name":"Image fusion","score":0.4120023846626282},{"id":"https://openalex.org/keywords/computer-graphics","display_name":"Computer graphics (images)","score":0.3520142436027527},{"id":"https://openalex.org/keywords/image","display_name":"Image (mathematics)","score":0.1662745475769043}],"concepts":[{"id":"https://openalex.org/C157657479","wikidata":"https://www.wikidata.org/wiki/Q2367247","display_name":"Closed captioning","level":3,"score":0.9105796217918396},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6997487545013428},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.6347317099571228},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6031189560890198},{"id":"https://openalex.org/C158525013","wikidata":"https://www.wikidata.org/wiki/Q2593739","display_name":"Fusion","level":2,"score":0.5267072319984436},{"id":"https://openalex.org/C33954974","wikidata":"https://www.wikidata.org/wiki/Q486494","display_name":"Sensor fusion","level":2,"score":0.46673712134361267},{"id":"https://openalex.org/C74296488","wikidata":"https://www.wikidata.org/wiki/Q2527392","display_name":"End-to-end principle","level":2,"score":0.45221248269081116},{"id":"https://openalex.org/C69744172","wikidata":"https://www.wikidata.org/wiki/Q860822","display_name":"Image fusion","level":3,"score":0.4120023846626282},{"id":"https://openalex.org/C121684516","wikidata":"https://www.wikidata.org/wiki/Q7600677","display_name":"Computer graphics (images)","level":1,"score":0.3520142436027527},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.1662745475769043},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/tase.2024.3352903","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tase.2024.3352903","pdf_url":null,"source":{"id":"https://openalex.org/S34881539","display_name":"IEEE Transactions on Automation Science and Engineering","issn_l":"1545-5955","issn":["1545-5955","1558-3783"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Automation Science and Engineering","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":47,"referenced_works":["https://openalex.org/W1956340063","https://openalex.org/W2101105183","https://openalex.org/W2108325777","https://openalex.org/W2133512280","https://openalex.org/W2194775991","https://openalex.org/W2739107216","https://openalex.org/W2885138528","https://openalex.org/W2905145027","https://openalex.org/W2951390634","https://openalex.org/W2952132648","https://openalex.org/W2953461088","https://openalex.org/W2962934715","https://openalex.org/W2963524571","https://openalex.org/W2981411942","https://openalex.org/W2993346046","https://openalex.org/W2998166190","https://openalex.org/W3019301826","https://openalex.org/W3022580094","https://openalex.org/W3034221024","https://openalex.org/W3034730770","https://openalex.org/W3104862079","https://openalex.org/W3126721948","https://openalex.org/W3171688991","https://openalex.org/W3176689360","https://openalex.org/W3209229003","https://openalex.org/W3214654990","https://openalex.org/W3217340782","https://openalex.org/W3217578129","https://openalex.org/W4214612132","https://openalex.org/W4214692497","https://openalex.org/W4214862245","https://openalex.org/W4220790454","https://openalex.org/W4226162294","https://openalex.org/W4226269437","https://openalex.org/W4226494925","https://openalex.org/W4285221484","https://openalex.org/W4285606530","https://openalex.org/W4289812641","https://openalex.org/W4312463400","https://openalex.org/W4312560592","https://openalex.org/W4386066385","https://openalex.org/W6620707391","https://openalex.org/W6755207826","https://openalex.org/W6769627184","https://openalex.org/W6791353385","https://openalex.org/W6795711426","https://openalex.org/W6849520326"],"related_works":["https://openalex.org/W2132659060","https://openalex.org/W2031992971","https://openalex.org/W2788731446","https://openalex.org/W2204403038","https://openalex.org/W3214791684","https://openalex.org/W3152170969","https://openalex.org/W2139242969","https://openalex.org/W2379054866","https://openalex.org/W2549658594","https://openalex.org/W2353265673"],"abstract_inverted_index":{"Video":[0],"captioning":[1,49,108,224,324],"can":[2,149,208,259,308,327],"understand":[3,151,261,309],"videos,":[4],"provide":[5],"decision-makers":[6],"with":[7,45,72,104],"user-friendly":[8],"natural":[9,252],"language":[10,253],"narration,":[11],"alleviate":[12],"the":[13,46,68,73,87,105,111,147,166,169,184,215,222,249,256,262,275,281,290,310,315,329,333],"gap":[14],"between":[15],"man":[16],"and":[17,19,37,59,77,138,143,154,178,191,207,230,270,284,322,336],"machine,":[18],"promote":[20,227,271],"human-machine":[21,101,228,272],"interaction.":[22],"Therefore,":[23,293],"it":[24,218,266],"has":[25,201,285,337],"good":[26],"application":[27,330,339],"prospects":[28],"in":[29,233,319],"emergency":[30,234],"response":[31],"scenarios,":[32],"such":[33],"as":[34,123],"aerial":[35,189],"refueling":[36,190],"assisted":[38],"driving.":[39,193],"However,":[40,274],"there":[41,60],"are":[42,53,61],"two":[43,158,185],"problems":[44],"current":[47],"video":[48,76,96,107,121,137,152,204,212,223,250,282,291,299,311,317],"methods:":[50],"1)":[51,110],"they":[52],"mainly":[54],"oriented":[55],"to":[56,160,164,239,247,267,279],"general":[57],"domains,":[58],"few":[62],"studies":[63],"on":[64,181,183,303],"industrial":[65,334],"applications;":[66],"2)":[67,131],"methods":[69],"only":[70],"interact":[71],"semantics":[74],"of":[75,136,168,176,188,243,289,332],"text":[78,139,156],"from":[79,140,314],"a":[80,93,174,202],"single":[81],"view":[82],"(tokens":[83],"or":[84],"sentences).":[85,161],"For":[86],"above":[88],"problems,":[89],"this":[90,244,294],"paper":[91,245,295],"proposes":[92,296],"multi-view":[94,304],"end-to-end":[95,116,298],"caption":[97],"(MVVC)":[98],"method":[99,200,277,301],"for":[100,128],"fusion.":[102,273],"Compared":[103],"previous":[106],"methods,":[109],"MVVC":[112,182],"model":[113,117,148],"is":[114,246],"an":[115,286,297],"which":[118,307],"directly":[119,313],"takes":[120],"frames":[122],"input":[124],"without":[125],"object":[126],"detection":[127],"each":[129],"frame;":[130],"we":[132,172],"perform":[133],"cross-modal":[134],"interaction":[135],"both":[141],"local":[142],"global":[144],"views.":[145],"So":[146],"simultaneously":[150],"content":[153,312],"generate":[155,209],"at":[157],"granularities(tokens":[159],"In":[162],"order":[163],"verify":[165],"performance":[167],"new":[170,199],"model,":[171],"conducted":[173],"series":[175],"comparative":[177],"ablation":[179],"experiments":[180,195],"data":[186],"sets":[187],"automatic":[192],"The":[194],"show":[196],"that":[197,221,255],"our":[198],"stronger":[203],"understanding":[205,288],"ability":[206],"more":[210],"accurate":[211],"descriptions.":[213],"At":[214],"same":[216],"time,":[217],"also":[219],"verified":[220],"task":[225],"could":[226],"fusion":[229],"assist":[231],"decision-making":[232],"scenarios.":[235],"<italic":[236],"xmlns:mml=\"http://www.w3.org/1998/Math/MathML\"":[237],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">Note":[238],"Practitioners</i>":[240],"\u2014The":[241],"motivation":[242],"convert":[248],"into":[251],"so":[254],"autonomous":[257],"system":[258],"automatically":[260],"observed":[263],"scene,":[264],"describe":[265],"relevant":[268],"stakeholders,":[269],"traditional":[276],"needs":[278],"process":[280],"offline":[283],"insufficient":[287],"content.":[292],"capture":[300],"based":[302],"semantic":[305],"alignment,":[306],"original":[316],"pixels":[318],"real":[320],"time":[321],"improve":[323],"accuracy.":[325],"It":[326],"meet":[328],"requirements":[331],"field":[335],"practical":[338],"value.":[340]},"counts_by_year":[{"year":2025,"cited_by_count":5}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
