{"id":"https://openalex.org/W4205312796","doi":"https://doi.org/10.1109/vcip53242.2021.9675348","title":"MAPS: Joint Multimodal Attention and POS Sequence Generation for Video Captioning","display_name":"MAPS: Joint Multimodal Attention and POS Sequence Generation for Video Captioning","publication_year":2021,"publication_date":"2021-12-05","ids":{"openalex":"https://openalex.org/W4205312796","doi":"https://doi.org/10.1109/vcip53242.2021.9675348"},"language":"en","primary_location":{"id":"doi:10.1109/vcip53242.2021.9675348","is_oa":false,"landing_page_url":"https://doi.org/10.1109/vcip53242.2021.9675348","pdf_url":null,"source":{"id":"https://openalex.org/S4363608378","display_name":"2021 International Conference on Visual Communications and Image Processing (VCIP)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2021 International Conference on Visual Communications and Image Processing (VCIP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5063463108","display_name":"Cong Zou","orcid":"https://orcid.org/0000-0002-1319-7143"},"institutions":[{"id":"https://openalex.org/I74973139","display_name":"Carnegie Mellon University","ror":"https://ror.org/05x2bcf33","country_code":"US","type":"education","lineage":["https://openalex.org/I74973139"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Cong Zou","raw_affiliation_strings":["Carnegie Mellon University, Pittsburgh, PA, USA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Carnegie Mellon University, Pittsburgh, PA, USA","institution_ids":["https://openalex.org/I74973139"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101615940","display_name":"Xuchen Wang","orcid":"https://orcid.org/0000-0002-0119-3268"},"institutions":[{"id":"https://openalex.org/I79576946","display_name":"University of Pennsylvania","ror":"https://ror.org/00b30xv10","country_code":"US","type":"education","lineage":["https://openalex.org/I79576946"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Xuchen Wang","raw_affiliation_strings":["University of Pennsylvania, Philadelphia, PA, USA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"University of Pennsylvania, Philadelphia, PA, USA","institution_ids":["https://openalex.org/I79576946"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5086836003","display_name":"Yaosi Hu","orcid":"https://orcid.org/0000-0003-2784-6738"},"institutions":[{"id":"https://openalex.org/I37461747","display_name":"Wuhan University","ror":"https://ror.org/033vjfk17","country_code":"CN","type":"education","lineage":["https://openalex.org/I37461747"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yaosi Hu","raw_affiliation_strings":["Wuhan University, Wuhan, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Wuhan University, Wuhan, China","institution_ids":["https://openalex.org/I37461747"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5006748765","display_name":"Zhenzhong Chen","orcid":"https://orcid.org/0000-0002-7882-1066"},"institutions":[{"id":"https://openalex.org/I37461747","display_name":"Wuhan University","ror":"https://ror.org/033vjfk17","country_code":"CN","type":"education","lineage":["https://openalex.org/I37461747"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhenzhong Chen","raw_affiliation_strings":["Wuhan University, Wuhan, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Wuhan University, Wuhan, China","institution_ids":["https://openalex.org/I37461747"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100449791","display_name":"Shan Liu","orcid":"https://orcid.org/0000-0002-1442-1207"},"institutions":[{"id":"https://openalex.org/I70745867","display_name":"KLA (United States)","ror":"https://ror.org/02rqhpa98","country_code":"US","type":"company","lineage":["https://openalex.org/I70745867"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Shan Liu","raw_affiliation_strings":["Tencent America, Palo Alto, CA, USA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Tencent America, Palo Alto, CA, USA","institution_ids":["https://openalex.org/I70745867"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5063463108"],"corresponding_institution_ids":["https://openalex.org/I74973139"],"apc_list":null,"apc_paid":null,"fwci":0.1313,"has_fulltext":false,"cited_by_count":2,"citation_normalized_percentile":{"value":0.50272206,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":90,"max":94},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"5"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":0.9991999864578247,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11439","display_name":"Video Analysis and Summarization","score":0.9983000159263611,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/closed-captioning","display_name":"Closed captioning","score":0.986956000328064},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8809137344360352},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.5954923033714294},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.5937725901603699},{"id":"https://openalex.org/keywords/focus","display_name":"Focus (optics)","score":0.5591828227043152},{"id":"https://openalex.org/keywords/word","display_name":"Word (group theory)","score":0.5062898397445679},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.5012853145599365},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.4950982630252838},{"id":"https://openalex.org/keywords/syntax","display_name":"Syntax","score":0.4823857843875885},{"id":"https://openalex.org/keywords/hidden-markov-model","display_name":"Hidden Markov model","score":0.47895222902297974},{"id":"https://openalex.org/keywords/feature-extraction","display_name":"Feature extraction","score":0.4767228662967682},{"id":"https://openalex.org/keywords/discriminative-model","display_name":"Discriminative model","score":0.4535272717475891},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.4123665392398834},{"id":"https://openalex.org/keywords/image","display_name":"Image (mathematics)","score":0.11537376046180725}],"concepts":[{"id":"https://openalex.org/C157657479","wikidata":"https://www.wikidata.org/wiki/Q2367247","display_name":"Closed captioning","level":3,"score":0.986956000328064},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8809137344360352},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5954923033714294},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.5937725901603699},{"id":"https://openalex.org/C192209626","wikidata":"https://www.wikidata.org/wiki/Q190909","display_name":"Focus (optics)","level":2,"score":0.5591828227043152},{"id":"https://openalex.org/C90805587","wikidata":"https://www.wikidata.org/wiki/Q10944557","display_name":"Word (group theory)","level":2,"score":0.5062898397445679},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.5012853145599365},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.4950982630252838},{"id":"https://openalex.org/C60048249","wikidata":"https://www.wikidata.org/wiki/Q37437","display_name":"Syntax","level":2,"score":0.4823857843875885},{"id":"https://openalex.org/C23224414","wikidata":"https://www.wikidata.org/wiki/Q176769","display_name":"Hidden Markov model","level":2,"score":0.47895222902297974},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.4767228662967682},{"id":"https://openalex.org/C97931131","wikidata":"https://www.wikidata.org/wiki/Q5282087","display_name":"Discriminative model","level":2,"score":0.4535272717475891},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.4123665392398834},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.11537376046180725},{"id":"https://openalex.org/C120665830","wikidata":"https://www.wikidata.org/wiki/Q14620","display_name":"Optics","level":1,"score":0.0},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.0},{"id":"https://openalex.org/C13280743","wikidata":"https://www.wikidata.org/wiki/Q131089","display_name":"Geodesy","level":1,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C205649164","wikidata":"https://www.wikidata.org/wiki/Q1071","display_name":"Geography","level":0,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/vcip53242.2021.9675348","is_oa":false,"landing_page_url":"https://doi.org/10.1109/vcip53242.2021.9675348","pdf_url":null,"source":{"id":"https://openalex.org/S4363608378","display_name":"2021 International Conference on Visual Communications and Image Processing (VCIP)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2021 International Conference on Visual Communications and Image Processing (VCIP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Quality Education","score":0.5699999928474426,"id":"https://metadata.un.org/sdg/4"}],"awards":[],"funders":[{"id":"https://openalex.org/F4320316083","display_name":"Tencent","ror":"https://ror.org/00hhjss72"}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":28,"referenced_works":["https://openalex.org/W639708223","https://openalex.org/W1522301498","https://openalex.org/W1522734439","https://openalex.org/W1586939924","https://openalex.org/W1601567445","https://openalex.org/W1889081078","https://openalex.org/W1996430422","https://openalex.org/W2130942839","https://openalex.org/W2139501017","https://openalex.org/W2164290393","https://openalex.org/W2425121537","https://openalex.org/W2554906389","https://openalex.org/W2739107216","https://openalex.org/W2951390634","https://openalex.org/W2962681491","https://openalex.org/W2963971014","https://openalex.org/W2964241990","https://openalex.org/W2964350391","https://openalex.org/W2984862483","https://openalex.org/W3034221024","https://openalex.org/W6620707391","https://openalex.org/W6631190155","https://openalex.org/W6639432524","https://openalex.org/W6679436768","https://openalex.org/W6680145277","https://openalex.org/W6684090549","https://openalex.org/W6694260854","https://openalex.org/W6779369746"],"related_works":["https://openalex.org/W4210416330","https://openalex.org/W3088136942","https://openalex.org/W2949362007","https://openalex.org/W2775506363","https://openalex.org/W4290852288","https://openalex.org/W4388893791","https://openalex.org/W4283207562","https://openalex.org/W2963177403","https://openalex.org/W2330246314","https://openalex.org/W2167155152"],"abstract_inverted_index":{"Video":[0],"captioning":[1,21,168],"is":[2],"considered":[3],"to":[4,8,66],"be":[5],"challenging":[6],"due":[7],"the":[9,36,74,87,105,112,130,138,149,166],"combination":[10],"of":[11,28,42,115],"video":[12,20,70,141,167],"understanding":[13],"and":[14,32,39,78,96,102,126,142],"text":[15,143],"generation.":[16],"Recent":[17],"progress":[18],"in":[19,50,86],"has":[22],"been":[23],"made":[24],"mainly":[25],"using":[26],"methods":[27],"visual":[29],"feature":[30,151],"extraction":[31],"sequential":[33],"learning.":[34],"However,":[35],"syntax":[37],"structure":[38],"semantic":[40],"consistency":[41],"generated":[43],"captions":[44],"are":[45,82,99],"not":[46,134],"fully":[47,136],"explored.":[48],"Thus,":[49,162],"our":[51,163],"work,":[52],"we":[53],"propose":[54],"a":[55,155,158],"novel":[56],"multimodal":[57],"attention":[58,108],"based":[59],"framework":[60],"with":[61,104,157],"Part-of-Speech":[62],"(POS)":[63],"sequence":[64,76,80],"guidance":[65],"generate":[67],"more":[68],"accu-rate":[69],"captions.":[71,175],"In":[72],"general,":[73],"word":[75,156],"generation":[77],"POS":[79,106,160],"prediction":[81,116],"hierarchically":[83],"jointly":[84],"modeled":[85],"framework.":[88],"Specifically,":[89],"different":[90],"modalities":[91],"including":[92],"visual,":[93],"motion,":[94],"object":[95],"syntactic":[97],"features":[98],"adaptively":[100],"weighted":[101],"fused":[103],"guided":[107],"mechanism":[109],"when":[110,153],"computing":[111],"probability":[113],"distributions":[114],"words.":[117],"Experimental":[118],"results":[119],"on":[120,148],"two":[121],"benchmark":[122],"datasets,":[123],"i.e.":[124],"MSVD":[125],"MSR-VTT,":[127],"demonstrate":[128],"that":[129],"proposed":[131],"method":[132],"can":[133],"only":[135],"exploit":[137],"information":[139],"from":[140],"content,":[144],"but":[145],"also":[146],"focus":[147],"decisive":[150],"modality":[152],"generating":[154,173],"certain":[159],"type.":[161],"approach":[164],"boosts":[165],"performance":[169],"as":[170,172],"well":[171],"idiomatic":[174]},"counts_by_year":[{"year":2024,"cited_by_count":1},{"year":2022,"cited_by_count":1}],"updated_date":"2026-04-28T14:05:53.105641","created_date":"2025-10-10T00:00:00"}
