{"id":"https://openalex.org/W4389776363","doi":"https://doi.org/10.1109/tcsvt.2023.3343520","title":"Cascade Semantic Prompt Alignment Network for Image Captioning","display_name":"Cascade Semantic Prompt Alignment Network for Image Captioning","publication_year":2023,"publication_date":"2023-12-15","ids":{"openalex":"https://openalex.org/W4389776363","doi":"https://doi.org/10.1109/tcsvt.2023.3343520"},"language":"en","primary_location":{"id":"doi:10.1109/tcsvt.2023.3343520","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tcsvt.2023.3343520","pdf_url":null,"source":{"id":"https://openalex.org/S115173108","display_name":"IEEE Transactions on Circuits and Systems for Video Technology","issn_l":"1051-8215","issn":["1051-8215","1558-2205"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Circuits and Systems for Video Technology","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Jingyu Li","orcid":"https://orcid.org/0000-0002-9561-7550"},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Jingyu Li","raw_affiliation_strings":["School of Cyberspace Science and Technology, University of Science and Technology of China, Hefei, China"],"raw_orcid":"https://orcid.org/0000-0002-9561-7550","affiliations":[{"raw_affiliation_string":"School of Cyberspace Science and Technology, University of Science and Technology of China, Hefei, China","institution_ids":["https://openalex.org/I126520041"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100616189","display_name":"Lei Zhang","orcid":"https://orcid.org/0000-0002-2839-8693"},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Lei Zhang","raw_affiliation_strings":["School of Information Science and Technology, University of Science and Technology of China, Hefei, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"School of Information Science and Technology, University of Science and Technology of China, Hefei, China","institution_ids":["https://openalex.org/I126520041"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100342380","display_name":"Kun Zhang","orcid":"https://orcid.org/0000-0003-2140-2546"},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Kun Zhang","raw_affiliation_strings":["School of Information Science and Technology, University of Science and Technology of China, Hefei, China"],"raw_orcid":"https://orcid.org/0000-0003-2140-2546","affiliations":[{"raw_affiliation_string":"School of Information Science and Technology, University of Science and Technology of China, Hefei, China","institution_ids":["https://openalex.org/I126520041"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5081768707","display_name":"Bo Hu","orcid":"https://orcid.org/0000-0002-6540-5119"},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Bo Hu","raw_affiliation_strings":["School of Information Science and Technology, University of Science and Technology of China, Hefei, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"School of Information Science and Technology, University of Science and Technology of China, Hefei, China","institution_ids":["https://openalex.org/I126520041"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5078162380","display_name":"Hongtao Xie","orcid":"https://orcid.org/0000-0002-6249-5315"},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Hongtao Xie","raw_affiliation_strings":["School of Information Science and Technology, University of Science and Technology of China, Hefei, China"],"raw_orcid":"https://orcid.org/0000-0002-6249-5315","affiliations":[{"raw_affiliation_string":"School of Information Science and Technology, University of Science and Technology of China, Hefei, China","institution_ids":["https://openalex.org/I126520041"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5023341829","display_name":"Zhendong Mao","orcid":"https://orcid.org/0000-0001-5739-8126"},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhendong Mao","raw_affiliation_strings":["School of Cyberspace Science and Technology, University of Science and Technology of China, Hefei, China","School of Cyberspace Science and Technology, University of Science and Technology of China, and the Institute of Artificial Intelligence, Hefei Comprehensive National Science Center, Hefei, China"],"raw_orcid":"https://orcid.org/0000-0001-5739-8126","affiliations":[{"raw_affiliation_string":"School of Cyberspace Science and Technology, University of Science and Technology of China, Hefei, China","institution_ids":["https://openalex.org/I126520041"]},{"raw_affiliation_string":"School of Cyberspace Science and Technology, University of Science and Technology of China, and the Institute of Artificial Intelligence, Hefei Comprehensive National Science Center, Hefei, China","institution_ids":["https://openalex.org/I126520041"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":6,"corresponding_author_ids":[],"corresponding_institution_ids":["https://openalex.org/I126520041"],"apc_list":null,"apc_paid":null,"fwci":1.6485,"has_fulltext":false,"cited_by_count":14,"citation_normalized_percentile":{"value":0.86498656,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":96,"max":99},"biblio":{"volume":"34","issue":"7","first_page":"5266","last_page":"5281"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":0.9667999744415283,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.9513999819755554,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8601820468902588},{"id":"https://openalex.org/keywords/closed-captioning","display_name":"Closed captioning","score":0.8369436860084534},{"id":"https://openalex.org/keywords/semantic-compression","display_name":"Semantic compression","score":0.7286508083343506},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.6900041103363037},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.6436458826065063},{"id":"https://openalex.org/keywords/semantic-computing","display_name":"Semantic computing","score":0.6246026158332825},{"id":"https://openalex.org/keywords/semantics","display_name":"Semantics (computer science)","score":0.5434733629226685},{"id":"https://openalex.org/keywords/natural-language","display_name":"Natural language","score":0.5282052159309387},{"id":"https://openalex.org/keywords/leverage","display_name":"Leverage (statistics)","score":0.49729397892951965},{"id":"https://openalex.org/keywords/object","display_name":"Object (grammar)","score":0.4587462544441223},{"id":"https://openalex.org/keywords/object-detection","display_name":"Object detection","score":0.4109709858894348},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.33459728956222534},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.3136964440345764},{"id":"https://openalex.org/keywords/image","display_name":"Image (mathematics)","score":0.23323547840118408},{"id":"https://openalex.org/keywords/semantic-technology","display_name":"Semantic technology","score":0.21942192316055298},{"id":"https://openalex.org/keywords/programming-language","display_name":"Programming language","score":0.08138695359230042}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8601820468902588},{"id":"https://openalex.org/C157657479","wikidata":"https://www.wikidata.org/wiki/Q2367247","display_name":"Closed captioning","level":3,"score":0.8369436860084534},{"id":"https://openalex.org/C202708506","wikidata":"https://www.wikidata.org/wiki/Q7449050","display_name":"Semantic compression","level":5,"score":0.7286508083343506},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.6900041103363037},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6436458826065063},{"id":"https://openalex.org/C511149849","wikidata":"https://www.wikidata.org/wiki/Q7449051","display_name":"Semantic computing","level":3,"score":0.6246026158332825},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.5434733629226685},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.5282052159309387},{"id":"https://openalex.org/C153083717","wikidata":"https://www.wikidata.org/wiki/Q6535263","display_name":"Leverage (statistics)","level":2,"score":0.49729397892951965},{"id":"https://openalex.org/C2781238097","wikidata":"https://www.wikidata.org/wiki/Q175026","display_name":"Object (grammar)","level":2,"score":0.4587462544441223},{"id":"https://openalex.org/C2776151529","wikidata":"https://www.wikidata.org/wiki/Q3045304","display_name":"Object detection","level":3,"score":0.4109709858894348},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.33459728956222534},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.3136964440345764},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.23323547840118408},{"id":"https://openalex.org/C6881194","wikidata":"https://www.wikidata.org/wiki/Q7449091","display_name":"Semantic technology","level":4,"score":0.21942192316055298},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.08138695359230042},{"id":"https://openalex.org/C2129575","wikidata":"https://www.wikidata.org/wiki/Q54837","display_name":"Semantic Web","level":2,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/tcsvt.2023.3343520","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tcsvt.2023.3343520","pdf_url":null,"source":{"id":"https://openalex.org/S115173108","display_name":"IEEE Transactions on Circuits and Systems for Video Technology","issn_l":"1051-8215","issn":["1051-8215","1558-2205"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Circuits and Systems for Video Technology","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G8374609628","display_name":null,"funder_award_id":"U19A2057","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G8802072536","display_name":null,"funder_award_id":"62222212","funder_id":"https://openalex.org/F4320333688","funder_display_name":"National Outstanding Youth Science Fund Project of National Natural Science Foundation of China"}],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"},{"id":"https://openalex.org/F4320333688","display_name":"National Outstanding Youth Science Fund Project of National Natural Science Foundation of China","ror":null}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":109,"referenced_works":["https://openalex.org/W1861492603","https://openalex.org/W1905882502","https://openalex.org/W1956340063","https://openalex.org/W2101105183","https://openalex.org/W2133512280","https://openalex.org/W2154652894","https://openalex.org/W2187089797","https://openalex.org/W2277195237","https://openalex.org/W2506483933","https://openalex.org/W2745461083","https://openalex.org/W2890531016","https://openalex.org/W2896457183","https://openalex.org/W2904565150","https://openalex.org/W2963084599","https://openalex.org/W2963101956","https://openalex.org/W2966162142","https://openalex.org/W2981165461","https://openalex.org/W2983943451","https://openalex.org/W2986670728","https://openalex.org/W3006871679","https://openalex.org/W3018388102","https://openalex.org/W3034655362","https://openalex.org/W3035284526","https://openalex.org/W3035323998","https://openalex.org/W3091588028","https://openalex.org/W3092462694","https://openalex.org/W3093374703","https://openalex.org/W3096609285","https://openalex.org/W3104279398","https://openalex.org/W3134272453","https://openalex.org/W3136792391","https://openalex.org/W3138516171","https://openalex.org/W3153469116","https://openalex.org/W3166396011","https://openalex.org/W3167939936","https://openalex.org/W3173220247","https://openalex.org/W3174377922","https://openalex.org/W3174396556","https://openalex.org/W3175824375","https://openalex.org/W3181158454","https://openalex.org/W3185341429","https://openalex.org/W3191818374","https://openalex.org/W3193402170","https://openalex.org/W3195680250","https://openalex.org/W3205607545","https://openalex.org/W3205765769","https://openalex.org/W3214007456","https://openalex.org/W3214685499","https://openalex.org/W4206982465","https://openalex.org/W4221143761","https://openalex.org/W4221147537","https://openalex.org/W4225517085","https://openalex.org/W4282968790","https://openalex.org/W4283271696","https://openalex.org/W4283793362","https://openalex.org/W4285186657","https://openalex.org/W4285197287","https://openalex.org/W4285602612","https://openalex.org/W4285606316","https://openalex.org/W4288083516","https://openalex.org/W4289785045","https://openalex.org/W4296406182","https://openalex.org/W4304080274","https://openalex.org/W4304092583","https://openalex.org/W4307079201","https://openalex.org/W4307106676","https://openalex.org/W4308503721","https://openalex.org/W4312289196","https://openalex.org/W4312563197","https://openalex.org/W4312563428","https://openalex.org/W4312873085","https://openalex.org/W4312922092","https://openalex.org/W4312924260","https://openalex.org/W4312956471","https://openalex.org/W4313131769","https://openalex.org/W4313680273","https://openalex.org/W4318718936","https://openalex.org/W4319777846","https://openalex.org/W4320458302","https://openalex.org/W4320487288","https://openalex.org/W4323570346","https://openalex.org/W4327810635","https://openalex.org/W4366352717","https://openalex.org/W4376312115","https://openalex.org/W4382240730","https://openalex.org/W4386072307","https://openalex.org/W4386566587","https://openalex.org/W6682631176","https://openalex.org/W6755207826","https://openalex.org/W6763643401","https://openalex.org/W6778883912","https://openalex.org/W6784094891","https://openalex.org/W6791353385","https://openalex.org/W6798350552","https://openalex.org/W6799822930","https://openalex.org/W6800139874","https://openalex.org/W6803567076","https://openalex.org/W6804095316","https://openalex.org/W6809536152","https://openalex.org/W6841366371","https://openalex.org/W6842585177","https://openalex.org/W6846404632","https://openalex.org/W6847076894","https://openalex.org/W6848509895","https://openalex.org/W6849177959","https://openalex.org/W6850025790","https://openalex.org/W6850204008","https://openalex.org/W6851149231","https://openalex.org/W6853116092"],"related_works":["https://openalex.org/W2026635458","https://openalex.org/W3009270862","https://openalex.org/W2018066836","https://openalex.org/W3012445987","https://openalex.org/W3134365128","https://openalex.org/W2038882528","https://openalex.org/W2277021700","https://openalex.org/W2035647422","https://openalex.org/W3009520654","https://openalex.org/W2250607250"],"abstract_inverted_index":{"Image":[0],"captioning":[1,289],"(IC)":[2],"takes":[3],"an":[4,29,96,151],"image":[5,134],"as":[6],"input":[7],"and":[8,37,61,68,93,103,116,159,178,193,207,239,253,273],"generates":[9],"open-form":[10,97],"descriptions":[11],"in":[12,42,81,106,267],"the":[13,20,32,35,39,72,76,107,118,123,129,174,199,213,223,248,259,284],"domain":[14,108],"of":[15,22,25,31,34,71,100,109,242,279],"natural":[16,110],"language.":[17,111],"IC":[18],"requires":[19],"detection":[21,59],"objects,":[23],"modeling":[24],"relations":[26,200],"between":[27,64,201],"them,":[28],"assessment":[30],"semantics":[33],"scene":[36],"representing":[38],"extracted":[40],"knowledge":[41,88,121],"a":[43,141,231,276],"language":[44,120,214],"space.":[45],"Previous":[46],"detector-based":[47],"models":[48,84],"suffer":[49],"from":[50,122],"limited":[51],"semantic":[52,62,98,162,176,183,195],"perception":[53],"capability":[54],"due":[55],"to":[56,114,127,149,167,181,197,286],"predefined":[57],"object":[58,175,185,191,206],"classes":[60],"inconsistency":[63],"visual":[65,101],"region":[66,168,179],"features":[67,180,192,209],"numeric":[69],"labels":[70],"detector.":[73],"Inspired":[74],"by":[75],"fact":[77],"that":[78,227],"text":[79],"prompts":[80],"pre-trained":[82,124],"multi-modal":[83],"contain":[85],"specific":[86],"linguistic":[87],"rather":[89],"than":[90],"discrete":[91],"labels,":[92],"excel":[94],"at":[95,302],"understanding":[99],"inputs":[102],"their":[104],"representation":[105],"We":[112],"aim":[113],"distill":[115],"leverage":[117],"transferable":[119],"RegionCLIP":[125],"model":[126],"remedy":[128],"detector":[130],"for":[131],"generating":[132,216,268],"rich":[133,158,217],"captioning.":[135],"In":[136],"this":[137],"paper,":[138],"we":[139,171,188,282],"propose":[140],"novel":[142],"Cascade":[143],"Semantic":[144],"Prompt":[145],"Alignment":[146],"Network":[147],"(CSA-Net)":[148],"produce":[150,182],"aligned":[152],"fine-grained":[153],"regional":[154],"semantic-visual":[155],"space":[156],"where":[157],"consistent":[160],"textual":[161],"details":[163],"are":[164,210],"automatically":[165],"incorporated":[166],"features.":[169,186],"Specifically,":[170],"first":[172],"align":[173],"prompt":[177,196],"grounded":[184],"Then,":[187],"employ":[189],"these":[190,204],"relation":[194,208],"predict":[198],"objects.":[202],"Finally,":[203],"enhanced":[205],"fed":[211],"into":[212],"decoder,":[215],"descriptions.":[218],"Extensive":[219],"experiments":[220],"conducted":[221],"on":[222,247,258],"MSCOCO":[224],"dataset":[225],"show":[226],"our":[228],"method":[229],"achieves":[230],"new":[232],"state-of-the-art":[233],"performance":[234],"with":[235,270],"145.2%":[236],"(single":[237],"model)":[238],"147.0%":[240],"(ensemble":[241],"4":[243],"models)":[244],"CIDEr":[245,256],"scores":[246,257],"\u2018Karpathy\u2019":[249],"split,":[250],"141.6%":[251],"(c5)":[252],"144.1%":[254],"(c40)":[255],"official":[260],"online":[261],"test":[262],"server.":[263],"Significantly,":[264],"CSA-Net":[265,294],"outperforms":[266],"captions":[269],"higher":[271],"quality":[272],"diversity,":[274],"achieving":[275],"RefCLIP-S":[277],"score":[278],"83.2.":[280],"Moreover,":[281],"expand":[283],"testbeds":[285],"other":[287],"challenging":[288],"benchmarks,":[290],"i.e.,":[291],"nocaps":[292],"datasets,":[293],"demonstrates":[295],"superior":[296],"zero-shot":[297],"capability.":[298],"Source":[299],"codes":[300],"released":[301],"https://github.com/CrossmodalGroup/CSA-Net.":[303]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":8},{"year":2024,"cited_by_count":5}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
