{"id":"https://openalex.org/W4387968283","doi":"https://doi.org/10.1145/3581783.3612480","title":"Improving Image Captioning through Visual and Semantic Mutual Promotion","display_name":"Improving Image Captioning through Visual and Semantic Mutual Promotion","publication_year":2023,"publication_date":"2023-10-26","ids":{"openalex":"https://openalex.org/W4387968283","doi":"https://doi.org/10.1145/3581783.3612480"},"language":"en","primary_location":{"id":"doi:10.1145/3581783.3612480","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3581783.3612480","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 31st ACM International Conference on Multimedia","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100345320","display_name":"Jing Zhang","orcid":"https://orcid.org/0000-0001-6270-7771"},"institutions":[{"id":"https://openalex.org/I143593769","display_name":"East China University of Science and Technology","ror":"https://ror.org/01vyrm377","country_code":"CN","type":"education","lineage":["https://openalex.org/I143593769"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Jing Zhang","raw_affiliation_strings":["East China University of Science and Technology, Shanghai, China"],"affiliations":[{"raw_affiliation_string":"East China University of Science and Technology, Shanghai, China","institution_ids":["https://openalex.org/I143593769"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5081492347","display_name":"Yingshuai Xie","orcid":"https://orcid.org/0000-0001-8752-4672"},"institutions":[{"id":"https://openalex.org/I143593769","display_name":"East China University of Science and Technology","ror":"https://ror.org/01vyrm377","country_code":"CN","type":"education","lineage":["https://openalex.org/I143593769"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yingshuai Xie","raw_affiliation_strings":["East China University of Science and Technology, Shanghai, China"],"affiliations":[{"raw_affiliation_string":"East China University of Science and Technology, Shanghai, China","institution_ids":["https://openalex.org/I143593769"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5072723528","display_name":"Xiaoqiang Liu","orcid":"https://orcid.org/0000-0001-8342-5896"},"institutions":[{"id":"https://openalex.org/I143593769","display_name":"East China University of Science and Technology","ror":"https://ror.org/01vyrm377","country_code":"CN","type":"education","lineage":["https://openalex.org/I143593769"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xiaoqiang Liu","raw_affiliation_strings":["East China University of Science and Technology, Shanghai, China"],"affiliations":[{"raw_affiliation_string":"East China University of Science and Technology, Shanghai, China","institution_ids":["https://openalex.org/I143593769"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5100345320"],"corresponding_institution_ids":["https://openalex.org/I143593769"],"apc_list":null,"apc_paid":null,"fwci":0.4776,"has_fulltext":false,"cited_by_count":4,"citation_normalized_percentile":{"value":0.65693516,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":90,"max":97},"biblio":{"volume":null,"issue":null,"first_page":"4716","last_page":"4724"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.996999979019165,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11605","display_name":"Visual Attention and Saliency Detection","score":0.9968000054359436,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7900390625},{"id":"https://openalex.org/keywords/closed-captioning","display_name":"Closed captioning","score":0.6891536116600037},{"id":"https://openalex.org/keywords/semantics","display_name":"Semantics (computer science)","score":0.5963318943977356},{"id":"https://openalex.org/keywords/mutual-information","display_name":"Mutual information","score":0.5560339689254761},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.5246087312698364},{"id":"https://openalex.org/keywords/visualization","display_name":"Visualization","score":0.49752405285835266},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.379395455121994},{"id":"https://openalex.org/keywords/computer-vision","display_name":"Computer vision","score":0.3644808530807495},{"id":"https://openalex.org/keywords/image","display_name":"Image (mathematics)","score":0.24035143852233887}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7900390625},{"id":"https://openalex.org/C157657479","wikidata":"https://www.wikidata.org/wiki/Q2367247","display_name":"Closed captioning","level":3,"score":0.6891536116600037},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.5963318943977356},{"id":"https://openalex.org/C152139883","wikidata":"https://www.wikidata.org/wiki/Q252973","display_name":"Mutual information","level":2,"score":0.5560339689254761},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5246087312698364},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.49752405285835266},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.379395455121994},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.3644808530807495},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.24035143852233887},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3581783.3612480","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3581783.3612480","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 31st ACM International Conference on Multimedia","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G6893319343","display_name":null,"funder_award_id":"22ZR1418400","funder_id":"https://openalex.org/F4320309612","funder_display_name":"Natural Science Foundation of Shanghai"}],"funders":[{"id":"https://openalex.org/F4320309612","display_name":"Natural Science Foundation of Shanghai","ror":null}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":25,"referenced_works":["https://openalex.org/W1861492603","https://openalex.org/W1895577753","https://openalex.org/W1905882502","https://openalex.org/W1931639407","https://openalex.org/W2101105183","https://openalex.org/W2107627409","https://openalex.org/W2153156486","https://openalex.org/W2552161745","https://openalex.org/W2745461083","https://openalex.org/W2886641317","https://openalex.org/W2890531016","https://openalex.org/W2963084599","https://openalex.org/W2964080601","https://openalex.org/W2965597639","https://openalex.org/W2986670728","https://openalex.org/W3034655362","https://openalex.org/W3091588028","https://openalex.org/W3130531605","https://openalex.org/W3167939936","https://openalex.org/W3174377922","https://openalex.org/W3205607545","https://openalex.org/W4295750005","https://openalex.org/W4301156730","https://openalex.org/W4304092583","https://openalex.org/W4313131769"],"related_works":["https://openalex.org/W4210416330","https://openalex.org/W2775506363","https://openalex.org/W3088136942","https://openalex.org/W4290852288","https://openalex.org/W2949362007","https://openalex.org/W4388893791","https://openalex.org/W4283207562","https://openalex.org/W2963177403","https://openalex.org/W2330246314","https://openalex.org/W2949522393"],"abstract_inverted_index":{"Current":[0],"image":[1,96],"captioning":[2],"methods":[3],"commonly":[4],"use":[5],"semantic":[6,108,124],"attributes":[7],"extracted":[8],"by":[9,60],"an":[10],"object":[11],"detector":[12],"to":[13,71,117],"guide":[14],"visual":[15,34,45,105,122],"representation,":[16],"leaving":[17],"the":[18,33,37,48,54,61,73,82,101,141,151,162,170,177],"mutual":[19,88,128],"guidance":[20],"and":[21,25,78,87,107,123,168],"enhancement":[22],"between":[23,76,104],"vision":[24,77],"semantics":[26],"under-explored.":[27],"Neurological":[28],"studies":[29],"have":[30],"revealed":[31],"that":[32,157],"cortex":[35,50],"of":[36,56,84,90,174],"brain":[38],"plays":[39],"a":[40,66,112,127,133],"crucial":[41],"role":[42],"in":[43,53,140],"recognizing":[44],"objects,":[46],"while":[47],"prefrontal":[49],"is":[51,138],"involved":[52],"integration":[55],"contextual":[57],"semantics.":[58],"Inspired":[59],"above":[62],"studies,":[63],"we":[64,110],"propose":[65,111],"novel":[67],"Visual-Semantic":[68],"Transformer":[69],"(VST)":[70],"model":[72],"neural":[74],"interaction":[75],"semantics,":[79],"which":[80,143],"explores":[81],"mechanism":[83],"deep":[85],"fusion":[86,148],"promotion":[89],"multimodal":[91,134,146],"information,":[92],"realizing":[93],"more":[94],"accurate":[95],"captioning.":[97],"To":[98],"better":[99],"facilitate":[100],"complementary":[102],"strengths":[103],"objects":[106],"contexts,":[109],"global":[113],"position-sensitive":[114],"co-attention":[115],"encoder":[116],"realize":[118],"globally":[119],"associative,":[120],"position-aware":[121],"co-interaction":[125],"through":[126],"cross-attention":[129],"mechanism.":[130],"In":[131],"addition,":[132],"mixed":[135],"attention":[136],"module":[137],"proposed":[139],"decoder,":[142],"achieves":[144],"adaptive":[145],"feature":[147],"for":[149],"enhancing":[150],"decoding":[152],"capability.":[153],"Experimental":[154],"evidence":[155],"shows":[156],"our":[158],"VST":[159],"significantly":[160],"surpasses":[161],"state-of-the-art":[163],"approaches":[164],"on":[165,176],"MSCOCO":[166],"dataset":[167],"reaches":[169],"excellent":[171],"CIDEr":[172],"score":[173],"142%":[175],"Karpathy":[178],"test":[179],"split.":[180]},"counts_by_year":[{"year":2025,"cited_by_count":3},{"year":2024,"cited_by_count":1}],"updated_date":"2026-04-10T15:06:20.359241","created_date":"2025-10-10T00:00:00"}
