{"id":"https://openalex.org/W4390195594","doi":"https://doi.org/10.1145/3638558","title":"Exploring Visual Relationships via Transformer-based Graphs for Enhanced Image Captioning","display_name":"Exploring Visual Relationships via Transformer-based Graphs for Enhanced Image Captioning","publication_year":2023,"publication_date":"2023-12-25","ids":{"openalex":"https://openalex.org/W4390195594","doi":"https://doi.org/10.1145/3638558"},"language":"en","primary_location":{"id":"doi:10.1145/3638558","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3638558","pdf_url":null,"source":{"id":"https://openalex.org/S19610489","display_name":"ACM Transactions on Multimedia Computing Communications and Applications","issn_l":"1551-6857","issn":["1551-6857","1551-6865"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ACM Transactions on Multimedia Computing, Communications, and Applications","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Jingyu Li","orcid":"https://orcid.org/0000-0002-9561-7550"},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Jingyu Li","raw_affiliation_strings":["University of Science and Technology of China, China"],"raw_orcid":"https://orcid.org/0000-0002-9561-7550","affiliations":[{"raw_affiliation_string":"University of Science and Technology of China, China","institution_ids":["https://openalex.org/I126520041"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5023341829","display_name":"Zhendong Mao","orcid":"https://orcid.org/0000-0001-5739-8126"},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhendong Mao","raw_affiliation_strings":["University of Science and Technology of China and the Institute of Artificial Intelligence, Hefei Comprehensive National Science Center, China"],"raw_orcid":"https://orcid.org/0000-0001-5739-8126","affiliations":[{"raw_affiliation_string":"University of Science and Technology of China and the Institute of Artificial Intelligence, Hefei Comprehensive National Science Center, China","institution_ids":["https://openalex.org/I126520041"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5105008420","display_name":"Hao Li","orcid":"https://orcid.org/0009-0003-9621-0925"},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Hao Li","raw_affiliation_strings":["University of Science and Technology of China, China"],"raw_orcid":"https://orcid.org/0009-0003-9621-0925","affiliations":[{"raw_affiliation_string":"University of Science and Technology of China, China","institution_ids":["https://openalex.org/I126520041"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100357392","display_name":"Weidong Chen","orcid":"https://orcid.org/0000-0003-2774-2875"},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Weidong Chen","raw_affiliation_strings":["University of Science and Technology of China, China"],"raw_orcid":"https://orcid.org/0000-0003-2774-2875","affiliations":[{"raw_affiliation_string":"University of Science and Technology of China, China","institution_ids":["https://openalex.org/I126520041"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5046305086","display_name":"Yongdong Zhang","orcid":"https://orcid.org/0000-0002-1151-1792"},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yongdong Zhang","raw_affiliation_strings":["University of Science and Technology of China and the Institute of Artificial Intelligence, Hefei Comprehensive National Science Center, China"],"raw_orcid":"https://orcid.org/0000-0002-1151-1792","affiliations":[{"raw_affiliation_string":"University of Science and Technology of China and the Institute of Artificial Intelligence, Hefei Comprehensive National Science Center, China","institution_ids":["https://openalex.org/I126520041"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":[],"corresponding_institution_ids":["https://openalex.org/I126520041"],"apc_list":null,"apc_paid":null,"fwci":2.1628,"has_fulltext":false,"cited_by_count":19,"citation_normalized_percentile":{"value":0.89914882,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":98,"max":99},"biblio":{"volume":"20","issue":"5","first_page":"1","last_page":"23"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":0.9904000163078308,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.9894000291824341,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.664943277835846},{"id":"https://openalex.org/keywords/spatial-relation","display_name":"Spatial relation","score":0.605446457862854},{"id":"https://openalex.org/keywords/scene-graph","display_name":"Scene graph","score":0.566423773765564},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.48435577750205994},{"id":"https://openalex.org/keywords/theoretical-computer-science","display_name":"Theoretical computer science","score":0.4464704394340515},{"id":"https://openalex.org/keywords/leverage","display_name":"Leverage (statistics)","score":0.4311380982398987},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.41639333963394165}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.664943277835846},{"id":"https://openalex.org/C27511587","wikidata":"https://www.wikidata.org/wiki/Q2178623","display_name":"Spatial relation","level":2,"score":0.605446457862854},{"id":"https://openalex.org/C179372163","wikidata":"https://www.wikidata.org/wiki/Q1406181","display_name":"Scene graph","level":3,"score":0.566423773765564},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.48435577750205994},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.4464704394340515},{"id":"https://openalex.org/C153083717","wikidata":"https://www.wikidata.org/wiki/Q6535263","display_name":"Leverage (statistics)","level":2,"score":0.4311380982398987},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.41639333963394165},{"id":"https://openalex.org/C205711294","wikidata":"https://www.wikidata.org/wiki/Q176953","display_name":"Rendering (computer graphics)","level":2,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3638558","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3638558","pdf_url":null,"source":{"id":"https://openalex.org/S19610489","display_name":"ACM Transactions on Multimedia Computing Communications and Applications","issn_l":"1551-6857","issn":["1551-6857","1551-6865"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ACM Transactions on Multimedia Computing, Communications, and Applications","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G1565499372","display_name":null,"funder_award_id":"62121002","funder_id":"https://openalex.org/F4320322271","funder_display_name":"Science Fund for Creative Research Groups"},{"id":"https://openalex.org/G3828173677","display_name":null,"funder_award_id":"62121002","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G5609620912","display_name":null,"funder_award_id":"62302474","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G7854910236","display_name":null,"funder_award_id":"62222212","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G8374609628","display_name":null,"funder_award_id":"U19A2057","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"}],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"},{"id":"https://openalex.org/F4320322271","display_name":"Science Fund for Creative Research Groups","ror":null}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":55,"referenced_works":["https://openalex.org/W639708223","https://openalex.org/W1593271688","https://openalex.org/W1861492603","https://openalex.org/W1956340063","https://openalex.org/W2277195237","https://openalex.org/W2322020277","https://openalex.org/W2506483933","https://openalex.org/W2531563875","https://openalex.org/W2549139847","https://openalex.org/W2558687840","https://openalex.org/W2575842049","https://openalex.org/W2600463316","https://openalex.org/W2612690371","https://openalex.org/W2613718673","https://openalex.org/W2745461083","https://openalex.org/W2890531016","https://openalex.org/W2950626540","https://openalex.org/W2963084599","https://openalex.org/W2963101956","https://openalex.org/W2965597639","https://openalex.org/W2966162142","https://openalex.org/W2970569830","https://openalex.org/W2973642807","https://openalex.org/W2981165461","https://openalex.org/W2983141445","https://openalex.org/W2986670728","https://openalex.org/W2998665041","https://openalex.org/W3034655362","https://openalex.org/W3034733309","https://openalex.org/W3035160838","https://openalex.org/W3035284526","https://openalex.org/W3035497460","https://openalex.org/W3103651098","https://openalex.org/W3111947517","https://openalex.org/W3119381934","https://openalex.org/W3136792391","https://openalex.org/W3143835353","https://openalex.org/W3167939936","https://openalex.org/W3175824375","https://openalex.org/W3195680250","https://openalex.org/W3205607545","https://openalex.org/W3205765769","https://openalex.org/W3211865849","https://openalex.org/W4205474609","https://openalex.org/W4206314411","https://openalex.org/W4213031069","https://openalex.org/W4283271696","https://openalex.org/W4285186657","https://openalex.org/W4285197287","https://openalex.org/W4285389151","https://openalex.org/W4288329833","https://openalex.org/W4312761738","https://openalex.org/W4379780990","https://openalex.org/W4379983266","https://openalex.org/W4386065291"],"related_works":["https://openalex.org/W2787993192","https://openalex.org/W4214591816","https://openalex.org/W4389471147","https://openalex.org/W4312650216","https://openalex.org/W3042849301","https://openalex.org/W3109142545","https://openalex.org/W4221155265","https://openalex.org/W4385571298","https://openalex.org/W2590022237","https://openalex.org/W4390195594"],"abstract_inverted_index":{"Image":[0],"captioning":[1],"(IC),":[2],"bringing":[3],"vision":[4],"to":[5,67,79,82,96,114,161,197,226,247],"language,":[6],"has":[7],"drawn":[8],"extensive":[9],"attention.":[10],"A":[11],"crucial":[12],"aspect":[13],"of":[14,20,71,135,177,186,238,243],"IC":[15],"is":[16,94],"the":[17,57,68,84,90,116,133,157,163,174,178,184,213,220,236],"accurate":[18,228],"depiction":[19],"visual":[21,85],"relations":[22,27,33,65,123,208],"among":[23],"image":[24,187,199],"objects.":[25,61],"Visual":[26],"encompass":[28],"two":[29],"primary":[30],"facets:":[31],"content":[32,43,51,93,100,122,164],"and":[34,46,48,54,102,126,129,152,180,206],"structural":[35,64,103,130,188,207],"relations.":[36,105,165,189],"Content":[37],"relations,":[38,86,131],"which":[39],"comprise":[40],"geometric":[41,80,92,125,146,179],"positions":[42,81],"(i.e.,":[44,52,124],"distances":[45],"sizes)":[47],"semantic":[49,127,154,181,221],"interactions":[50],"actions":[53],"possessives),":[55],"unveil":[56],"mutual":[58],"correlations":[59,101,117],"between":[60,118],"In":[62,106],"contrast,":[63],"pertain":[66],"topological":[69],"connectivity":[70],"object":[72],"regions.":[73],"Existing":[74],"Transformer-based":[75],"methods":[76],"typically":[77],"resort":[78],"enhance":[83],"yet":[87],"only":[88],"using":[89],"shallow":[91],"unable":[95],"precisely":[97],"cover":[98],"actional":[99],"connection":[104],"this":[107],"article,":[108],"we":[109,143,167,191,217],"adopt":[110],"a":[111,145,153,169,193],"comprehensive":[112],"perspective":[113],"examine":[115],"objects,":[119],"incorporating":[120],"both":[121],"relations)":[128],"with":[132,241],"aim":[134],"generating":[136],"plausible":[137],"captions.":[138],"To":[139],"achieve":[140],"this,":[141],"first,":[142],"construct":[144,168],"graph":[147,155,159,171],"from":[148,156,245],"bounding":[149],"box":[150],"features":[151],"scene":[158],"parser":[160],"model":[162],"Innovatively,":[166],"topology":[170],"that":[172],"amalgamates":[173],"sparsity":[175],"characteristics":[176],"graphs,":[182],"enabling":[183],"representation":[185],"Second,":[190],"propose":[192],"novel":[194],"unified":[195],"approach":[196],"enrich":[198],"relation":[200,222],"representations":[201],"by":[202],"integrating":[203],"semantic,":[204],"geometric,":[205],"into":[209],"self-attention.":[210],"Finally,":[211],"in":[212],"language":[214],"decoding":[215],"stage,":[216],"further":[218],"leverage":[219],"as":[223],"prior":[224],"knowledge":[225],"generate":[227],"words.":[229],"Extensive":[230],"experiments":[231],"on":[232],"MS-COCO":[233],"dataset":[234],"demonstrate":[235],"effectiveness":[237],"our":[239],"model,":[240],"improvements":[242],"CIDEr":[244],"128.6%":[246],"136.6%.":[248],"Codes":[249],"have":[250],"been":[251],"released":[252],"at":[253],"https://github.com/CrossmodalGroup/ER-SAN/tree/main/VG-Cap":[254],".":[255]},"counts_by_year":[{"year":2026,"cited_by_count":2},{"year":2025,"cited_by_count":12},{"year":2024,"cited_by_count":5}],"updated_date":"2026-05-21T06:26:12.895304","created_date":"2025-10-10T00:00:00"}
