{"id":"https://openalex.org/W4400949041","doi":"https://doi.org/10.1145/3672919.3672964","title":"MGTANet: Multi-Scale Guided Token Attention Network for Image Captioning","display_name":"MGTANet: Multi-Scale Guided Token Attention Network for Image Captioning","publication_year":2024,"publication_date":"2024-03-01","ids":{"openalex":"https://openalex.org/W4400949041","doi":"https://doi.org/10.1145/3672919.3672964"},"language":"en","primary_location":{"id":"doi:10.1145/3672919.3672964","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3672919.3672964","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2024 3rd International Conference on Cyber Security, Artificial Intelligence and Digital Economy","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5103691072","display_name":"Wenhao Jia","orcid":null},"institutions":[{"id":"https://openalex.org/I16365422","display_name":"Hefei University of Technology","ror":"https://ror.org/02czkny70","country_code":"CN","type":"education","lineage":["https://openalex.org/I16365422"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Wenhao Jia","raw_affiliation_strings":["School of Computer and Information, Hefei University of Technology, China"],"raw_orcid":"https://orcid.org/0009-0001-2418-4647","affiliations":[{"raw_affiliation_string":"School of Computer and Information, Hefei University of Technology, China","institution_ids":["https://openalex.org/I16365422"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5079086221","display_name":"Ronggui Wang","orcid":"https://orcid.org/0009-0007-9204-9379"},"institutions":[{"id":"https://openalex.org/I16365422","display_name":"Hefei University of Technology","ror":"https://ror.org/02czkny70","country_code":"CN","type":"education","lineage":["https://openalex.org/I16365422"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Ronggui Wang","raw_affiliation_strings":["School of Computer and Information, Hefei University of Technology, China"],"raw_orcid":"https://orcid.org/0009-0007-7547-5659","affiliations":[{"raw_affiliation_string":"School of Computer and Information, Hefei University of Technology, China","institution_ids":["https://openalex.org/I16365422"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101661856","display_name":"Juan Yang","orcid":"https://orcid.org/0000-0002-9004-3244"},"institutions":[{"id":"https://openalex.org/I16365422","display_name":"Hefei University of Technology","ror":"https://ror.org/02czkny70","country_code":"CN","type":"education","lineage":["https://openalex.org/I16365422"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Juan Yang","raw_affiliation_strings":["School of Computer and Information, Hefei University of Technology, China"],"raw_orcid":"https://orcid.org/0000-0002-9004-3244","affiliations":[{"raw_affiliation_string":"School of Computer and Information, Hefei University of Technology, China","institution_ids":["https://openalex.org/I16365422"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5113302870","display_name":"Lixia Xua","orcid":null},"institutions":[{"id":"https://openalex.org/I16365422","display_name":"Hefei University of Technology","ror":"https://ror.org/02czkny70","country_code":"CN","type":"education","lineage":["https://openalex.org/I16365422"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Lixia Xua","raw_affiliation_strings":["School of Computer and Information, Hefei University of Technology, China"],"raw_orcid":"https://orcid.org/0009-0000-4841-5318","affiliations":[{"raw_affiliation_string":"School of Computer and Information, Hefei University of Technology, China","institution_ids":["https://openalex.org/I16365422"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5103691072"],"corresponding_institution_ids":["https://openalex.org/I16365422"],"apc_list":null,"apc_paid":null,"fwci":0.4762,"has_fulltext":false,"cited_by_count":2,"citation_normalized_percentile":{"value":0.62480469,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":95,"max":96},"biblio":{"volume":null,"issue":null,"first_page":"237","last_page":"245"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":0.9973000288009644,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.9851999878883362,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/closed-captioning","display_name":"Closed captioning","score":0.9384855031967163},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.780455470085144},{"id":"https://openalex.org/keywords/security-token","display_name":"Security token","score":0.7332351207733154},{"id":"https://openalex.org/keywords/scale","display_name":"Scale (ratio)","score":0.5474515557289124},{"id":"https://openalex.org/keywords/image","display_name":"Image (mathematics)","score":0.5309628248214722},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.31049755215644836},{"id":"https://openalex.org/keywords/computer-network","display_name":"Computer network","score":0.2433924376964569},{"id":"https://openalex.org/keywords/cartography","display_name":"Cartography","score":0.052306950092315674},{"id":"https://openalex.org/keywords/geography","display_name":"Geography","score":0.04785972833633423}],"concepts":[{"id":"https://openalex.org/C157657479","wikidata":"https://www.wikidata.org/wiki/Q2367247","display_name":"Closed captioning","level":3,"score":0.9384855031967163},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.780455470085144},{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.7332351207733154},{"id":"https://openalex.org/C2778755073","wikidata":"https://www.wikidata.org/wiki/Q10858537","display_name":"Scale (ratio)","level":2,"score":0.5474515557289124},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.5309628248214722},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.31049755215644836},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.2433924376964569},{"id":"https://openalex.org/C58640448","wikidata":"https://www.wikidata.org/wiki/Q42515","display_name":"Cartography","level":1,"score":0.052306950092315674},{"id":"https://openalex.org/C205649164","wikidata":"https://www.wikidata.org/wiki/Q1071","display_name":"Geography","level":0,"score":0.04785972833633423}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3672919.3672964","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3672919.3672964","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2024 3rd International Conference on Cyber Security, Artificial Intelligence and Digital Economy","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":22,"referenced_works":["https://openalex.org/W68733909","https://openalex.org/W639708223","https://openalex.org/W1861492603","https://openalex.org/W1895577753","https://openalex.org/W2185175083","https://openalex.org/W2481240925","https://openalex.org/W2506483933","https://openalex.org/W2741263162","https://openalex.org/W2966350350","https://openalex.org/W3033696290","https://openalex.org/W3035284526","https://openalex.org/W3173961205","https://openalex.org/W3191818374","https://openalex.org/W3203790606","https://openalex.org/W4236965008","https://openalex.org/W4285602612","https://openalex.org/W4286696412","https://openalex.org/W4301045096","https://openalex.org/W4320487288","https://openalex.org/W4389056806","https://openalex.org/W6600020652","https://openalex.org/W6601897980"],"related_works":["https://openalex.org/W4210416330","https://openalex.org/W3164229987","https://openalex.org/W3215212336","https://openalex.org/W4290852288","https://openalex.org/W3217388757","https://openalex.org/W3122720459","https://openalex.org/W4298897568","https://openalex.org/W4289422896","https://openalex.org/W1938708284","https://openalex.org/W4380190185"],"abstract_inverted_index":{"Recent":[0],"studies":[1],"have":[2,26,131,166],"shown":[3],"that":[4,174,221],"grid":[5,40,123,143],"features":[6,14,124,136,189],"can":[7,180],"play":[8],"a":[9,48,83,93,104],"similar":[10,223],"role":[11],"to":[12,35,44,55,134,196,231],"region":[13],"in":[15,30,65,99,125,146],"visual-language":[16],"tasks.":[17],"At":[18],"the":[19,36,66,100,116,120,132,142,147,152,156,177,194,198,215],"same":[20],"time,":[21],"transformer":[22],"and":[23,112,190,212],"their":[24],"variants":[25],"demonstrated":[27],"outstanding":[28],"performance":[29],"image":[31,201],"captioning.":[32],"However,":[33],"due":[34],"fine-grained":[37],"nature":[38],"of":[39,68,122,155,200],"features,":[41,53],"self-attention":[42,96,148],"tends":[43],"overly":[45],"concentrate":[46],"on":[47,176,214],"very":[49],"few":[50],"adjacent":[51],"visual":[52,61,184],"failing":[54],"establish":[56],"effective":[57],"connections":[58],"with":[59,186],"global":[60,162,183],"features.":[62],"This":[63],"results":[64,219],"loss":[67],"object":[69],"context":[70],"information,":[71],"thereby":[72,150],"impairing":[73],"model":[74,157],"performance.":[75],"To":[76,203],"address":[77],"this":[78,80],"issue,":[79],"paper":[81],"proposes":[82],"Multi-Scale":[84],"Guided":[85],"Token":[86],"Attention":[87,171],"Network":[88],"(MGTANet).":[89],"Specifically,":[90],"we":[91,108,165,208],"introduce":[92],"guided":[94,129],"token":[95],"(GTSA)":[97],"mechanism":[98,173],"encoder.":[101],"By":[102],"employing":[103],"mixed-scale":[105],"extraction":[106],"method,":[107],"generate":[109],"\"guided":[110],"tokens\"":[111],"feed":[113],"them":[114,192],"into":[115,193],"self-attention,":[117],"thus":[118],"reducing":[119,141,151],"number":[121],"subsequent":[126],"processing.":[127],"These":[128],"tokens":[130],"ability":[133],"capture":[135],"at":[137],"different":[138],"scales,":[139],"effectively":[140],"quantity":[144],"required":[145],"process,":[149],"computational":[153],"complexity":[154],"while":[158],"still":[159],"preserving":[160],"crucial":[161],"information.":[163],"Additionally,":[164],"developed":[167],"an":[168],"Adaptive":[169],"Gated":[170],"(AGA)":[172],"operates":[175],"decoder.":[178],"It":[179],"flexibly":[181],"combine":[182],"information":[185],"local":[187],"detail":[188],"integrate":[191],"decoder":[195],"guide":[197],"generation":[199],"captions.":[202],"substantiate":[204],"our":[205],"model's":[206],"effectiveness,":[207],"conducted":[209],"thorough":[210],"experiments":[211],"visualizations":[213],"MS-COCO":[216],"dataset.":[217],"The":[218],"illustrate":[220],"under":[222],"experimental":[224],"conditions,":[225],"MGTANet":[226],"yields":[227],"competitive":[228],"quality":[229],"compared":[230],"existing":[232],"models.":[233]},"counts_by_year":[{"year":2025,"cited_by_count":2}],"updated_date":"2025-12-26T23:08:49.675405","created_date":"2025-10-10T00:00:00"}
