{"id":"https://openalex.org/W4409153000","doi":"https://doi.org/10.1109/tcsvt.2025.3558088","title":"Fully Semantic Gap Recovery for End-to-End Image Captioning","display_name":"Fully Semantic Gap Recovery for End-to-End Image Captioning","publication_year":2025,"publication_date":"2025-04-04","ids":{"openalex":"https://openalex.org/W4409153000","doi":"https://doi.org/10.1109/tcsvt.2025.3558088"},"language":"en","primary_location":{"id":"doi:10.1109/tcsvt.2025.3558088","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tcsvt.2025.3558088","pdf_url":null,"source":{"id":"https://openalex.org/S115173108","display_name":"IEEE Transactions on Circuits and Systems for Video Technology","issn_l":"1051-8215","issn":["1051-8215","1558-2205"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Circuits and Systems for Video Technology","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5111488600","display_name":"J. Gao","orcid":null},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Jingchun Gao","raw_affiliation_strings":["School of Cyberspace Science and Technology, University of Science and Technology of China, Hefei, China"],"raw_orcid":"https://orcid.org/0009-0005-5273-8212","affiliations":[{"raw_affiliation_string":"School of Cyberspace Science and Technology, University of Science and Technology of China, Hefei, China","institution_ids":["https://openalex.org/I126520041"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100616189","display_name":"Lei Zhang","orcid":"https://orcid.org/0000-0002-2839-8693"},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Lei Zhang","raw_affiliation_strings":["School of Information Science and Technology, University of Science and Technology of China, Hefei, China"],"raw_orcid":"https://orcid.org/0000-0002-2839-8693","affiliations":[{"raw_affiliation_string":"School of Information Science and Technology, University of Science and Technology of China, Hefei, China","institution_ids":["https://openalex.org/I126520041"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5107953382","display_name":"Jingyu Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jingyu Li","raw_affiliation_strings":["Institute of Artificial Intelligence, Hefei Comprehensive National Science Center, Hefei, China"],"raw_orcid":"https://orcid.org/0000-0002-9561-7550","affiliations":[{"raw_affiliation_string":"Institute of Artificial Intelligence, Hefei Comprehensive National Science Center, Hefei, China","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5023341829","display_name":"Zhendong Mao","orcid":"https://orcid.org/0000-0001-5739-8126"},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhendong Mao","raw_affiliation_strings":["School of Cyberspace Science and Technology, University of Science and Technology of China, Hefei, China"],"raw_orcid":"https://orcid.org/0000-0001-5739-8126","affiliations":[{"raw_affiliation_string":"School of Cyberspace Science and Technology, University of Science and Technology of China, Hefei, China","institution_ids":["https://openalex.org/I126520041"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5111488600"],"corresponding_institution_ids":["https://openalex.org/I126520041"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.0488091,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"35","issue":"9","first_page":"9365","last_page":"9383"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9995999932289124,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9995999932289124,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11439","display_name":"Video Analysis and Summarization","score":0.9872000217437744,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.9718999862670898,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/closed-captioning","display_name":"Closed captioning","score":0.7703546285629272},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6920323371887207},{"id":"https://openalex.org/keywords/end-to-end-principle","display_name":"End-to-end principle","score":0.6643753051757812},{"id":"https://openalex.org/keywords/image","display_name":"Image (mathematics)","score":0.49318045377731323},{"id":"https://openalex.org/keywords/computer-vision","display_name":"Computer vision","score":0.41792163252830505},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.3964439332485199}],"concepts":[{"id":"https://openalex.org/C157657479","wikidata":"https://www.wikidata.org/wiki/Q2367247","display_name":"Closed captioning","level":3,"score":0.7703546285629272},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6920323371887207},{"id":"https://openalex.org/C74296488","wikidata":"https://www.wikidata.org/wiki/Q2527392","display_name":"End-to-end principle","level":2,"score":0.6643753051757812},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.49318045377731323},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.41792163252830505},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3964439332485199}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/tcsvt.2025.3558088","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tcsvt.2025.3558088","pdf_url":null,"source":{"id":"https://openalex.org/S115173108","display_name":"IEEE Transactions on Circuits and Systems for Video Technology","issn_l":"1051-8215","issn":["1051-8215","1558-2205"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Circuits and Systems for Video Technology","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G7854910236","display_name":null,"funder_award_id":"62222212","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G8003946219","display_name":null,"funder_award_id":"62336001","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"}],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":61,"referenced_works":["https://openalex.org/W1861492603","https://openalex.org/W1905882502","https://openalex.org/W1956340063","https://openalex.org/W2053565832","https://openalex.org/W2069303182","https://openalex.org/W2101105183","https://openalex.org/W2125215748","https://openalex.org/W2194775991","https://openalex.org/W2506483933","https://openalex.org/W2561196672","https://openalex.org/W2745461083","https://openalex.org/W2795151422","https://openalex.org/W2890531016","https://openalex.org/W2904565150","https://openalex.org/W2963084599","https://openalex.org/W2963101956","https://openalex.org/W2966162142","https://openalex.org/W2981165461","https://openalex.org/W2986670728","https://openalex.org/W3034655362","https://openalex.org/W3035284526","https://openalex.org/W3091588028","https://openalex.org/W3096609285","https://openalex.org/W3136792391","https://openalex.org/W3138516171","https://openalex.org/W3153469116","https://openalex.org/W3167939936","https://openalex.org/W3173220247","https://openalex.org/W3174377922","https://openalex.org/W3195680250","https://openalex.org/W3205765769","https://openalex.org/W4221147537","https://openalex.org/W4282968790","https://openalex.org/W4283271696","https://openalex.org/W4285186657","https://openalex.org/W4285197287","https://openalex.org/W4304092583","https://openalex.org/W4312289196","https://openalex.org/W4312344082","https://openalex.org/W4312563428","https://openalex.org/W4312651322","https://openalex.org/W4312922092","https://openalex.org/W4312924260","https://openalex.org/W4312960937","https://openalex.org/W4313131769","https://openalex.org/W4319777846","https://openalex.org/W4320487288","https://openalex.org/W4381715078","https://openalex.org/W4385245566","https://openalex.org/W4386072307","https://openalex.org/W4386076119","https://openalex.org/W4387968721","https://openalex.org/W4388936587","https://openalex.org/W4389776363","https://openalex.org/W4390873264","https://openalex.org/W4394913412","https://openalex.org/W4400447352","https://openalex.org/W4402570242","https://openalex.org/W4402727764","https://openalex.org/W4402916530","https://openalex.org/W4404544982"],"related_works":["https://openalex.org/W2772917594","https://openalex.org/W2036807459","https://openalex.org/W2058170566","https://openalex.org/W2755342338","https://openalex.org/W2166024367","https://openalex.org/W3116076068","https://openalex.org/W2229312674","https://openalex.org/W2951359407","https://openalex.org/W2079911747","https://openalex.org/W1969923398"],"abstract_inverted_index":{"Image":[0],"captioning":[1,27],"(IC)":[2],"involves":[3],"the":[4,9,22,62,68,82,95,106,113,132,145,161,192,206,211,221,230,237,255,265],"comprehension":[5],"of":[6,165,232],"images":[7],"from":[8,61,67,210],"visual":[10,19,34,41,73,102,116,128,140,179,195,208,214],"domain":[11],"to":[12,39,50,81,94,100,105,122,159,190,204],"generate":[13],"descriptions":[14],"that":[15,246],"are":[16],"grounded":[17],"in":[18,138,144],"elements":[20],"within":[21],"linguistic":[23,77,227],"domain.":[24],"Current":[25],"image":[26,222],"methods":[28,45],"typically":[29],"rely":[30],"on":[31,236,242,254,264],"pre-trained":[32,72,87],"unimodal":[33,47,118],"backbones":[35],"or":[36],"vision-language":[37,88,173],"models":[38],"identify":[40],"entities.":[42],"Subsequently,":[43],"these":[44],"employ":[46],"self-attention":[48,119],"fusion":[49,120],"uncover":[51,57],"high-level":[52],"semantic":[53,65,78,125,133,174,201],"associations.":[54],"However,":[55],"we":[56,149,182,198],"this":[58],"paradigm":[59],"suffers":[60],"inherent":[63],"intra-modal":[64],"gap":[66,136],"input":[69],"features.":[70,215,239],"Unimodal":[71],"features":[74,103],"lack":[75],"sufficient":[76],"information":[79],"due":[80],"modality":[83],"misalignment.":[84],"Furthermore,":[85],"contrastive":[86,187],"models,":[89],"such":[90],"as":[91],"CLIP,":[92],"confine":[93],"global":[96],"cross-modal":[97,163,218],"alignment,":[98],"leading":[99],"local":[101,186,213],"belonging":[104],"same":[107],"object":[108],"exhibiting":[109,260],"distinct":[110],"semantics.":[111],"Given":[112],"semantically":[114,193],"insufficient":[115],"features,":[117],"struggles":[121],"accurately":[123],"capture":[124],"associations":[126,143,175],"among":[127],"patches,":[129],"thereby":[130],"exacerbating":[131],"gap.":[134],"This":[135],"results":[137],"inaccurate":[139],"entities":[141],"and":[142,171],"generated":[146],"captions.":[147],"Therefore,":[148],"propose":[150,184],"a":[151,168,185,200],"novel":[152],"Fully":[153],"Semantic":[154],"Gap":[155],"Recovery":[156],"(FSGR)":[157],"method":[158,189],"broaden":[160],"robust":[162],"bridge":[164],"CLIP":[166],"into":[167],"fine-grained":[169,217],"level":[170],"consolidate":[172],"for":[176],"more":[177],"precise":[178],"comprehension.":[180],"Technically,":[181],"first":[183],"learning":[188],"aggregate":[191],"similar":[194],"patches.":[196],"Next,":[197],"design":[199],"quantification":[202],"module":[203],"abstract":[205],"language-bridged":[207],"map":[209],"enhanced":[212],"Finally,":[216],"interaction":[219],"consolidates":[220],"patches":[223],"with":[224],"their":[225],"corresponding":[226],"semantics,":[228],"allowing":[229],"generation":[231],"plausible":[233],"captions":[234],"based":[235],"aggregated":[238],"Extensive":[240],"experiments":[241],"comprehensive":[243],"metrics":[244],"demonstrate":[245],"our":[247],"model":[248],"has":[249],"achieved":[250],"new":[251],"state-of-the-art":[252],"performance":[253],"MSCOCO":[256],"dataset,":[257],"while":[258],"also":[259],"competitive":[261],"cross-domain":[262],"capability":[263],"Nocaps":[266],"dataset.":[267],"Source":[268],"code":[269],"released":[270],"at":[271],"https://github.com/gjc0824/FSGR.":[272]},"counts_by_year":[],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
