{"id":"https://openalex.org/W4387968008","doi":"https://doi.org/10.1145/3581783.3612571","title":"Zero-TextCap: Zero-shot Framework for Text-based Image Captioning","display_name":"Zero-TextCap: Zero-shot Framework for Text-based Image Captioning","publication_year":2023,"publication_date":"2023-10-26","ids":{"openalex":"https://openalex.org/W4387968008","doi":"https://doi.org/10.1145/3581783.3612571"},"language":"en","primary_location":{"id":"doi:10.1145/3581783.3612571","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3581783.3612571","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 31st ACM International Conference on Multimedia","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5103177231","display_name":"Dongsheng Xu","orcid":"https://orcid.org/0009-0003-2481-5669"},"institutions":[{"id":"https://openalex.org/I150807315","display_name":"Guangxi University","ror":"https://ror.org/02c9qn167","country_code":"CN","type":"education","lineage":["https://openalex.org/I150807315"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Dongsheng Xu","raw_affiliation_strings":["Guangxi University, Nanning, China"],"affiliations":[{"raw_affiliation_string":"Guangxi University, Nanning, China","institution_ids":["https://openalex.org/I150807315"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5015766803","display_name":"Wenye Zhao","orcid":"https://orcid.org/0009-0004-5482-6753"},"institutions":[{"id":"https://openalex.org/I150807315","display_name":"Guangxi University","ror":"https://ror.org/02c9qn167","country_code":"CN","type":"education","lineage":["https://openalex.org/I150807315"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Wenye Zhao","raw_affiliation_strings":["Guangxi University, Nanning, China"],"affiliations":[{"raw_affiliation_string":"Guangxi University, Nanning, China","institution_ids":["https://openalex.org/I150807315"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5089123257","display_name":"Yi Cai","orcid":"https://orcid.org/0000-0002-1767-789X"},"institutions":[{"id":"https://openalex.org/I90610280","display_name":"South China University of Technology","ror":"https://ror.org/0530pts50","country_code":"CN","type":"education","lineage":["https://openalex.org/I90610280"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yi Cai","raw_affiliation_strings":["South China University of Technology &amp; MOE of China, Guangzhou, China"],"affiliations":[{"raw_affiliation_string":"South China University of Technology &amp; MOE of China, Guangzhou, China","institution_ids":["https://openalex.org/I90610280"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5061381102","display_name":"Qingbao Huang","orcid":"https://orcid.org/0000-0001-7691-347X"},"institutions":[{"id":"https://openalex.org/I150807315","display_name":"Guangxi University","ror":"https://ror.org/02c9qn167","country_code":"CN","type":"education","lineage":["https://openalex.org/I150807315"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Qingbao Huang","raw_affiliation_strings":["Guangxi University &amp; Guangxi Key Laboratory of Multimedia Communications and Network Technology, Nanning, China"],"affiliations":[{"raw_affiliation_string":"Guangxi University &amp; Guangxi Key Laboratory of Multimedia Communications and Network Technology, Nanning, China","institution_ids":["https://openalex.org/I150807315"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5103177231"],"corresponding_institution_ids":["https://openalex.org/I150807315"],"apc_list":null,"apc_paid":null,"fwci":0.3576,"has_fulltext":false,"cited_by_count":3,"citation_normalized_percentile":{"value":0.60514674,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":96,"max":97},"biblio":{"volume":null,"issue":null,"first_page":"4949","last_page":"4957"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.9901999831199646,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11439","display_name":"Video Analysis and Summarization","score":0.9872999787330627,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/closed-captioning","display_name":"Closed captioning","score":0.9221897721290588},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8116604089736938},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.5788244009017944},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.5278981328010559},{"id":"https://openalex.org/keywords/zero","display_name":"Zero (linguistics)","score":0.5097400546073914},{"id":"https://openalex.org/keywords/image","display_name":"Image (mathematics)","score":0.43865275382995605},{"id":"https://openalex.org/keywords/filter","display_name":"Filter (signal processing)","score":0.4318165183067322},{"id":"https://openalex.org/keywords/boosting","display_name":"Boosting (machine learning)","score":0.4136110842227936},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.34422212839126587},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.341325581073761},{"id":"https://openalex.org/keywords/computer-vision","display_name":"Computer vision","score":0.26222848892211914},{"id":"https://openalex.org/keywords/linguistics","display_name":"Linguistics","score":0.15698710083961487}],"concepts":[{"id":"https://openalex.org/C157657479","wikidata":"https://www.wikidata.org/wiki/Q2367247","display_name":"Closed captioning","level":3,"score":0.9221897721290588},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8116604089736938},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5788244009017944},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.5278981328010559},{"id":"https://openalex.org/C2780813799","wikidata":"https://www.wikidata.org/wiki/Q3274237","display_name":"Zero (linguistics)","level":2,"score":0.5097400546073914},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.43865275382995605},{"id":"https://openalex.org/C106131492","wikidata":"https://www.wikidata.org/wiki/Q3072260","display_name":"Filter (signal processing)","level":2,"score":0.4318165183067322},{"id":"https://openalex.org/C46686674","wikidata":"https://www.wikidata.org/wiki/Q466303","display_name":"Boosting (machine learning)","level":2,"score":0.4136110842227936},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.34422212839126587},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.341325581073761},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.26222848892211914},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.15698710083961487},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3581783.3612571","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3581783.3612571","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 31st ACM International Conference on Multimedia","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Quality Education","score":0.8100000023841858,"id":"https://metadata.un.org/sdg/4"}],"awards":[{"id":"https://openalex.org/G1380176761","display_name":null,"funder_award_id":"2020B0101100002","funder_id":"https://openalex.org/F4320323059","funder_display_name":"South China University of Technology"},{"id":"https://openalex.org/G1473307615","display_name":null,"funder_award_id":"2022GXNSFAA035627","funder_id":"https://openalex.org/F4320322768","funder_display_name":"Natural Science Foundation of Guangxi Province"},{"id":"https://openalex.org/G1477544716","display_name":null,"funder_award_id":"Guangdong","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G2087396116","display_name":null,"funder_award_id":"China","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G2376276132","display_name":null,"funder_award_id":"China","funder_id":"https://openalex.org/F4320335787","funder_display_name":"Fundamental Research Funds for the Central Universities"},{"id":"https://openalex.org/G2799848387","display_name":null,"funder_award_id":"62076100","funder_id":"https://openalex.org/F4320335795","funder_display_name":"Science and Technology Planning Project of Guangdong Province"},{"id":"https://openalex.org/G3317480652","display_name":null,"funder_award_id":"Science","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G391238517","display_name":null,"funder_award_id":", and","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G4572242254","display_name":null,"funder_award_id":"2020B0101100002","funder_id":"https://openalex.org/F4320335787","funder_display_name":"Fundamental Research Funds for the Central Universities"},{"id":"https://openalex.org/G4913122426","display_name":null,"funder_award_id":"2020B0101100002","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G5760752404","display_name":null,"funder_award_id":"Projects","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G5939423041","display_name":null,"funder_award_id":"Technology","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G5994120800","display_name":null,"funder_award_id":"Natural","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G6268037997","display_name":null,"funder_award_id":"62276072, 62076100, and 62261003","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G6770297356","display_name":null,"funder_award_id":"62276072","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G7020966827","display_name":null,"funder_award_id":"62076100","funder_id":"https://openalex.org/F4320323059","funder_display_name":"South China University of Technology"},{"id":"https://openalex.org/G7071335933","display_name":null,"funder_award_id":"62261003","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G7608752429","display_name":null,"funder_award_id":"Talent","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G8189774080","display_name":null,"funder_award_id":"62076100","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G8586639432","display_name":null,"funder_award_id":"62076100","funder_id":"https://openalex.org/F4320335787","funder_display_name":"Fundamental Research Funds for the Central Universities"},{"id":"https://openalex.org/G8718620462","display_name":null,"funder_award_id":"2020B0101100002","funder_id":"https://openalex.org/F4320335795","funder_display_name":"Science and Technology Planning Project of Guangdong Province"}],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"},{"id":"https://openalex.org/F4320322768","display_name":"Natural Science Foundation of Guangxi Province","ror":null},{"id":"https://openalex.org/F4320323059","display_name":"South China University of Technology","ror":"https://ror.org/0530pts50"},{"id":"https://openalex.org/F4320335787","display_name":"Fundamental Research Funds for the Central Universities","ror":null},{"id":"https://openalex.org/F4320335795","display_name":"Science and Technology Planning Project of Guangdong Province","ror":null}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":31,"referenced_works":["https://openalex.org/W1593271688","https://openalex.org/W1861492603","https://openalex.org/W1956340063","https://openalex.org/W2550687635","https://openalex.org/W2600463316","https://openalex.org/W2745461083","https://openalex.org/W2809273748","https://openalex.org/W2954841306","https://openalex.org/W2963131369","https://openalex.org/W2986670728","https://openalex.org/W3034655362","https://openalex.org/W3093272873","https://openalex.org/W3106859150","https://openalex.org/W3110661548","https://openalex.org/W3112075608","https://openalex.org/W3153469116","https://openalex.org/W3173220247","https://openalex.org/W3174012740","https://openalex.org/W3177430964","https://openalex.org/W3181159501","https://openalex.org/W3213454282","https://openalex.org/W4200635486","https://openalex.org/W4220798700","https://openalex.org/W4283399970","https://openalex.org/W4285191490","https://openalex.org/W4304086156","https://openalex.org/W4312660844","https://openalex.org/W4312938727","https://openalex.org/W4313131769","https://openalex.org/W4382459038","https://openalex.org/W4386075661"],"related_works":["https://openalex.org/W4210416330","https://openalex.org/W2775506363","https://openalex.org/W3088136942","https://openalex.org/W4290852288","https://openalex.org/W2949362007","https://openalex.org/W4388893791","https://openalex.org/W4283207562","https://openalex.org/W2963177403","https://openalex.org/W2330246314","https://openalex.org/W2949522393"],"abstract_inverted_index":{"Text-based":[0,109],"image":[1],"captioning":[2],"is":[3,93,196],"a":[4,105,138,157,181],"vital":[5],"but":[6,26],"under-explored":[7],"task,":[8],"which":[9,47,71],"aims":[10],"to":[11,63,114,128,176,186],"describe":[12,64],"images":[13,65],"by":[14,167],"captions":[15,165,200],"containing":[16,201],"scene":[17,41,147,170,203],"text":[18,42,148,171,204],"automatically.":[19],"Recent":[20],"studies":[21],"have":[22],"made":[23],"encouraging":[24],"progress,":[25],"they":[27],"are":[28,173,222],"still":[29],"suffering":[30],"from":[31,119],"two":[32],"issues.":[33],"Firstly,":[34],"current":[35,60],"models":[36,61],"cannot":[37],"capture":[38],"and":[39,52,68,96,124,132,149,172,190,205],"generate":[40,115],"in":[43],"non-Latin":[44],"script":[45],"languages,":[46],"severely":[48],"limits":[49,73],"the":[50,53,74,77,81,100,120,130,151,164,177,207,214],"objectivity":[51],"information":[54],"completeness":[55],"of":[56,76,134,198,209,216],"generated":[57,78,166],"captions.":[58,79,210],"Secondly,":[59],"tend":[62],"with":[66],"monotonous":[67],"templated":[69],"style,":[70],"greatly":[72],"diversity":[75,133,208],"Although":[80],"above-mentioned":[82],"issues":[83],"can":[84],"be":[85],"alleviated":[86],"through":[87],"carefully":[88],"designed":[89],"annotations,":[90],"this":[91],"process":[92],"undoubtedly":[94],"laborious":[95],"time-consuming.":[97],"To":[98,144,161],"address":[99],"above":[101],"issues,":[102],"we":[103,136,155,179],"propose":[104,180],"Zero-shot":[106],"Framework":[107],"for":[108],"Image":[110],"Captioning":[111],"(Zero-TextCap).":[112],"Concretely,":[113],"candidate":[116,192],"sentences":[117],"starting":[118],"prompt":[121],"'Image":[122],"of'":[123],"iteratively":[125],"refine":[126],"them":[127],"improve":[129],"quality":[131],"captions,":[135],"introduce":[137,156],"Hybrid-sampling":[139],"masked":[140],"language":[141],"model":[142,150],"(H-MLM).":[143],"read":[145],"multi-lingual":[146,202],"relationships":[152],"between":[153],"them,":[154],"robust":[158],"OCR":[159,188],"system.":[160],"ensure":[162],"that":[163],"H-MLM":[168],"contain":[169],"highly":[174],"relevant":[175],"image,":[178],"CLIP-based":[182],"generation":[183],"guidance":[184],"module":[185],"insert":[187],"tokens":[189],"filter":[191],"sentences.":[193],"Our":[194,220],"Zero-TextCap":[195],"capable":[197],"generalizing":[199],"boosting":[206],"Sufficient":[211],"experiments":[212],"demonstrate":[213],"effectiveness":[215],"our":[217],"proposed":[218],"Zero-TextCap.":[219],"codes":[221],"available":[223],"at":[224],"https://github.com/Gemhuang79/Zero_TextCap.":[225]},"counts_by_year":[{"year":2024,"cited_by_count":3}],"updated_date":"2026-04-13T07:58:08.660418","created_date":"2025-10-10T00:00:00"}
