{"id":"https://openalex.org/W7131634831","doi":"https://doi.org/10.48550/arxiv.2602.21655","title":"CCCaption: Dual-Reward Reinforcement Learning for Complete and Correct Image Captioning","display_name":"CCCaption: Dual-Reward Reinforcement Learning for Complete and Correct Image Captioning","publication_year":2026,"publication_date":"2026-02-25","ids":{"openalex":"https://openalex.org/W7131634831","doi":"https://doi.org/10.48550/arxiv.2602.21655"},"language":null,"primary_location":{"id":"pmh:doi:10.48550/arxiv.2602.21655","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":null,"any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5126941587","display_name":"Zhijiang Tang","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Tang, Zhijiang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126927377","display_name":"Linhua Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Linhua","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126911046","display_name":"Jiaxin Qi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Qi, Jiaxin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126918051","display_name":"Weihao Jiang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jiang, Weihao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5025853900","display_name":"Peng Hou","orcid":"https://orcid.org/0000-0001-5101-1496"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hou, Peng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126925090","display_name":"Anxiang Zeng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zeng, Anxiang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5126873002","display_name":"Jianqiang Huang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Huang, Jianqiang","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5126941587"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9962000250816345,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9962000250816345,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.0005000000237487257,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11605","display_name":"Visual Attention and Saliency Detection","score":0.00039999998989515007,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/closed-captioning","display_name":"Closed captioning","score":0.9563000202178955},{"id":"https://openalex.org/keywords/correctness","display_name":"Correctness","score":0.7153000235557556},{"id":"https://openalex.org/keywords/reinforcement-learning","display_name":"Reinforcement learning","score":0.661899983882904},{"id":"https://openalex.org/keywords/completeness","display_name":"Completeness (order theory)","score":0.6403999924659729},{"id":"https://openalex.org/keywords/salient","display_name":"Salient","score":0.6079999804496765},{"id":"https://openalex.org/keywords/set","display_name":"Set (abstract data type)","score":0.5812000036239624},{"id":"https://openalex.org/keywords/image","display_name":"Image (mathematics)","score":0.4896000027656555}],"concepts":[{"id":"https://openalex.org/C157657479","wikidata":"https://www.wikidata.org/wiki/Q2367247","display_name":"Closed captioning","level":3,"score":0.9563000202178955},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7986000180244446},{"id":"https://openalex.org/C55439883","wikidata":"https://www.wikidata.org/wiki/Q360812","display_name":"Correctness","level":2,"score":0.7153000235557556},{"id":"https://openalex.org/C97541855","wikidata":"https://www.wikidata.org/wiki/Q830687","display_name":"Reinforcement learning","level":2,"score":0.661899983882904},{"id":"https://openalex.org/C17231256","wikidata":"https://www.wikidata.org/wiki/Q5156540","display_name":"Completeness (order theory)","level":2,"score":0.6403999924659729},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6150000095367432},{"id":"https://openalex.org/C2780719617","wikidata":"https://www.wikidata.org/wiki/Q1030752","display_name":"Salient","level":2,"score":0.6079999804496765},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.5812000036239624},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.490200012922287},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.4896000027656555},{"id":"https://openalex.org/C2780428219","wikidata":"https://www.wikidata.org/wiki/Q16952335","display_name":"Cover (algebra)","level":2,"score":0.4156999886035919},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.38989999890327454},{"id":"https://openalex.org/C2777735758","wikidata":"https://www.wikidata.org/wiki/Q817765","display_name":"Path (computing)","level":2,"score":0.37470000982284546},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.3619000017642975},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.3546999990940094},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.31439998745918274},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.3021000027656555},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.2879999876022339},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.27970001101493835},{"id":"https://openalex.org/C177148314","wikidata":"https://www.wikidata.org/wiki/Q170084","display_name":"Generalization","level":2,"score":0.26600000262260437}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:doi:10.48550/arxiv.2602.21655","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},{"id":"doi:10.48550/arxiv.2602.21655","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2602.21655","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:doi:10.48550/arxiv.2602.21655","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education","score":0.7001770734786987}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Image":[0],"captioning":[1,180],"remains":[2],"a":[3,79,85,111,126,186],"fundamental":[4],"task":[5],"for":[6],"vision":[7],"language":[8],"understanding,":[9],"yet":[10],"ground-truth":[11,27],"supervision":[12],"still":[13],"relies":[14],"predominantly":[15],"on":[16],"human-annotated":[17],"references.":[18],"Because":[19],"human":[20],"annotations":[21],"reflect":[22],"subjective":[23],"preferences":[24],"and":[25,62,97,116,164],"expertise,":[26],"captions":[28,118,139,169],"are":[29,151],"often":[30],"incomplete":[31],"or":[32],"even":[33],"incorrect,":[34],"which":[35,150],"in":[36],"turn":[37],"limits":[38],"caption":[39,44,56,155,191],"models.":[40],"We":[41],"argue":[42],"that":[43,89,119,140,170],"quality":[45],"should":[46],"be":[47],"assessed":[48],"by":[49,143],"two":[50],"objective":[51,174],"aspects:":[52],"completeness":[53,163],"(does":[54],"the":[55,65,71,108,145,154],"cover":[57],"all":[58],"salient":[59],"visual":[60,114],"facts?)":[61],"correctness":[63],"(are":[64],"descriptions":[66],"true":[67],"with":[68,84,125],"respect":[69],"to":[70,94,106,131,189],"image?).":[72],"To":[73],"this":[74],"end,":[75],"we":[76,102,137],"introduce":[77],"CCCaption:":[78],"dual-reward":[80,159],"reinforcement":[81],"learning":[82],"framework":[83],"dedicated":[86],"fine-tuning":[87],"corpus":[88],"explicitly":[90],"optimizes":[91],"these":[92,123,173],"properties":[93],"generate":[95],"\\textbf{C}omplete":[96],"\\textbf{C}orrect":[98],"\\textbf{Captions}.":[99],"For":[100,135],"completeness,":[101],"use":[103],"diverse":[104],"LVLMs":[105],"disentangle":[107],"image":[109],"into":[110],"set":[112],"of":[113,122,147],"queries,":[115,124,149],"reward":[117],"answer":[120],"more":[121],"dynamic":[127],"query":[128],"sampling":[129],"strategy":[130],"improve":[132],"training":[133,190],"efficiency.":[134],"correctness,":[136,165],"penalize":[138],"contain":[141],"hallucinations":[142],"validating":[144],"authenticity":[146],"sub-caption":[148],"derived":[152],"from":[153],"decomposition.":[156],"Our":[157],"symmetric":[158],"optimization":[160],"jointly":[161],"maximizes":[162],"guiding":[166],"models":[167,192],"toward":[168],"better":[171],"satisfy":[172],"criteria.":[175],"Extensive":[176],"experiments":[177],"across":[178],"standard":[179],"benchmarks":[181],"show":[182],"consistent":[183],"improvements,":[184],"offering":[185],"principled":[187],"path":[188],"beyond":[193],"human-annotation":[194],"imitation.":[195]},"counts_by_year":[],"updated_date":"2026-04-04T16:13:02.066488","created_date":"2026-02-27T00:00:00"}
