{"id":"https://openalex.org/W7160537945","doi":"https://doi.org/10.48550/arxiv.2605.04503","title":"DiffCap-Bench: A Comprehensive, Challenging, Robust Benchmark for Image Difference Captioning","display_name":"DiffCap-Bench: A Comprehensive, Challenging, Robust Benchmark for Image Difference Captioning","publication_year":2026,"publication_date":"2026-05-06","ids":{"openalex":"https://openalex.org/W7160537945","doi":"https://doi.org/10.48550/arxiv.2605.04503"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.04503","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.04503","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.04503","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5135634969","display_name":"Yuancheng Wei","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wei, Yuancheng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135622012","display_name":"Haojie Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Haojie","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135635337","display_name":"Linli Yao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yao, Linli","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135566646","display_name":"Lei Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Lei","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135578604","display_name":"Jiali Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Jiali","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135592445","display_name":"Tao Huang","orcid":"https://orcid.org/0009-0004-1089-0403"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Huang, Tao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135631062","display_name":"Yiting Lu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lu, Yiting","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5111028333","display_name":"Duojun Huang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Huang, Duojun","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135595991","display_name":"Xin Li","orcid":"https://orcid.org/0000-0001-7733-439X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Xin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5135592883","display_name":"Zhao Zhong","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhong, Zhao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":10,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9865000247955322,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9865000247955322,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.0020000000949949026,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13310","display_name":"Subtitles and Audiovisual Media","score":0.0013000000035390258,"subfield":{"id":"https://openalex.org/subfields/1203","display_name":"Language and Linguistics"},"field":{"id":"https://openalex.org/fields/12","display_name":"Arts and Humanities"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.8364999890327454},{"id":"https://openalex.org/keywords/closed-captioning","display_name":"Closed captioning","score":0.6887000203132629},{"id":"https://openalex.org/keywords/consistency","display_name":"Consistency (knowledge bases)","score":0.5823000073432922},{"id":"https://openalex.org/keywords/protocol","display_name":"Protocol (science)","score":0.4706999957561493},{"id":"https://openalex.org/keywords/code","display_name":"Code (set theory)","score":0.46720001101493835},{"id":"https://openalex.org/keywords/image","display_name":"Image (mathematics)","score":0.3711000084877014},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.3547999858856201}],"concepts":[{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.8364999890327454},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7452999949455261},{"id":"https://openalex.org/C157657479","wikidata":"https://www.wikidata.org/wiki/Q2367247","display_name":"Closed captioning","level":3,"score":0.6887000203132629},{"id":"https://openalex.org/C2776436953","wikidata":"https://www.wikidata.org/wiki/Q5163215","display_name":"Consistency (knowledge bases)","level":2,"score":0.5823000073432922},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5335999727249146},{"id":"https://openalex.org/C2780385302","wikidata":"https://www.wikidata.org/wiki/Q367158","display_name":"Protocol (science)","level":3,"score":0.4706999957561493},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.46720001101493835},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.4361000061035156},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.38989999890327454},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.3711000084877014},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.3547999858856201},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3434000015258789},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.3310000002384186},{"id":"https://openalex.org/C12725497","wikidata":"https://www.wikidata.org/wiki/Q810247","display_name":"Baseline (sea)","level":2,"score":0.32989999651908875},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.3197000026702881},{"id":"https://openalex.org/C2776207758","wikidata":"https://www.wikidata.org/wiki/Q5303302","display_name":"Downstream (manufacturing)","level":2,"score":0.29330000281333923},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.28679999709129333},{"id":"https://openalex.org/C43126263","wikidata":"https://www.wikidata.org/wiki/Q128751","display_name":"Source code","level":2,"score":0.2718000113964081},{"id":"https://openalex.org/C86034646","wikidata":"https://www.wikidata.org/wiki/Q474311","display_name":"Semantic gap","level":4,"score":0.2551000118255615},{"id":"https://openalex.org/C67186912","wikidata":"https://www.wikidata.org/wiki/Q367664","display_name":"Data modeling","level":2,"score":0.25220000743865967},{"id":"https://openalex.org/C3018023364","wikidata":"https://www.wikidata.org/wiki/Q425265","display_name":"Significant difference","level":2,"score":0.2515999972820282}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.04503","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.04503","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.04503","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.04503","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Image":[0],"Difference":[1,102],"Captioning":[2],"(IDC)":[3],"generates":[4],"natural":[5],"language":[6,65],"descriptions":[7],"that":[8],"precisely":[9],"identify":[10,142],"differences":[11],"between":[12,129],"two":[13],"images,":[14],"serving":[15],"as":[16,172],"a":[17,57,77,105,174,180],"key":[18],"benchmark":[19,80,187],"for":[20],"fine-grained":[21],"change":[22],"perception,":[23],"cross-modal":[24],"reasoning,":[25],"and":[26,36,39,59,89,114,131,141,158,179,188],"image":[27,163],"editing":[28,164],"data":[29,165],"construction.":[30],"However,":[31],"existing":[32],"benchmarks":[33],"lack":[34],"diversity":[35,88],"compositional":[37,90],"complexity,":[38],"standard":[40],"lexical-overlap":[41],"metrics":[42],"(e.g.,":[43],"BLEU,":[44],"METEOR)":[45],"fail":[46],"to":[47,86,111,195],"capture":[48,113],"semantic":[49],"consistency":[50],"or":[51],"penalize":[52],"hallucinations,":[53],"which":[54],"together":[55],"prevent":[56],"comprehensive":[58,78],"robust":[60,106],"evaluation":[61,97,120,177],"of":[62,108,121,138,183],"multimodal":[63],"large":[64],"models":[66],"(MLLMs)":[67],"on":[68],"IDC.":[69],"To":[70],"address":[71],"these":[72],"gaps,":[73],"we":[74,93,124],"introduce":[75],"DiffCap-Bench,":[76],"IDC":[79,176],"covering":[81],"ten":[82],"distinct":[83],"difference":[84],"categories":[85],"ensure":[87],"complexity.":[91],"Furthermore,":[92],"propose":[94],"an":[95],"LLM-as-a-Judge":[96],"protocol":[98],"grounded":[99],"in":[100,145],"human-validated":[101],"Lists,":[103],"enabling":[104],"assessment":[107],"models'":[109],"ability":[110],"both":[112,173],"describe":[115],"visual":[116],"changes.":[117],"Through":[118],"extensive":[119],"state-of-the-art":[122],"MLLMs,":[123],"reveal":[125],"significant":[126],"performance":[127],"gaps":[128],"proprietary":[130],"open-source":[132],"models,":[133],"highlight":[134],"the":[135],"critical":[136],"importance":[137],"reasoning":[139],"capability,":[140],"clear":[143],"limitations":[144],"model":[146],"scaling.":[147],"Our":[148],"framework":[149,178],"also":[150],"demonstrates":[151],"strong":[152,159],"alignment":[153],"with":[154,161],"human":[155],"expert":[156],"judgments":[157],"correlation":[160],"downstream":[162,184],"construction":[166],"quality.":[167],"These":[168],"findings":[169],"establish":[170],"DiffCap-Bench":[171],"reliable":[175],"practical":[181],"predictor":[182],"utility.":[185],"The":[186],"code":[189],"will":[190],"be":[191],"made":[192],"publicly":[193],"available":[194],"support":[196],"further":[197],"research.":[198]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-05-08T00:00:00"}
