{"id":"https://openalex.org/W7080134903","doi":"https://doi.org/10.48550/arxiv.2509.03897","title":"SPECS: Specificity-Enhanced CLIP-Score for Long Image Caption Evaluation","display_name":"SPECS: Specificity-Enhanced CLIP-Score for Long Image Caption Evaluation","publication_year":2025,"publication_date":"2025-09-04","ids":{"openalex":"https://openalex.org/W7080134903","doi":"https://doi.org/10.48550/arxiv.2509.03897"},"language":"en","primary_location":{"id":"doi:10.48550/arxiv.2509.03897","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2509.03897","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2509.03897","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Chen, Xiaofu","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Chen, Xiaofu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Salazar, Israfel","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Salazar, Israfel","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":null,"display_name":"Kementchedjhieva, Yova","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kementchedjhieva, Yova","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":3,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":true,"primary_topic":{"id":"https://openalex.org/T12157","display_name":"Geochemistry and Geologic Mapping","score":0.6341999769210815,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12157","display_name":"Geochemistry and Geologic Mapping","score":0.6341999769210815,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13067","display_name":"Geological Modeling and Analysis","score":0.027499999850988388,"subfield":{"id":"https://openalex.org/subfields/1906","display_name":"Geochemistry and Petrology"},"field":{"id":"https://openalex.org/fields/19","display_name":"Earth and Planetary Sciences"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T14311","display_name":"Electrical and Electromagnetic Research","score":0.023099999874830246,"subfield":{"id":"https://openalex.org/subfields/3107","display_name":"Atomic and Molecular Physics, and Optics"},"field":{"id":"https://openalex.org/fields/31","display_name":"Physics and Astronomy"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/closed-captioning","display_name":"Closed captioning","score":0.7820000052452087},{"id":"https://openalex.org/keywords/metric","display_name":"Metric (unit)","score":0.6392999887466431},{"id":"https://openalex.org/keywords/image","display_name":"Image (mathematics)","score":0.6248999834060669},{"id":"https://openalex.org/keywords/code","display_name":"Code (set theory)","score":0.5221999883651733},{"id":"https://openalex.org/keywords/correlation","display_name":"Correlation","score":0.3944000005722046},{"id":"https://openalex.org/keywords/similarity","display_name":"Similarity (geometry)","score":0.38659998774528503},{"id":"https://openalex.org/keywords/semantics","display_name":"Semantics (computer science)","score":0.3336000144481659},{"id":"https://openalex.org/keywords/image-quality","display_name":"Image quality","score":0.3199000060558319}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.786300003528595},{"id":"https://openalex.org/C157657479","wikidata":"https://www.wikidata.org/wiki/Q2367247","display_name":"Closed captioning","level":3,"score":0.7820000052452087},{"id":"https://openalex.org/C176217482","wikidata":"https://www.wikidata.org/wiki/Q860554","display_name":"Metric (unit)","level":2,"score":0.6392999887466431},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.6248999834060669},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.5221999883651733},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4666000008583069},{"id":"https://openalex.org/C117220453","wikidata":"https://www.wikidata.org/wiki/Q5172842","display_name":"Correlation","level":2,"score":0.3944000005722046},{"id":"https://openalex.org/C103278499","wikidata":"https://www.wikidata.org/wiki/Q254465","display_name":"Similarity (geometry)","level":3,"score":0.38659998774528503},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.3336000144481659},{"id":"https://openalex.org/C55020928","wikidata":"https://www.wikidata.org/wiki/Q3813865","display_name":"Image quality","level":3,"score":0.3199000060558319},{"id":"https://openalex.org/C43126263","wikidata":"https://www.wikidata.org/wiki/Q128751","display_name":"Source code","level":2,"score":0.31690001487731934},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.3125999867916107},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3122999966144562},{"id":"https://openalex.org/C9417928","wikidata":"https://www.wikidata.org/wiki/Q1070689","display_name":"Image processing","level":3,"score":0.30820000171661377},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.30329999327659607},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.3003999888896942},{"id":"https://openalex.org/C160086991","wikidata":"https://www.wikidata.org/wiki/Q5939193","display_name":"Human visual system model","level":3,"score":0.2904999852180481},{"id":"https://openalex.org/C143271835","wikidata":"https://www.wikidata.org/wiki/Q254515","display_name":"Similitude","level":2,"score":0.28380000591278076},{"id":"https://openalex.org/C159694833","wikidata":"https://www.wikidata.org/wiki/Q2321565","display_name":"Iterative method","level":2,"score":0.2750000059604645},{"id":"https://openalex.org/C179799912","wikidata":"https://www.wikidata.org/wiki/Q205084","display_name":"Computational complexity theory","level":2,"score":0.27410000562667847},{"id":"https://openalex.org/C106430172","wikidata":"https://www.wikidata.org/wiki/Q6002272","display_name":"Image restoration","level":4,"score":0.2632000148296356},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.2572999894618988},{"id":"https://openalex.org/C111335779","wikidata":"https://www.wikidata.org/wiki/Q3454686","display_name":"Reduction (mathematics)","level":2,"score":0.2542000114917755}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2509.03897","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2509.03897","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2509.03897","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2509.03897","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"display_name":"Peace, Justice and strong institutions","id":"https://metadata.un.org/sdg/16","score":0.5707608461380005}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"As":[0],"interest":[1],"grows":[2],"in":[3,45,123],"generating":[4],"long,":[5],"detailed":[6],"image":[7,93,144],"captions,":[8],"standard":[9],"evaluation":[10,142],"metrics":[11,16,58,122],"become":[12],"increasingly":[13],"unreliable.":[14],"N-gram-based":[15],"though":[17],"efficient,":[18],"fail":[19],"to":[20,29,37,51,54,91,125],"capture":[21],"semantic":[22],"correctness.":[23],"Representational":[24],"Similarity":[25],"(RS)":[26],"metrics,":[27],"designed":[28],"address":[30],"this,":[31],"initially":[32],"saw":[33],"limited":[34],"use":[35,77],"due":[36,50],"high":[38],"computational":[39],"costs,":[40],"while":[41,128],"today,":[42],"despite":[43],"advances":[44],"hardware,":[46],"they":[47],"remain":[48,72],"unpopular":[49],"low":[52],"correlation":[53,67,124],"human":[55,69,126],"judgments.":[56],"Meanwhile,":[57],"based":[59],"on":[60],"large":[61],"language":[62],"models":[63],"(LLMs)":[64],"show":[65,113],"strong":[66],"with":[68,98],"judgments,":[70,127],"but":[71],"too":[73],"expensive":[74],"for":[75,139],"iterative":[76,140],"during":[78,143],"model":[79,146],"development.":[80],"We":[81,112],"introduce":[82],"SPECS":[83,95,115],"(Specificity-Enhanced":[84],"CLIPScore),":[85],"a":[86,99,136],"reference-free":[87],"RS":[88],"metric":[89],"tailored":[90],"long":[92],"captioning.":[94],"modifies":[96],"CLIP":[97],"new":[100],"objective":[101],"that":[102,114],"emphasizes":[103],"specificity:":[104],"rewarding":[105],"correct":[106],"details":[107],"and":[108],"penalizing":[109],"incorrect":[110],"ones.":[111],"matches":[116],"the":[117],"performance":[118],"of":[119],"open-source":[120],"LLM-based":[121],"being":[129],"far":[130],"more":[131],"efficient.":[132],"This":[133],"makes":[134],"it":[135],"practical":[137],"alternative":[138],"checkpoint":[141],"captioning":[145],"development.Our":[147],"code":[148],"can":[149],"be":[150],"found":[151],"at":[152],"https://github.com/mbzuai-nlp/SPECS.":[153]},"counts_by_year":[],"updated_date":"2025-11-06T06:51:31.235846","created_date":"2025-10-10T00:00:00"}
