{"id":"https://openalex.org/W4415916254","doi":"https://doi.org/10.48550/arxiv.2510.27164","title":"Generating Accurate and Detailed Captions for High-Resolution Images","display_name":"Generating Accurate and Detailed Captions for High-Resolution Images","publication_year":2025,"publication_date":"2025-10-31","ids":{"openalex":"https://openalex.org/W4415916254","doi":"https://doi.org/10.48550/arxiv.2510.27164"},"language":null,"primary_location":{"id":"pmh:oai:arXiv.org:2510.27164","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2510.27164","pdf_url":"https://arxiv.org/pdf/2510.27164","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2510.27164","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5028450684","display_name":"Hankyeol Lee","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Lee, Hankyeol","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Seo, Gawon","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Seo, Gawon","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5004643226","display_name":"Kyounggyu Lee","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lee, Kyounggyu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5039942348","display_name":"Dogun Kim","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kim, Dogun","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5025711483","display_name":"Kyungwoo Song","orcid":"https://orcid.org/0000-0003-0082-4280"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Song, Kyungwoo","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5021834877","display_name":"Jiyoung Jung","orcid":"https://orcid.org/0000-0001-9316-9750"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jung, Jiyoung","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5028450684"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9002000093460083,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9002000093460083,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.03319999948143959,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.005499999970197678,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/pipeline","display_name":"Pipeline (software)","score":0.7986000180244446},{"id":"https://openalex.org/keywords/closed-captioning","display_name":"Closed captioning","score":0.7495999932289124},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.6044999957084656},{"id":"https://openalex.org/keywords/object","display_name":"Object (grammar)","score":0.5788000226020813},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.5311999917030334},{"id":"https://openalex.org/keywords/process","display_name":"Process (computing)","score":0.49639999866485596},{"id":"https://openalex.org/keywords/object-detection","display_name":"Object detection","score":0.48579999804496765},{"id":"https://openalex.org/keywords/image","display_name":"Image (mathematics)","score":0.45820000767707825}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8181999921798706},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.7986000180244446},{"id":"https://openalex.org/C157657479","wikidata":"https://www.wikidata.org/wiki/Q2367247","display_name":"Closed captioning","level":3,"score":0.7495999932289124},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.7127000093460083},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.6044999957084656},{"id":"https://openalex.org/C2781238097","wikidata":"https://www.wikidata.org/wiki/Q175026","display_name":"Object (grammar)","level":2,"score":0.5788000226020813},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.5703999996185303},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.5311999917030334},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.49639999866485596},{"id":"https://openalex.org/C2776151529","wikidata":"https://www.wikidata.org/wiki/Q3045304","display_name":"Object detection","level":3,"score":0.48579999804496765},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.45820000767707825},{"id":"https://openalex.org/C184898388","wikidata":"https://www.wikidata.org/wiki/Q1435712","display_name":"Pairwise comparison","level":2,"score":0.38989999890327454},{"id":"https://openalex.org/C99498987","wikidata":"https://www.wikidata.org/wiki/Q2210247","display_name":"Noise (video)","level":3,"score":0.3822000026702881},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.36649999022483826},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.34130001068115234},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.2948000133037567},{"id":"https://openalex.org/C12713177","wikidata":"https://www.wikidata.org/wiki/Q1900281","display_name":"Perspective (graphical)","level":2,"score":0.2770000100135803},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.2694999873638153},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.2644999921321869},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.2581000030040741},{"id":"https://openalex.org/C61797465","wikidata":"https://www.wikidata.org/wiki/Q1188986","display_name":"Term (time)","level":2,"score":0.25450000166893005}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2510.27164","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2510.27164","pdf_url":"https://arxiv.org/pdf/2510.27164","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"doi:10.48550/arxiv.2510.27164","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2510.27164","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2510.27164","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2510.27164","pdf_url":"https://arxiv.org/pdf/2510.27164","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Vision-language":[0],"models":[1,62],"(VLMs)":[2],"often":[3],"struggle":[4],"to":[5,30,68,113,142,158],"generate":[6],"accurate":[7],"and":[8,41,64,95,120,169,198],"detailed":[9,197],"captions":[10,76,165,201],"for":[11,180],"high-resolution":[12,28,84,189],"images":[13,29,190],"since":[14],"they":[15,144],"are":[16,101,123,145],"typically":[17],"pre-trained":[18],"on":[19,184],"low-resolution":[20],"inputs":[21],"(e.g.,":[22],"224x224":[23],"or":[24],"336x336":[25],"pixels).":[26],"Downscaling":[27],"these":[31,121],"dimensions":[32],"may":[33],"result":[34],"in":[35,98,134],"the":[36,42,99,116,135,163],"loss":[37],"of":[38,44,188],"visual":[39],"details":[40],"omission":[43],"important":[45],"objects.":[46,160],"To":[47],"address":[48],"this":[49],"limitation,":[50],"we":[51],"propose":[52],"a":[53,78,83,93,178,185],"novel":[54],"pipeline":[55,74,194],"that":[56,192],"integrates":[57],"vision-language":[58],"models,":[59,175],"large":[60,173],"language":[61],"(LLMs),":[63],"object":[65,126],"detection":[66,127],"systems":[67],"enhance":[69],"caption":[70,88,137,150],"quality.":[71],"Our":[72],"proposed":[73],"refines":[75],"through":[77],"novel,":[79],"multi-stage":[80],"process.":[81],"Given":[82],"image,":[85],"an":[86,105],"initial":[87,136],"is":[89],"first":[90],"generated":[91],"using":[92,166],"VLM,":[94],"key":[96,118],"objects":[97,111,131],"image":[100,200],"then":[102],"identified":[103,117],"by":[104,125,155],"LLM.":[106],"The":[107],"LLM":[108],"predicts":[109],"additional":[110],"likely":[112],"co-occur":[114],"with":[115,177],"objects,":[119],"predictions":[122],"verified":[124],"systems.":[128],"Newly":[129],"detected":[130],"not":[132],"mentioned":[133],"undergo":[138],"focused,":[139],"region-specific":[140],"captioning":[141],"ensure":[143],"incorporated.":[146],"This":[147],"process":[148],"enriches":[149],"detail":[151],"while":[152,202],"reducing":[153],"hallucinations":[154],"removing":[156],"references":[157],"undetected":[159],"We":[161],"evaluate":[162],"enhanced":[164],"pairwise":[167],"comparison":[168],"quantitative":[170],"scoring":[171],"from":[172],"multimodal":[174],"along":[176],"benchmark":[179],"hallucination":[181],"detection.":[182],"Experiments":[183],"curated":[186],"dataset":[187],"demonstrate":[191],"our":[193],"produces":[195],"more":[196],"reliable":[199],"effectively":[203],"minimizing":[204],"hallucinations.":[205]},"counts_by_year":[],"updated_date":"2026-04-17T18:11:37.981687","created_date":"2025-11-05T00:00:00"}
