{"id":"https://openalex.org/W4416158751","doi":"https://doi.org/10.48550/arxiv.2511.07010","title":"A Picture is Worth a Thousand (Correct) Captions: A Vision-Guided Judge-Corrector System for Multimodal Machine Translation","display_name":"A Picture is Worth a Thousand (Correct) Captions: A Vision-Guided Judge-Corrector System for Multimodal Machine Translation","publication_year":2025,"publication_date":"2025-11-10","ids":{"openalex":"https://openalex.org/W4416158751","doi":"https://doi.org/10.48550/arxiv.2511.07010"},"language":null,"primary_location":{"id":"pmh:oai:arXiv.org:2511.07010","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2511.07010","pdf_url":"https://arxiv.org/pdf/2511.07010","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2511.07010","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5093193657","display_name":"Siddharth Betala","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Betala, Siddharth","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5009915381","display_name":"Shashi Raj K","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Raj, Kushan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5120595448","display_name":"Vipul Betala","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Betala, Vipul","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5120444400","display_name":"Rohan Saswade","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Saswade, Rohan","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5093193657"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.8173999786376953,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.8173999786376953,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.061900001019239426,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.06030000001192093,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/pipeline","display_name":"Pipeline (software)","score":0.7412999868392944},{"id":"https://openalex.org/keywords/machine-translation","display_name":"Machine translation","score":0.7059999704360962},{"id":"https://openalex.org/keywords/bleu","display_name":"BLEU","score":0.6851999759674072},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.6639999747276306},{"id":"https://openalex.org/keywords/set","display_name":"Set (abstract data type)","score":0.6365000009536743},{"id":"https://openalex.org/keywords/training-set","display_name":"Training set","score":0.6223000288009644},{"id":"https://openalex.org/keywords/translation","display_name":"Translation (biology)","score":0.5932999849319458},{"id":"https://openalex.org/keywords/data-set","display_name":"Data set","score":0.4611999988555908}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8108999729156494},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.7412999868392944},{"id":"https://openalex.org/C203005215","wikidata":"https://www.wikidata.org/wiki/Q79798","display_name":"Machine translation","level":2,"score":0.7059999704360962},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.6898999810218811},{"id":"https://openalex.org/C622187","wikidata":"https://www.wikidata.org/wiki/Q3500773","display_name":"BLEU","level":3,"score":0.6851999759674072},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6646000146865845},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.6639999747276306},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.6365000009536743},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.6223000288009644},{"id":"https://openalex.org/C149364088","wikidata":"https://www.wikidata.org/wiki/Q185917","display_name":"Translation (biology)","level":4,"score":0.5932999849319458},{"id":"https://openalex.org/C58489278","wikidata":"https://www.wikidata.org/wiki/Q1172284","display_name":"Data set","level":2,"score":0.4611999988555908},{"id":"https://openalex.org/C168167062","wikidata":"https://www.wikidata.org/wiki/Q1117970","display_name":"Component (thermodynamics)","level":2,"score":0.4514999985694885},{"id":"https://openalex.org/C139807058","wikidata":"https://www.wikidata.org/wiki/Q352374","display_name":"Adaptation (eye)","level":2,"score":0.438400000333786},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.4178999960422516},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.39730000495910645},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.382999986410141},{"id":"https://openalex.org/C135784402","wikidata":"https://www.wikidata.org/wiki/Q6958279","display_name":"Evaluation of machine translation","level":5,"score":0.3785000145435333},{"id":"https://openalex.org/C2777211547","wikidata":"https://www.wikidata.org/wiki/Q17141490","display_name":"Training (meteorology)","level":2,"score":0.35910001397132874},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.3522999882698059},{"id":"https://openalex.org/C48677424","wikidata":"https://www.wikidata.org/wiki/Q6888088","display_name":"Mode (computer interface)","level":2,"score":0.30550000071525574},{"id":"https://openalex.org/C2776502983","wikidata":"https://www.wikidata.org/wiki/Q690182","display_name":"Contrast (vision)","level":2,"score":0.2615000009536743},{"id":"https://openalex.org/C2986862884","wikidata":"https://www.wikidata.org/wiki/Q7553","display_name":"Language translation","level":3,"score":0.2596000134944916}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2511.07010","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2511.07010","pdf_url":"https://arxiv.org/pdf/2511.07010","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"doi:10.48550/arxiv.2511.07010","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2511.07010","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2511.07010","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2511.07010","pdf_url":"https://arxiv.org/pdf/2511.07010","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4416158751.pdf"},"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"In":[0],"this":[1],"paper,":[2],"we":[3],"describe":[4],"our":[5],"system":[6],"under":[7],"the":[8,14,25,47,81,152,180,189,199,210],"team":[9],"name":[10],"BLEU":[11,172],"Monday":[12],"for":[13,29,177,196,207],"English-to-Indic":[15],"Multimodal":[16],"Translation":[17],"Task":[18],"at":[19],"WAT":[20],"2025.":[21],"We":[22,37,144],"participate":[23],"in":[24,46,80],"text-only":[26],"translation":[27,78,101,122],"tasks":[28],"English-Hindi,":[30],"English-Bengali,":[31],"English-Malayalam,":[32],"and":[33,54,76,161,186,205],"English-Odia":[34,197],"language":[35,71],"pairs.":[36],"present":[38],"a":[39,64],"two-stage":[40],"approach":[41],"that":[42,68],"addresses":[43],"quality":[44,123],"issues":[45],"training":[48,82,130],"data":[49,167],"through":[50],"automated":[51,126],"error":[52],"detection":[53],"correction,":[55],"followed":[56],"by":[57],"parameter-efficient":[58],"model":[59,157],"fine-tuning.":[60],"Our":[61],"methodology":[62],"introduces":[63],"vision-augmented":[65],"judge-corrector":[66],"pipeline":[67,127],"leverages":[69],"multimodal":[70],"models":[72],"to":[73,107,150],"systematically":[74],"identify":[75],"correct":[77],"errors":[79,104],"data.":[83],"The":[84],"judge":[85],"component":[86],"classifies":[87],"translations":[88],"into":[89],"three":[90],"categories:":[91],"correct,":[92],"visually":[93],"ambiguous":[94],"(requiring":[95],"image":[96],"context),":[97],"or":[98],"mistranslated":[99],"(poor":[100],"quality).":[102],"Identified":[103],"are":[105],"routed":[106],"specialized":[108],"correctors:":[109],"GPT-4o-mini":[110],"regenerates":[111],"captions":[112,141],"requiring":[113],"visual":[114],"disambiguation,":[115],"while":[116],"IndicTrans2":[117,153],"retranslates":[118],"cases":[119],"with":[120,171],"pure":[121],"issues.":[124],"This":[125],"processes":[128],"28,928":[129],"examples":[131],"across":[132],"four":[133],"languages,":[134],"correcting":[135],"an":[136],"average":[137],"of":[138,140,175],"17.1%":[139],"per":[142],"language.":[143],"then":[145],"apply":[146],"Low-Rank":[147],"Adaptation":[148],"(LoRA)":[149],"fine-tune":[151],"en-indic":[154],"200M":[155],"distilled":[156],"on":[158,165,179,188,198,209],"both":[159],"original":[160],"corrected":[162,166],"datasets.":[163],"Training":[164],"yields":[168],"consistent":[169],"improvements,":[170],"score":[173],"gains":[174],"+1.30":[176],"English-Bengali":[178],"evaluation":[181,200],"set":[182,191,201,212],"(42.00":[183],"-&gt;":[184,193,203,214],"43.30)":[185],"+0.70":[187],"challenge":[190,211],"(44.90":[192],"45.60),":[194],"+0.60":[195],"(41.00":[202],"41.60),":[204],"+0.10":[206],"English-Hindi":[208],"(53.90":[213],"54.00).":[215]},"counts_by_year":[],"updated_date":"2026-03-11T14:59:36.786465","created_date":"2025-11-12T00:00:00"}
