{"id":"https://openalex.org/W7140856806","doi":"https://doi.org/10.48550/arxiv.2603.23896","title":"MMTIT-Bench: A Multilingual and Multi-Scenario Benchmark with Cognition-Perception-Reasoning Guided Text-Image Machine Translation","display_name":"MMTIT-Bench: A Multilingual and Multi-Scenario Benchmark with Cognition-Perception-Reasoning Guided Text-Image Machine Translation","publication_year":2026,"publication_date":"2026-03-25","ids":{"openalex":"https://openalex.org/W7140856806","doi":"https://doi.org/10.48550/arxiv.2603.23896"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.23896","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.23896","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.23896","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5058261095","display_name":"Gengluo Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Gengluo","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130673188","display_name":"Chengquan Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Chengquan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5016559205","display_name":"Yupu Liang","orcid":"https://orcid.org/0000-0003-4074-463X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liang, Yupu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5122369826","display_name":"Huawen Shen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shen, Huawen","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130714088","display_name":"Yaping Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Yaping","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5027156714","display_name":"Pengyuan Lyu","orcid":"https://orcid.org/0000-0003-3153-8519"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lyu, Pengyuan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130679526","display_name":"Weinong Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Weinong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102652743","display_name":"Xingyu Wan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wan, Xingyu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5046229717","display_name":"Gangyan Zeng","orcid":"https://orcid.org/0000-0003-2696-8549"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zeng, Gangyan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130643804","display_name":"Han Hu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hu, Han","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130667361","display_name":"Can Ma","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ma, Can","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5130695721","display_name":"Yu Zhou","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhou, Yu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":12,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.6259999871253967,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.6259999871253967,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.1266999989748001,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.04450000077486038,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/machine-translation","display_name":"Machine translation","score":0.6323000192642212},{"id":"https://openalex.org/keywords/robustness","display_name":"Robustness (evolution)","score":0.6047999858856201},{"id":"https://openalex.org/keywords/parsing","display_name":"Parsing","score":0.5213000178337097},{"id":"https://openalex.org/keywords/focus","display_name":"Focus (optics)","score":0.4867999851703644},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.4787999987602234},{"id":"https://openalex.org/keywords/visual-reasoning","display_name":"Visual reasoning","score":0.4569999873638153},{"id":"https://openalex.org/keywords/translation","display_name":"Translation (biology)","score":0.38690000772476196}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7760999798774719},{"id":"https://openalex.org/C203005215","wikidata":"https://www.wikidata.org/wiki/Q79798","display_name":"Machine translation","level":2,"score":0.6323000192642212},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6187000274658203},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.6047999858856201},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.5845999717712402},{"id":"https://openalex.org/C186644900","wikidata":"https://www.wikidata.org/wiki/Q194152","display_name":"Parsing","level":2,"score":0.5213000178337097},{"id":"https://openalex.org/C192209626","wikidata":"https://www.wikidata.org/wiki/Q190909","display_name":"Focus (optics)","level":2,"score":0.4867999851703644},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.4787999987602234},{"id":"https://openalex.org/C2777508537","wikidata":"https://www.wikidata.org/wiki/Q7936620","display_name":"Visual reasoning","level":2,"score":0.4569999873638153},{"id":"https://openalex.org/C149364088","wikidata":"https://www.wikidata.org/wiki/Q185917","display_name":"Translation (biology)","level":4,"score":0.38690000772476196},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.3666999936103821},{"id":"https://openalex.org/C2780035574","wikidata":"https://www.wikidata.org/wiki/Q30081","display_name":"Multilingualism","level":2,"score":0.3531000018119812},{"id":"https://openalex.org/C155092808","wikidata":"https://www.wikidata.org/wiki/Q182557","display_name":"Computational linguistics","level":2,"score":0.3452000021934509},{"id":"https://openalex.org/C195344581","wikidata":"https://www.wikidata.org/wiki/Q2555318","display_name":"Automated reasoning","level":2,"score":0.3260999917984009},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3165000081062317},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.2962000072002411},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.2854999899864197},{"id":"https://openalex.org/C53893814","wikidata":"https://www.wikidata.org/wiki/Q7378909","display_name":"Rule-based machine translation","level":2,"score":0.2773999869823456}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.23896","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.23896","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.23896","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.23896","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","score":0.5238649845123291,"display_name":"Quality Education"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"End-to-end":[0],"text-image":[1],"machine":[2],"translation":[3,112,145],"(TIMT),":[4],"which":[5],"directly":[6],"translates":[7],"textual":[8],"content":[9],"in":[10,23,113,177],"images":[11,54],"across":[12,29],"languages,":[13],"is":[14],"crucial":[15],"for":[16,101,132],"real-world":[17],"multilingual":[18,48,188],"scene":[19,140],"understanding.":[20],"Despite":[21],"advances":[22],"vision-language":[24],"large":[25],"models":[26,173],"(VLLMs),":[27],"robustness":[28],"diverse":[30,62],"visual":[31,124],"scenes":[32],"and":[33,49,58,61,68,111,144,171,179,189],"low-resource":[34],"languages":[35,60],"remains":[36],"underexplored":[37],"due":[38],"to":[39,92,127,185],"limited":[40],"evaluation":[41],"resources.":[42],"We":[43,129,181],"present":[44],"MMTIT-Bench,":[45],"a":[46,114,135,148,153],"human-verified":[47],"multi-scenario":[50,190],"benchmark":[51],"with":[52,166],"1,400":[53],"spanning":[55],"fourteen":[56],"non-English":[57],"non-Chinese":[59],"settings":[63],"such":[64],"as":[65],"documents,":[66],"scenes,":[67],"web":[69],"images,":[70],"enabling":[71],"rigorous":[72],"assessment":[73],"of":[74],"end-to-end":[75],"TIMT.":[76],"Beyond":[77],"benchmarking,":[78],"we":[79],"study":[80],"how":[81],"reasoning-oriented":[82],"data":[83,136,155],"design":[84],"improves":[85],"translation.":[86],"Although":[87],"recent":[88],"VLLMs":[89],"have":[90],"begun":[91],"incorporate":[93],"long":[94],"Chain-of-Thought":[95],"(CoT)":[96],"reasoning,":[97,121],"effective":[98],"thinking":[99],"paradigms":[100],"TIMT":[102,191],"are":[103],"still":[104],"immature:":[105],"existing":[106],"designs":[107],"either":[108],"cascade":[109],"parsing":[110],"sequential":[115],"manner":[116],"or":[117],"focus":[118],"on":[119,169],"language-only":[120],"overlooking":[122],"the":[123,187],"cognition":[125],"central":[126],"VLLMs.":[128],"propose":[130],"Cognition-Perception-Reasoning":[131],"Translation":[133],"(CPR-Trans),":[134],"paradigm":[137],"that":[138,163],"integrates":[139],"cognition,":[141],"text":[142],"perception,":[143],"reasoning":[146,150],"within":[147],"unified":[149],"process.":[151],"Using":[152],"VLLM-driven":[154],"generation":[156],"pipeline,":[157],"CPR-Trans":[158],"provides":[159],"structured,":[160],"interpretable":[161],"supervision":[162],"aligns":[164],"perception":[165],"reasoning.":[167],"Experiments":[168],"3B":[170],"7B":[172],"show":[174],"consistent":[175],"gains":[176],"accuracy":[178],"interpretability.":[180],"will":[182],"release":[183],"MMTIT-Bench":[184],"promote":[186],"research":[192],"upon":[193],"acceptance.":[194]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-03-27T00:00:00"}
