{"id":"https://openalex.org/W7126103848","doi":"https://doi.org/10.1109/tmm.2026.3659297","title":"TMT: Tri-Modal Translation Between Speech, Image, and Text by Processing Different Modalities as Different Languages","display_name":"TMT: Tri-Modal Translation Between Speech, Image, and Text by Processing Different Modalities as Different Languages","publication_year":2026,"publication_date":"2026-01-01","ids":{"openalex":"https://openalex.org/W7126103848","doi":"https://doi.org/10.1109/tmm.2026.3659297"},"language":null,"primary_location":{"id":"doi:10.1109/tmm.2026.3659297","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tmm.2026.3659297","pdf_url":null,"source":{"id":"https://openalex.org/S137030581","display_name":"IEEE Transactions on Multimedia","issn_l":"1520-9210","issn":["1520-9210","1941-0077"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Multimedia","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5124204280","display_name":"Minsu Kim","orcid":null},"institutions":[{"id":"https://openalex.org/I157485424","display_name":"Korea Advanced Institute of Science and Technology","ror":"https://ror.org/05apxxy63","country_code":"KR","type":"education","lineage":["https://openalex.org/I157485424"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Minsu Kim","raw_affiliation_strings":["Integrated Vision and Language Lab., School of Electrical Engineering, Korea Advanced Institute of Science and Technology (KAIST), Daejeon, Republic of Korea"],"raw_orcid":"https://orcid.org/0000-0002-6514-0018","affiliations":[{"raw_affiliation_string":"Integrated Vision and Language Lab., School of Electrical Engineering, Korea Advanced Institute of Science and Technology (KAIST), Daejeon, Republic of Korea","institution_ids":["https://openalex.org/I157485424"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5124242658","display_name":"Jee-weon Jung","orcid":null},"institutions":[{"id":"https://openalex.org/I74973139","display_name":"Carnegie Mellon University","ror":"https://ror.org/05x2bcf33","country_code":"US","type":"education","lineage":["https://openalex.org/I74973139"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Jee-weon Jung","raw_affiliation_strings":["Language Technologies Institute, Carnegie Mellon University, Pittsburgh, PA, USA"],"raw_orcid":"https://orcid.org/0000-0003-0505-2988","affiliations":[{"raw_affiliation_string":"Language Technologies Institute, Carnegie Mellon University, Pittsburgh, PA, USA","institution_ids":["https://openalex.org/I74973139"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5124151384","display_name":"Hyeongseop Rha","orcid":null},"institutions":[{"id":"https://openalex.org/I157485424","display_name":"Korea Advanced Institute of Science and Technology","ror":"https://ror.org/05apxxy63","country_code":"KR","type":"education","lineage":["https://openalex.org/I157485424"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Hyeongseop Rha","raw_affiliation_strings":["Integrated Vision and Language Lab., School of Electrical Engineering, Korea Advanced Institute of Science and Technology (KAIST), Daejeon, Republic of Korea"],"raw_orcid":"https://orcid.org/0009-0004-9301-2760","affiliations":[{"raw_affiliation_string":"Integrated Vision and Language Lab., School of Electrical Engineering, Korea Advanced Institute of Science and Technology (KAIST), Daejeon, Republic of Korea","institution_ids":["https://openalex.org/I157485424"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5010858961","display_name":"Soumi Maiti","orcid":"https://orcid.org/0000-0001-6940-0115"},"institutions":[{"id":"https://openalex.org/I74973139","display_name":"Carnegie Mellon University","ror":"https://ror.org/05x2bcf33","country_code":"US","type":"education","lineage":["https://openalex.org/I74973139"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Soumi Maiti","raw_affiliation_strings":["Language Technologies Institute, Carnegie Mellon University, Pittsburgh, PA, USA"],"raw_orcid":"https://orcid.org/0000-0001-6940-0115","affiliations":[{"raw_affiliation_string":"Language Technologies Institute, Carnegie Mellon University, Pittsburgh, PA, USA","institution_ids":["https://openalex.org/I74973139"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5047892839","display_name":"Siddhant Arora","orcid":"https://orcid.org/0000-0003-0375-496X"},"institutions":[{"id":"https://openalex.org/I74973139","display_name":"Carnegie Mellon University","ror":"https://ror.org/05x2bcf33","country_code":"US","type":"education","lineage":["https://openalex.org/I74973139"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Siddhant Arora","raw_affiliation_strings":["Language Technologies Institute, Carnegie Mellon University, Pittsburgh, PA, USA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Language Technologies Institute, Carnegie Mellon University, Pittsburgh, PA, USA","institution_ids":["https://openalex.org/I74973139"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5050058892","display_name":"Xuankai Chang","orcid":"https://orcid.org/0000-0002-5221-5412"},"institutions":[{"id":"https://openalex.org/I74973139","display_name":"Carnegie Mellon University","ror":"https://ror.org/05x2bcf33","country_code":"US","type":"education","lineage":["https://openalex.org/I74973139"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Xuankai Chang","raw_affiliation_strings":["Language Technologies Institute, Carnegie Mellon University, Pittsburgh, PA, USA"],"raw_orcid":"https://orcid.org/0000-0002-5221-5412","affiliations":[{"raw_affiliation_string":"Language Technologies Institute, Carnegie Mellon University, Pittsburgh, PA, USA","institution_ids":["https://openalex.org/I74973139"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5123785135","display_name":"Shinji Watanabe","orcid":null},"institutions":[{"id":"https://openalex.org/I74973139","display_name":"Carnegie Mellon University","ror":"https://ror.org/05x2bcf33","country_code":"US","type":"education","lineage":["https://openalex.org/I74973139"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Shinji Watanabe","raw_affiliation_strings":["Language Technologies Institute, Carnegie Mellon University, Pittsburgh, PA, USA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Language Technologies Institute, Carnegie Mellon University, Pittsburgh, PA, USA","institution_ids":["https://openalex.org/I74973139"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5124190178","display_name":"Yong Man Ro","orcid":null},"institutions":[{"id":"https://openalex.org/I157485424","display_name":"Korea Advanced Institute of Science and Technology","ror":"https://ror.org/05apxxy63","country_code":"KR","type":"education","lineage":["https://openalex.org/I157485424"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Yong Man Ro","raw_affiliation_strings":["Integrated Vision and Language Lab., School of Electrical Engineering, Korea Advanced Institute of Science and Technology (KAIST), Daejeon, Republic of Korea"],"raw_orcid":"https://orcid.org/0000-0001-5306-6853","affiliations":[{"raw_affiliation_string":"Integrated Vision and Language Lab., School of Electrical Engineering, Korea Advanced Institute of Science and Technology (KAIST), Daejeon, Republic of Korea","institution_ids":["https://openalex.org/I157485424"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":8,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.1156369,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"28","issue":null,"first_page":"1976","last_page":"1988"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.4043000042438507,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.4043000042438507,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.24120000004768372,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.04919999837875366,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/modalities","display_name":"Modalities","score":0.7773000001907349},{"id":"https://openalex.org/keywords/translation","display_name":"Translation (biology)","score":0.7085000276565552},{"id":"https://openalex.org/keywords/machine-translation","display_name":"Machine translation","score":0.6488999724388123},{"id":"https://openalex.org/keywords/modality","display_name":"Modality (human\u2013computer interaction)","score":0.6201000213623047},{"id":"https://openalex.org/keywords/image-translation","display_name":"Image translation","score":0.5577999949455261},{"id":"https://openalex.org/keywords/process","display_name":"Process (computing)","score":0.5142999887466431},{"id":"https://openalex.org/keywords/image","display_name":"Image (mathematics)","score":0.5037999749183655},{"id":"https://openalex.org/keywords/reduction","display_name":"Reduction (mathematics)","score":0.43070000410079956}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.854200005531311},{"id":"https://openalex.org/C2779903281","wikidata":"https://www.wikidata.org/wiki/Q6888026","display_name":"Modalities","level":2,"score":0.7773000001907349},{"id":"https://openalex.org/C149364088","wikidata":"https://www.wikidata.org/wiki/Q185917","display_name":"Translation (biology)","level":4,"score":0.7085000276565552},{"id":"https://openalex.org/C203005215","wikidata":"https://www.wikidata.org/wiki/Q79798","display_name":"Machine translation","level":2,"score":0.6488999724388123},{"id":"https://openalex.org/C2780226545","wikidata":"https://www.wikidata.org/wiki/Q6888030","display_name":"Modality (human\u2013computer interaction)","level":2,"score":0.6201000213623047},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6047000288963318},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.5967000126838684},{"id":"https://openalex.org/C2779757391","wikidata":"https://www.wikidata.org/wiki/Q6002292","display_name":"Image translation","level":3,"score":0.5577999949455261},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5205000042915344},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.5142999887466431},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.5037999749183655},{"id":"https://openalex.org/C111335779","wikidata":"https://www.wikidata.org/wiki/Q3454686","display_name":"Reduction (mathematics)","level":2,"score":0.43070000410079956},{"id":"https://openalex.org/C2780586882","wikidata":"https://www.wikidata.org/wiki/Q7520643","display_name":"Simple (philosophy)","level":2,"score":0.388700008392334},{"id":"https://openalex.org/C9417928","wikidata":"https://www.wikidata.org/wiki/Q1070689","display_name":"Image processing","level":3,"score":0.3628000020980835},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3472000062465668},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.33980000019073486},{"id":"https://openalex.org/C67186912","wikidata":"https://www.wikidata.org/wiki/Q367664","display_name":"Data modeling","level":2,"score":0.32659998536109924},{"id":"https://openalex.org/C2986862884","wikidata":"https://www.wikidata.org/wiki/Q7553","display_name":"Language translation","level":3,"score":0.32249999046325684},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.3091999888420105},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.2946000099182129},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.27230000495910645},{"id":"https://openalex.org/C53893814","wikidata":"https://www.wikidata.org/wiki/Q7378909","display_name":"Rule-based machine translation","level":2,"score":0.26910001039505005},{"id":"https://openalex.org/C24687705","wikidata":"https://www.wikidata.org/wiki/Q3753284","display_name":"Example-based machine translation","level":3,"score":0.26350000500679016},{"id":"https://openalex.org/C174348530","wikidata":"https://www.wikidata.org/wiki/Q188635","display_name":"Bridging (networking)","level":2,"score":0.2603999972343445},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.2556999921798706},{"id":"https://openalex.org/C75294576","wikidata":"https://www.wikidata.org/wiki/Q5165192","display_name":"Contextual image classification","level":3,"score":0.25459998846054077}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/tmm.2026.3659297","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tmm.2026.3659297","pdf_url":null,"source":{"id":"https://openalex.org/S137030581","display_name":"IEEE Transactions on Multimedia","issn_l":"1520-9210","issn":["1520-9210","1941-0077"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Multimedia","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G5178579001","display_name":null,"funder_award_id":"NRF-2022R1A2C2005529","funder_id":"https://openalex.org/F4320322120","funder_display_name":"National Research Foundation of Korea"}],"funders":[{"id":"https://openalex.org/F4320322120","display_name":"National Research Foundation of Korea","ror":"https://ror.org/013aysd81"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":68,"referenced_works":["https://openalex.org/W68733909","https://openalex.org/W1861492603","https://openalex.org/W1905882502","https://openalex.org/W1956340063","https://openalex.org/W2101105183","https://openalex.org/W2108598243","https://openalex.org/W2121879602","https://openalex.org/W2133459682","https://openalex.org/W2506483933","https://openalex.org/W2886641317","https://openalex.org/W2950681488","https://openalex.org/W2962862718","https://openalex.org/W2963216553","https://openalex.org/W2963341956","https://openalex.org/W2963609956","https://openalex.org/W2972394484","https://openalex.org/W2972495969","https://openalex.org/W2995181338","https://openalex.org/W3001434439","https://openalex.org/W3015215494","https://openalex.org/W3037217258","https://openalex.org/W3043515420","https://openalex.org/W3089472875","https://openalex.org/W3095030004","https://openalex.org/W3140429000","https://openalex.org/W3155217823","https://openalex.org/W3170088426","https://openalex.org/W3173767661","https://openalex.org/W3174311593","https://openalex.org/W3174570731","https://openalex.org/W3175871055","https://openalex.org/W3176641147","https://openalex.org/W3180355996","https://openalex.org/W3180374548","https://openalex.org/W3205644108","https://openalex.org/W3206387123","https://openalex.org/W3209059054","https://openalex.org/W3209984917","https://openalex.org/W3210177631","https://openalex.org/W4211154280","https://openalex.org/W4253028564","https://openalex.org/W4287854499","https://openalex.org/W4296070387","https://openalex.org/W4312393956","https://openalex.org/W4312933868","https://openalex.org/W4367359628","https://openalex.org/W4372260534","https://openalex.org/W4378602476","https://openalex.org/W4382999323","https://openalex.org/W4385245566","https://openalex.org/W4385822683","https://openalex.org/W4385990902","https://openalex.org/W4386071467","https://openalex.org/W4386071707","https://openalex.org/W4386076027","https://openalex.org/W4387609332","https://openalex.org/W4388017359","https://openalex.org/W4390871839","https://openalex.org/W4391164179","https://openalex.org/W4392903033","https://openalex.org/W4392904292","https://openalex.org/W4392904805","https://openalex.org/W4401043304","https://openalex.org/W4401607735","https://openalex.org/W4402716129","https://openalex.org/W4403780768","https://openalex.org/W4406417959","https://openalex.org/W7133185348"],"related_works":[],"abstract_inverted_index":{"The":[0],"capability":[1],"to":[2,137],"jointly":[3],"process":[4],"multi-modal":[5,14,29,69,103],"information":[6],"is":[7,16],"becoming":[8],"essential.":[9],"However,":[10],"the":[11,19,24,129],"development":[12],"of":[13,27],"learning":[15],"hindered":[17],"by":[18,98],"substantial":[20],"computational":[21,95],"requirements":[22],"and":[23,47,55,60,67,83,120,143],"limited":[25],"availability":[26],"paired":[28],"data.":[30],"We":[31,49],"propose":[32],"a":[33,51,72,91],"novel":[34],"Tri-Modal":[35],"Translation":[36],"(TMT)":[37],"model":[38],"that":[39],"translates":[40],"between":[41],"arbitrary":[42],"modalities":[43,62],"spanning":[44],"speech,":[45],"image,":[46],"text.":[48],"introduce":[50],"simple":[52],"yet":[53],"efficient":[54],"effective":[56],"approach,":[57],"treating":[58],"speech":[59,82,141],"image":[61,84,146],"as":[63,71],"discrete":[64,87],"text":[65],"modality":[66,117],"approaching":[68],"translation":[70,75,101,118],"well-established":[73],"machine":[74],"problem.":[76],"To":[77],"this":[78],"end,":[79],"we":[80],"tokenize":[81],"data":[85,106,131,142],"into":[86,102],"tokens,":[88],"resulting":[89],"in":[90,94],"significant":[92],"reduction":[93],"cost.":[96],"Furthermore,":[97],"incorporating":[99],"back":[100],"translation,":[104],"unpaired":[105],"can":[107,114],"also":[108],"be":[109],"utilized":[110],"for":[111,135,140,145],"training.":[112],"TMT":[113,126],"perform":[115],"six":[116],"tasks":[119],"consistently":[121],"outperforms":[122],"its":[123],"single-model":[124],"counterparts.":[125],"significantly":[127],"reduces":[128],"required":[130],"size":[132],"(in":[133],"bits)":[134],"training,":[136],"approximately":[138],"0.2%":[139],"0.04%":[144],"data,":[147],"respectively.":[148]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-01-30T00:00:00"}
