{"id":"https://openalex.org/W7154380296","doi":"https://doi.org/10.48550/arxiv.2604.10784","title":"TorchUMM: A Unified Multimodal Model Codebase for Evaluation, Analysis, and Post-training","display_name":"TorchUMM: A Unified Multimodal Model Codebase for Evaluation, Analysis, and Post-training","publication_year":2026,"publication_date":"2026-04-12","ids":{"openalex":"https://openalex.org/W7154380296","doi":"https://doi.org/10.48550/arxiv.2604.10784"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.10784","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.10784","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.10784","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5133575765","display_name":"Yinyi Luo","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Luo, Yinyi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133595368","display_name":"Wenwen Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Wenwen","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133607514","display_name":"Hayes Bai","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bai, Hayes","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133621745","display_name":"Hongyu Zhu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhu, Hongyu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133580156","display_name":"Hao Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Hao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133553907","display_name":"Pan He","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"He, Pan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133591829","display_name":"Marios Savvides","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Savvides, Marios","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133599629","display_name":"Sharon Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Sharon","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5133576813","display_name":"Jindong Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Jindong","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":9,"corresponding_author_ids":["https://openalex.org/A5133575765"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.8644000291824341,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.8644000291824341,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.0617000013589859,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.009800000116229057,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/codebase","display_name":"Codebase","score":0.8256999850273132},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.532800018787384},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.5306000113487244},{"id":"https://openalex.org/keywords/multimodality","display_name":"Multimodality","score":0.5030999779701233},{"id":"https://openalex.org/keywords/unified-model","display_name":"Unified Model","score":0.43799999356269836},{"id":"https://openalex.org/keywords/code","display_name":"Code (set theory)","score":0.42309999465942383},{"id":"https://openalex.org/keywords/interface","display_name":"Interface (matter)","score":0.3815000057220459},{"id":"https://openalex.org/keywords/multimodal-interaction","display_name":"Multimodal interaction","score":0.3449000120162964}],"concepts":[{"id":"https://openalex.org/C51929080","wikidata":"https://www.wikidata.org/wiki/Q2425187","display_name":"Codebase","level":3,"score":0.8256999850273132},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7792999744415283},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.532800018787384},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.5306000113487244},{"id":"https://openalex.org/C2780910867","wikidata":"https://www.wikidata.org/wiki/Q1952416","display_name":"Multimodality","level":2,"score":0.5030999779701233},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.44369998574256897},{"id":"https://openalex.org/C45493050","wikidata":"https://www.wikidata.org/wiki/Q7884934","display_name":"Unified Model","level":2,"score":0.43799999356269836},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.42309999465942383},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3887999951839447},{"id":"https://openalex.org/C113843644","wikidata":"https://www.wikidata.org/wiki/Q901882","display_name":"Interface (matter)","level":4,"score":0.3815000057220459},{"id":"https://openalex.org/C135641252","wikidata":"https://www.wikidata.org/wiki/Q738567","display_name":"Multimodal interaction","level":2,"score":0.3449000120162964},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3352000117301941},{"id":"https://openalex.org/C204323151","wikidata":"https://www.wikidata.org/wiki/Q905424","display_name":"Range (aeronautics)","level":2,"score":0.32499998807907104},{"id":"https://openalex.org/C168167062","wikidata":"https://www.wikidata.org/wiki/Q1117970","display_name":"Component (thermodynamics)","level":2,"score":0.31859999895095825},{"id":"https://openalex.org/C3020442560","wikidata":"https://www.wikidata.org/wiki/Q4971815","display_name":"Broad spectrum","level":2,"score":0.289900004863739},{"id":"https://openalex.org/C192209626","wikidata":"https://www.wikidata.org/wiki/Q190909","display_name":"Focus (optics)","level":2,"score":0.28290000557899475},{"id":"https://openalex.org/C43126263","wikidata":"https://www.wikidata.org/wiki/Q128751","display_name":"Source code","level":2,"score":0.2793999910354614},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.2721000015735626},{"id":"https://openalex.org/C9652623","wikidata":"https://www.wikidata.org/wiki/Q190109","display_name":"Field (mathematics)","level":2,"score":0.2703999876976013},{"id":"https://openalex.org/C115903868","wikidata":"https://www.wikidata.org/wiki/Q80993","display_name":"Software engineering","level":1,"score":0.2644999921321869},{"id":"https://openalex.org/C2164484","wikidata":"https://www.wikidata.org/wiki/Q5170150","display_name":"Core (optical fiber)","level":2,"score":0.2639999985694885},{"id":"https://openalex.org/C2779903281","wikidata":"https://www.wikidata.org/wiki/Q6888026","display_name":"Modalities","level":2,"score":0.2583000063896179},{"id":"https://openalex.org/C124304363","wikidata":"https://www.wikidata.org/wiki/Q673661","display_name":"Abstraction","level":2,"score":0.2565999925136566},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.2540000081062317}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.10784","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.10784","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.10784","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.10784","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Recent":[0],"advances":[1],"in":[2],"unified":[3,28,58,119,149],"multimodal":[4,96,150],"models":[5,79,133],"(UMMs)":[6],"have":[7],"led":[8],"to":[9,35,108],"a":[10,27,75,81,118],"proliferation":[11],"of":[12,15,38,44,78,84,146],"architectures":[13,40],"capable":[14,148],"understanding,":[16,97],"generating,":[17],"and":[18,22,41,47,64,71,86,99,101,105,113,121,128,134,141],"editing":[19],"across":[20,66,131],"visual":[21],"textual":[23],"modalities.":[24],"However,":[25],"developing":[26],"framework":[29],"for":[30,60],"UMMs":[31],"remains":[32],"challenging":[33],"due":[34],"the":[36,42,56,144],"diversity":[37],"model":[39],"heterogeneity":[43],"training":[45],"paradigms":[46],"implementation":[48],"details.":[49],"In":[50],"this":[51],"paper,":[52],"we":[53],"present":[54],"TorchUMM,":[55],"first":[57],"codebase":[59],"comprehensive":[61],"evaluation,":[62],"analysis,":[63],"post-training":[65],"diverse":[67],"UMM":[68],"backbones,":[69],"tasks,":[70],"datasets.":[72],"TorchUMM":[73,125],"supports":[74],"broad":[76],"spectrum":[77],"covering":[80],"wide":[82],"range":[83],"scales":[85],"design":[87],"paradigms.":[88],"Our":[89],"benchmark":[90],"encompasses":[91],"three":[92],"core":[93],"task":[94],"dimensions:":[95],"generation,":[98],"editing,":[100],"integrates":[102],"both":[103],"established":[104],"novel":[106],"datasets":[107],"evaluate":[109],"perception,":[110],"reasoning,":[111],"compositionality,":[112],"instruction-following":[114],"abilities.":[115],"By":[116],"providing":[117],"interface":[120],"standardized":[122],"evaluation":[123],"protocols,":[124],"enables":[126],"fair":[127],"reproducible":[129],"comparisons":[130],"heterogeneous":[132],"fosters":[135],"deeper":[136],"insights":[137],"into":[138],"their":[139],"strengths":[140],"limitations,":[142],"facilitating":[143],"development":[145],"more":[147],"systems.":[151],"Code":[152],"is":[153],"available":[154],"at:":[155],"https://github.com/AIFrontierLab/TorchUMM.":[156]},"counts_by_year":[],"updated_date":"2026-04-15T06:04:33.058270","created_date":"2026-04-15T00:00:00"}
