{"id":"https://openalex.org/W7160245916","doi":"https://doi.org/10.48550/arxiv.2605.00907","title":"TRIP-Evaluate: An Open Multimodal Benchmark for Evaluating Large Models in Transportation","display_name":"TRIP-Evaluate: An Open Multimodal Benchmark for Evaluating Large Models in Transportation","publication_year":2026,"publication_date":"2026-04-29","ids":{"openalex":"https://openalex.org/W7160245916","doi":"https://doi.org/10.48550/arxiv.2605.00907"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.00907","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.00907","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.00907","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5135374981","display_name":"Han Gong","orcid":"https://orcid.org/0009-0007-9405-3427"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gong, Han","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135315071","display_name":"Zhen Zhou","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhou, Zhen","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135322029","display_name":"Yunyang Shi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shi, Yunyang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135288361","display_name":"Yan Tan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tan, Yan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5016310648","display_name":"Jinbiao Huo","orcid":"https://orcid.org/0000-0003-0022-420X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Huo, Jinbiao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135314893","display_name":"Qi Hong","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hong, Qi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5135408364","display_name":"Zhiyuan Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Zhiyuan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":7,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.3887999951839447,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.3887999951839447,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.1185000017285347,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.07429999858140945,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.7074999809265137},{"id":"https://openalex.org/keywords/baseline","display_name":"Baseline (sea)","score":0.6270999908447266},{"id":"https://openalex.org/keywords/workflow","display_name":"Workflow","score":0.6172000169754028},{"id":"https://openalex.org/keywords/safer","display_name":"SAFER","score":0.5223000049591064},{"id":"https://openalex.org/keywords/scope","display_name":"Scope (computer science)","score":0.44859999418258667},{"id":"https://openalex.org/keywords/strengths-and-weaknesses","display_name":"Strengths and weaknesses","score":0.420199990272522},{"id":"https://openalex.org/keywords/taxonomy","display_name":"Taxonomy (biology)","score":0.39500001072883606},{"id":"https://openalex.org/keywords/software-deployment","display_name":"Software deployment","score":0.3822999894618988},{"id":"https://openalex.org/keywords/anomaly-detection","display_name":"Anomaly detection","score":0.3822000026702881}],"concepts":[{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.7074999809265137},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7044000029563904},{"id":"https://openalex.org/C12725497","wikidata":"https://www.wikidata.org/wiki/Q810247","display_name":"Baseline (sea)","level":2,"score":0.6270999908447266},{"id":"https://openalex.org/C177212765","wikidata":"https://www.wikidata.org/wiki/Q627335","display_name":"Workflow","level":2,"score":0.6172000169754028},{"id":"https://openalex.org/C2776654903","wikidata":"https://www.wikidata.org/wiki/Q2601463","display_name":"SAFER","level":2,"score":0.5223000049591064},{"id":"https://openalex.org/C2778012447","wikidata":"https://www.wikidata.org/wiki/Q1034415","display_name":"Scope (computer science)","level":2,"score":0.44859999418258667},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.42570000886917114},{"id":"https://openalex.org/C63882131","wikidata":"https://www.wikidata.org/wiki/Q17122954","display_name":"Strengths and weaknesses","level":2,"score":0.420199990272522},{"id":"https://openalex.org/C58642233","wikidata":"https://www.wikidata.org/wiki/Q8269924","display_name":"Taxonomy (biology)","level":2,"score":0.39500001072883606},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.39430001378059387},{"id":"https://openalex.org/C105339364","wikidata":"https://www.wikidata.org/wiki/Q2297740","display_name":"Software deployment","level":2,"score":0.3822999894618988},{"id":"https://openalex.org/C739882","wikidata":"https://www.wikidata.org/wiki/Q3560506","display_name":"Anomaly detection","level":2,"score":0.3822000026702881},{"id":"https://openalex.org/C187191949","wikidata":"https://www.wikidata.org/wiki/Q1138496","display_name":"Profiling (computer programming)","level":2,"score":0.36399999260902405},{"id":"https://openalex.org/C147494362","wikidata":"https://www.wikidata.org/wiki/Q2078905","display_name":"Troubleshooting","level":2,"score":0.36070001125335693},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.33709999918937683},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.3361000120639801},{"id":"https://openalex.org/C168167062","wikidata":"https://www.wikidata.org/wiki/Q1117970","display_name":"Component (thermodynamics)","level":2,"score":0.3019999861717224},{"id":"https://openalex.org/C79581498","wikidata":"https://www.wikidata.org/wiki/Q1367530","display_name":"Suite","level":2,"score":0.2964000105857849},{"id":"https://openalex.org/C45804977","wikidata":"https://www.wikidata.org/wiki/Q7239673","display_name":"Predictive modelling","level":2,"score":0.2962000072002411},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.29159998893737793},{"id":"https://openalex.org/C2777974031","wikidata":"https://www.wikidata.org/wiki/Q493641","display_name":"Multimodal transport","level":2,"score":0.2867000102996826},{"id":"https://openalex.org/C74448152","wikidata":"https://www.wikidata.org/wiki/Q765633","display_name":"Aviation","level":2,"score":0.27079999446868896},{"id":"https://openalex.org/C66024118","wikidata":"https://www.wikidata.org/wiki/Q1122506","display_name":"Computational model","level":2,"score":0.26969999074935913},{"id":"https://openalex.org/C115903868","wikidata":"https://www.wikidata.org/wiki/Q80993","display_name":"Software engineering","level":1,"score":0.26019999384880066},{"id":"https://openalex.org/C2780910867","wikidata":"https://www.wikidata.org/wiki/Q1952416","display_name":"Multimodality","level":2,"score":0.25940001010894775},{"id":"https://openalex.org/C2776459999","wikidata":"https://www.wikidata.org/wiki/Q2119376","display_name":"Fidelity","level":2,"score":0.257999986410141},{"id":"https://openalex.org/C184356942","wikidata":"https://www.wikidata.org/wiki/Q830382","display_name":"Best practice","level":2,"score":0.2524000108242035}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.00907","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.00907","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.00907","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.00907","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Large":[0],"language":[1],"models":[2,7,98,173],"(LLMs)":[3],"and":[4,25,36,74,82,115,125,148,161,193,202,211],"multimodal":[5,94,190],"large":[6,97],"(MLLMs)":[8],"are":[9,32],"increasingly":[10],"used":[11],"for":[12,96,206],"transportation":[13,30,68,215],"tasks":[14],"such":[15],"as":[16],"regulation":[17],"question":[18],"answering,":[19],"traffic":[20,59],"management":[21],"support,":[22],"engineering":[23,55,186],"review,":[24],"autonomous-driving":[26],"scene":[27,191],"reasoning.":[28],"Yet":[29],"workflows":[31],"rule-intensive,":[33],"computation-intensive,":[34],"safety-critical,":[35],"inherently":[37],"multimodal.":[38],"Existing":[39],"general":[40],"benchmarks":[41,69],"provide":[42],"limited":[43],"evidence":[44],"of":[45,66,172],"whether":[46],"a":[47,107,169,199],"model":[48,207],"can":[49],"apply":[50],"regulations":[51],"correctly,":[52],"perform":[53],"verifiable":[54],"calculations,":[56],"or":[57],"interpret":[58],"scenes":[60],"reliably,":[61],"while":[62],"the":[63],"small":[64],"number":[65],"public":[67],"remain":[70,183],"narrow":[71],"in":[72,99,184,214],"scope":[73],"rarely":[75],"support":[76],"fine-grained":[77],"diagnosis":[78,129],"across":[79],"text,":[80],"images,":[81],"point-cloud":[83,150,194],"data.":[84],"To":[85],"address":[86],"this":[87],"gap,":[88],"we":[89],"present":[90],"TRIP-Evaluate,":[91],"an":[92],"open":[93],"benchmark":[95,102],"transportation.":[100],"The":[101,138],"organizes":[103],"837":[104],"items":[105],"using":[106],"role-task-knowledge":[108],"taxonomy":[109],"that":[110,175],"covers":[111],"vehicle,":[112],"traffic-management,":[113],"traveler,":[114],"planning-and-design":[116],"functions.":[117],"Each":[118],"item":[119,155],"is":[120,178],"annotated":[121],"with":[122],"capability,":[123],"modality,":[124],"difficulty":[126],"labels,":[127],"enabling":[128],"from":[130],"overall":[131],"accuracy":[132],"down":[133],"to":[134,163],"specific":[135],"failure":[136],"modes.":[137],"current":[139],"release":[140],"includes":[141],"596":[142],"text":[143],"items,":[144,147],"198":[145],"image":[146],"43":[149],"items.":[151],"TRIP-Evaluate":[152,197],"also":[153],"standardizes":[154],"construction,":[156],"quality":[157],"control,":[158],"prompting,":[159],"decoding,":[160],"scoring":[162],"improve":[164],"cross-model":[165],"comparability.":[166],"Results":[167],"on":[168],"diverse":[170],"panel":[171],"show":[174],"text-based":[176],"performance":[177],"improving,":[179],"but":[180],"substantial":[181],"weaknesses":[182],"multi-step":[185],"calculation,":[187],"rule-constrained":[188],"reasoning,":[189],"understanding,":[192],"understanding.":[195],"Overall,":[196],"provides":[198],"reproducible,":[200],"diagnosable,":[201],"engineering-aligned":[203],"evaluation":[204],"baseline":[205],"selection,":[208],"regression":[209],"testing,":[210],"safer":[212],"deployment":[213],"applications.":[216]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-05-06T00:00:00"}
