{"id":"https://openalex.org/W7153048149","doi":"https://doi.org/10.48550/arxiv.2604.07355","title":"Prediction Arena: Benchmarking AI Models on Real-World Prediction Markets","display_name":"Prediction Arena: Benchmarking AI Models on Real-World Prediction Markets","publication_year":2026,"publication_date":"2026-03-28","ids":{"openalex":"https://openalex.org/W7153048149","doi":"https://doi.org/10.48550/arxiv.2604.07355"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.07355","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.07355","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.07355","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5045556183","display_name":"Jaden Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Jaden","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5011697388","display_name":"Gardenia Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Gardenia","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133316206","display_name":"Oliver Johansson","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Johansson, Oliver","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133383350","display_name":"Hileamlak Yitayew","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yitayew, Hileamlak","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133391147","display_name":"Kamryn Ohly","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ohly, Kamryn","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5133386789","display_name":"Grace Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Grace","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":6,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11326","display_name":"Stock Market Forecasting Methods","score":0.4580000042915344,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T11326","display_name":"Stock Market Forecasting Methods","score":0.4580000042915344,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T11674","display_name":"Sports Analytics and Performance","score":0.10530000180006027,"subfield":{"id":"https://openalex.org/subfields/2002","display_name":"Economics and Econometrics"},"field":{"id":"https://openalex.org/fields/20","display_name":"Economics, Econometrics and Finance"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T12026","display_name":"Explainable Artificial Intelligence (XAI)","score":0.07010000199079514,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/benchmarking","display_name":"Benchmarking","score":0.7649000287055969},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.6909999847412109},{"id":"https://openalex.org/keywords/prediction-market","display_name":"Prediction market","score":0.5295000076293945},{"id":"https://openalex.org/keywords/predictive-modelling","display_name":"Predictive modelling","score":0.49129998683929443},{"id":"https://openalex.org/keywords/outcome","display_name":"Outcome (game theory)","score":0.43160000443458557},{"id":"https://openalex.org/keywords/range","display_name":"Range (aeronautics)","score":0.427700012922287},{"id":"https://openalex.org/keywords/settlement","display_name":"Settlement (finance)","score":0.4020000100135803},{"id":"https://openalex.org/keywords/frontier","display_name":"Frontier","score":0.37279999256134033},{"id":"https://openalex.org/keywords/ground-truth","display_name":"Ground truth","score":0.3684000074863434}],"concepts":[{"id":"https://openalex.org/C86251818","wikidata":"https://www.wikidata.org/wiki/Q816754","display_name":"Benchmarking","level":2,"score":0.7649000287055969},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.6909999847412109},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.609000027179718},{"id":"https://openalex.org/C2780021719","wikidata":"https://www.wikidata.org/wiki/Q282283","display_name":"Prediction market","level":2,"score":0.5295000076293945},{"id":"https://openalex.org/C149782125","wikidata":"https://www.wikidata.org/wiki/Q160039","display_name":"Econometrics","level":1,"score":0.5113999843597412},{"id":"https://openalex.org/C45804977","wikidata":"https://www.wikidata.org/wiki/Q7239673","display_name":"Predictive modelling","level":2,"score":0.49129998683929443},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.45010000467300415},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.43619999289512634},{"id":"https://openalex.org/C148220186","wikidata":"https://www.wikidata.org/wiki/Q7111912","display_name":"Outcome (game theory)","level":2,"score":0.43160000443458557},{"id":"https://openalex.org/C204323151","wikidata":"https://www.wikidata.org/wiki/Q905424","display_name":"Range (aeronautics)","level":2,"score":0.427700012922287},{"id":"https://openalex.org/C2777063073","wikidata":"https://www.wikidata.org/wiki/Q1553237","display_name":"Settlement (finance)","level":3,"score":0.4020000100135803},{"id":"https://openalex.org/C2778571376","wikidata":"https://www.wikidata.org/wiki/Q1355821","display_name":"Frontier","level":2,"score":0.37279999256134033},{"id":"https://openalex.org/C146849305","wikidata":"https://www.wikidata.org/wiki/Q370766","display_name":"Ground truth","level":2,"score":0.3684000074863434},{"id":"https://openalex.org/C108650721","wikidata":"https://www.wikidata.org/wiki/Q1783253","display_name":"Counterfactual thinking","level":2,"score":0.350600004196167},{"id":"https://openalex.org/C2780719617","wikidata":"https://www.wikidata.org/wiki/Q1030752","display_name":"Salient","level":2,"score":0.34929999709129333},{"id":"https://openalex.org/C180075932","wikidata":"https://www.wikidata.org/wiki/Q5333342","display_name":"Econometric model","level":2,"score":0.3407999873161316},{"id":"https://openalex.org/C2779714256","wikidata":"https://www.wikidata.org/wiki/Q25305062","display_name":"Multiple Models","level":2,"score":0.3253999948501587},{"id":"https://openalex.org/C70437156","wikidata":"https://www.wikidata.org/wiki/Q7228652","display_name":"Pooling","level":2,"score":0.31470000743865967},{"id":"https://openalex.org/C42475967","wikidata":"https://www.wikidata.org/wiki/Q194292","display_name":"Operations research","level":1,"score":0.31279999017715454},{"id":"https://openalex.org/C20556612","wikidata":"https://www.wikidata.org/wiki/Q4469374","display_name":"Volume (thermodynamics)","level":2,"score":0.2953999936580658},{"id":"https://openalex.org/C78508483","wikidata":"https://www.wikidata.org/wiki/Q139445","display_name":"Algorithmic trading","level":2,"score":0.28790000081062317},{"id":"https://openalex.org/C131562839","wikidata":"https://www.wikidata.org/wiki/Q1574928","display_name":"Trading strategy","level":2,"score":0.27559998631477356},{"id":"https://openalex.org/C162324750","wikidata":"https://www.wikidata.org/wiki/Q8134","display_name":"Economics","level":0,"score":0.27059999108314514},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.26269999146461487},{"id":"https://openalex.org/C167085575","wikidata":"https://www.wikidata.org/wiki/Q6803654","display_name":"Mean squared prediction error","level":2,"score":0.26179999113082886},{"id":"https://openalex.org/C61326573","wikidata":"https://www.wikidata.org/wiki/Q1496376","display_name":"Gaussian process","level":3,"score":0.2581000030040741},{"id":"https://openalex.org/C19244329","wikidata":"https://www.wikidata.org/wiki/Q208697","display_name":"Financial market","level":2,"score":0.25679999589920044},{"id":"https://openalex.org/C51485801","wikidata":"https://www.wikidata.org/wiki/Q16966861","display_name":"Efficient frontier","level":3,"score":0.25619998574256897},{"id":"https://openalex.org/C66024118","wikidata":"https://www.wikidata.org/wiki/Q1122506","display_name":"Computational model","level":2,"score":0.25440001487731934},{"id":"https://openalex.org/C67257552","wikidata":"https://www.wikidata.org/wiki/Q635217","display_name":"Probit model","level":2,"score":0.25040000677108765}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.07355","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.07355","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.07355","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.07355","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"display_name":"Peace, Justice and strong institutions","id":"https://metadata.un.org/sdg/16","score":0.7326791882514954}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"We":[0],"introduce":[1],"Prediction":[2,30],"Arena,":[3],"a":[4,72,121,173,218,243],"benchmark":[5],"for":[6],"evaluating":[7],"AI":[8],"models'":[9],"predictive":[10],"accuracy":[11,127],"and":[12,43,96,128,239],"decision-making":[13],"by":[14],"enabling":[15],"them":[16],"to":[17,78,116,131],"trade":[18],"autonomously":[19],"on":[20,39,133,164,168,193,197,221],"live":[21,90,156],"prediction":[22,126],"markets":[23],"with":[24,63,146,170],"real":[25,252],"capital.":[26],"Unlike":[27],"synthetic":[28],"benchmarks,":[29],"Arena":[31],"tests":[32],"models":[33,88,99,160,223,249],"in":[34,89,100,199],"environments":[35],"where":[36],"trades":[37,192],"execute":[38],"actual":[40],"exchanges":[41],"(Kalshi":[42],"Polymarket),":[44],"providing":[45,242],"objective":[46],"ground":[47],"truth":[48],"that":[49,214],"cannot":[50],"be":[51],"gamed":[52],"or":[53,184],"overfitted.":[54],"Each":[55],"model":[56,208],"operates":[57],"as":[58],"an":[59],"independent":[60],"agent":[61],"starting":[62],"$10,000,":[64],"making":[65],"autonomous":[66],"decisions":[67],"every":[68],"15-45":[69],"minutes.":[70],"Over":[71],"57-day":[73],"longitudinal":[74],"evaluation":[75],"(January":[76],"12":[77],"March":[79],"9,":[80],"2026),":[81],"we":[82,227],"track":[83],"two":[84],"cohorts:":[85],"six":[86],"frontier":[87,248],"trading":[91,102],"(Cohort":[92,103,187],"1,":[93,109],"full":[94],"period)":[95],"four":[97],"next-generation":[98],"paper":[101],"2,":[104],"3-day":[105],"preliminary).":[106],"For":[107],"Cohort":[108,158],"final":[110],"Kalshi":[111],"returns":[112],"range":[113],"from":[114,153],"-16.0%":[115],"-30.8%.":[117],"Our":[118],"analysis":[119],"identifies":[120],"clear":[122],"performance":[123],"hierarchy:":[124],"initial":[125],"the":[129,137,179,203],"ability":[130],"capitalize":[132],"correct":[134],"predictions":[135],"are":[136],"main":[138],"drivers,":[139],"while":[140],"research":[141],"volume":[142],"shows":[143],"no":[144],"correlation":[145],"outcomes.":[147],"A":[148],"striking":[149],"cross-platform":[150],"contrast":[151],"emerges":[152],"parallel":[154],"Polymarket":[155,165,198],"trading:":[157],"1":[159],"averaged":[161],"only":[162],"-1.1%":[163],"vs.":[166],"-22.6%":[167],"Kalshi,":[169,194],"grok-4-20-checkpoint":[171],"achieving":[172],"71.4%":[174],"settlement":[175,235],"win":[176],"rate":[177],"-":[178,202,212],"highest":[180],"across":[181,209],"any":[182,207],"platform":[183,215],"cohort.":[185],"gemini-3.1-pro-preview":[186],"2),":[188],"which":[189,222],"executed":[190],"zero":[191],"achieved":[195],"+6.02%":[196],"3":[200],"days":[201],"best":[204],"return":[205],"of":[206,246],"either":[210],"cohort":[211],"demonstrating":[213],"design":[216],"has":[217],"profound":[219],"effect":[220],"succeed.":[224],"Beyond":[225],"performance,":[226],"analyze":[228],"computational":[229],"efficiency":[230],"(token":[231],"usage,":[232],"cycle":[233],"time),":[234],"accuracy,":[236],"exit":[237],"patterns,":[238],"market":[240],"preferences,":[241],"comprehensive":[244],"view":[245],"how":[247],"behave":[250],"under":[251],"financial":[253],"pressure.":[254]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-04-11T00:00:00"}
