{"id":"https://openalex.org/W7125152669","doi":"https://doi.org/10.48550/arxiv.2601.13545","title":"TruthTensor: Evaluating LLMs through Human Imitation on Prediction Market under Drift and Holistic Reasoning","display_name":"TruthTensor: Evaluating LLMs through Human Imitation on Prediction Market under Drift and Holistic Reasoning","publication_year":2026,"publication_date":"2026-01-20","ids":{"openalex":"https://openalex.org/W7125152669","doi":"https://doi.org/10.48550/arxiv.2601.13545"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2601.13545","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2601.13545","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2601.13545","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5123461548","display_name":"Shirin Shahabi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shahabi, Shirin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5123532115","display_name":"Spencer Graham","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Graham, Spencer","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5123510294","display_name":"Haruna Isah","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Isah, Haruna","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":3,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12026","display_name":"Explainable Artificial Intelligence (XAI)","score":0.34540000557899475,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12026","display_name":"Explainable Artificial Intelligence (XAI)","score":0.34540000557899475,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10883","display_name":"Ethics and Social Impacts of AI","score":0.07339999824762344,"subfield":{"id":"https://openalex.org/subfields/3311","display_name":"Safety Research"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T11636","display_name":"Artificial Intelligence in Healthcare and Education","score":0.05810000002384186,"subfield":{"id":"https://openalex.org/subfields/2718","display_name":"Health Informatics"},"field":{"id":"https://openalex.org/fields/27","display_name":"Medicine"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/interpretability","display_name":"Interpretability","score":0.896399974822998},{"id":"https://openalex.org/keywords/probabilistic-logic","display_name":"Probabilistic logic","score":0.671999990940094},{"id":"https://openalex.org/keywords/robustness","display_name":"Robustness (evolution)","score":0.6049000024795532},{"id":"https://openalex.org/keywords/correctness","display_name":"Correctness","score":0.48339998722076416},{"id":"https://openalex.org/keywords/metric","display_name":"Metric (unit)","score":0.4593999981880188},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.4343999922275543},{"id":"https://openalex.org/keywords/imitation","display_name":"Imitation","score":0.41359999775886536},{"id":"https://openalex.org/keywords/resource","display_name":"Resource (disambiguation)","score":0.3228999972343445}],"concepts":[{"id":"https://openalex.org/C2781067378","wikidata":"https://www.wikidata.org/wiki/Q17027399","display_name":"Interpretability","level":2,"score":0.896399974822998},{"id":"https://openalex.org/C49937458","wikidata":"https://www.wikidata.org/wiki/Q2599292","display_name":"Probabilistic logic","level":2,"score":0.671999990940094},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.647599995136261},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6344000101089478},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.6049000024795532},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6040999889373779},{"id":"https://openalex.org/C55439883","wikidata":"https://www.wikidata.org/wiki/Q360812","display_name":"Correctness","level":2,"score":0.48339998722076416},{"id":"https://openalex.org/C176217482","wikidata":"https://www.wikidata.org/wiki/Q860554","display_name":"Metric (unit)","level":2,"score":0.4593999981880188},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.4343999922275543},{"id":"https://openalex.org/C126388530","wikidata":"https://www.wikidata.org/wiki/Q1131737","display_name":"Imitation","level":2,"score":0.41359999775886536},{"id":"https://openalex.org/C112930515","wikidata":"https://www.wikidata.org/wiki/Q4389547","display_name":"Risk analysis (engineering)","level":1,"score":0.3546000123023987},{"id":"https://openalex.org/C206345919","wikidata":"https://www.wikidata.org/wiki/Q20380951","display_name":"Resource (disambiguation)","level":2,"score":0.3228999972343445},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.31769999861717224},{"id":"https://openalex.org/C168725872","wikidata":"https://www.wikidata.org/wiki/Q991663","display_name":"Sophistication","level":2,"score":0.30979999899864197},{"id":"https://openalex.org/C45804977","wikidata":"https://www.wikidata.org/wiki/Q7239673","display_name":"Predictive modelling","level":2,"score":0.30660000443458557},{"id":"https://openalex.org/C114289077","wikidata":"https://www.wikidata.org/wiki/Q3284399","display_name":"Statistical model","level":2,"score":0.30469998717308044},{"id":"https://openalex.org/C539667460","wikidata":"https://www.wikidata.org/wiki/Q2414942","display_name":"Management science","level":1,"score":0.3021000027656555},{"id":"https://openalex.org/C66024118","wikidata":"https://www.wikidata.org/wiki/Q1122506","display_name":"Computational model","level":2,"score":0.29030001163482666},{"id":"https://openalex.org/C2777508537","wikidata":"https://www.wikidata.org/wiki/Q7936620","display_name":"Visual reasoning","level":2,"score":0.28600001335144043},{"id":"https://openalex.org/C109747225","wikidata":"https://www.wikidata.org/wiki/Q815758","display_name":"Scarcity","level":2,"score":0.28360000252723694},{"id":"https://openalex.org/C195344581","wikidata":"https://www.wikidata.org/wiki/Q2555318","display_name":"Automated reasoning","level":2,"score":0.27459999918937683},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.2687999904155731},{"id":"https://openalex.org/C73301696","wikidata":"https://www.wikidata.org/wiki/Q5469984","display_name":"Formalism (music)","level":3,"score":0.26600000262260437},{"id":"https://openalex.org/C2780898871","wikidata":"https://www.wikidata.org/wiki/Q860554","display_name":"Performance metric","level":2,"score":0.2515999972820282},{"id":"https://openalex.org/C60777511","wikidata":"https://www.wikidata.org/wiki/Q3045002","display_name":"Concept drift","level":3,"score":0.25}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2601.13545","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2601.13545","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2601.13545","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2601.13545","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"score":0.5119128823280334,"display_name":"Decent work and economic growth","id":"https://metadata.un.org/sdg/8"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Evaluating":[0],"language":[1],"models":[2,44,131,149],"and":[3,19,26,72,92,107,114,142,158,179],"AI":[4],"agents":[5],"remains":[6],"fundamentally":[7],"challenging":[8],"because":[9],"static":[10],"benchmarks":[11],"fail":[12],"to":[13,68,76,111,147,184],"capture":[14],"real-world":[15,191],"uncertainty,":[16],"distribution":[17],"shift,":[18],"the":[20,145],"gap":[21],"between":[22],"isolated":[23],"task":[24],"accuracy":[25,135],"human-aligned":[27],"decision-making":[28],"under":[29],"evolving":[30],"conditions.":[31],"This":[32],"paper":[33],"introduces":[34],"TruthTensor,":[35],"a":[36,78],"novel,":[37],"reproducible":[38],"evaluation":[39,67,103,165,182],"paradigm":[40],"that":[41,130],"measures":[42],"reasoning":[43],"not":[45],"only":[46],"as":[47,51],"prediction":[48,70],"engines":[49],"but":[50],"human-imitation":[52],"systems":[53],"operating":[54],"in":[55,139,190],"socially-grounded,":[56],"high-entropy":[57],"environments.":[58],"Building":[59],"on":[60],"forward-looking,":[61],"contamination-free":[62],"tasks,":[63],"our":[64],"framework":[65],"anchors":[66],"live":[69],"markets":[71,123],"combines":[73],"probabilistic":[74],"scoring":[75],"provide":[77],"holistic":[79],"view":[80],"of":[81,116,188],"model":[82],"behavior.":[83],"TruthTensor":[84,128,161,197],"complements":[85],"traditional":[86],"correctness":[87],"metrics":[88],"with":[89,132],"drift-centric":[90],"diagnostics":[91],"explicit":[93],"robustness":[94],"checks":[95],"for":[96],"reproducibility.":[97],"It":[98],"specify":[99],"human":[100],"vs.":[101],"automated":[102],"roles,":[104],"annotation":[105],"protocols,":[106],"statistical":[108],"testing":[109],"procedures":[110],"ensure":[112],"interpretability":[113],"replicability":[115],"results.":[117],"In":[118],"experiments":[119],"across":[120],"500+":[121],"real":[122],"(political,":[124],"economic,":[125],"cultural,":[126],"technological),":[127],"demonstrates":[129],"similar":[133],"forecast":[134],"can":[136],"diverge":[137],"markedly":[138],"calibration,":[140,154],"drift,":[141],"risk-sensitivity,":[143],"underscoring":[144],"need":[146],"evaluate":[148],"along":[150],"multiple":[151],"axes":[152],"(accuracy,":[153],"narrative":[155],"stability,":[156],"cost,":[157],"resource":[159],"efficiency).":[160],"therefore":[162],"operationalizes":[163],"modern":[164],"best":[166],"practices,":[167],"clear":[168],"hypothesis":[169],"framing,":[170],"careful":[171],"metric":[172],"selection,":[173],"transparent":[174],"compute/cost":[175],"reporting,":[176],"human-in-the-loop":[177],"validation,":[178],"open,":[180],"versioned":[181],"contracts,":[183],"produce":[185],"defensible":[186],"assessments":[187],"LLMs":[189],"decision":[192],"contexts.":[193],"We":[194],"publicly":[195],"released":[196],"at":[198],"https://truthtensor.com.":[199]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-01-22T00:00:00"}
