{"id":"https://openalex.org/W7131094556","doi":"https://doi.org/10.48550/arxiv.2602.17831","title":"The Token Games: Evaluating Language Model Reasoning with Puzzle Duels","display_name":"The Token Games: Evaluating Language Model Reasoning with Puzzle Duels","publication_year":2026,"publication_date":"2026-02-19","ids":{"openalex":"https://openalex.org/W7131094556","doi":"https://doi.org/10.48550/arxiv.2602.17831"},"language":null,"primary_location":{"id":"pmh:doi:10.48550/arxiv.2602.17831","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":null,"any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5034372053","display_name":"S. Kumar Raja","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Henniger, Simon","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5126643007","display_name":"Gabriel Poesia","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Poesia, Gabriel","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":2,"corresponding_author_ids":["https://openalex.org/A5034372053"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11948","display_name":"Machine Learning in Materials Science","score":0.2793999910354614,"subfield":{"id":"https://openalex.org/subfields/2505","display_name":"Materials Chemistry"},"field":{"id":"https://openalex.org/fields/25","display_name":"Materials Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11948","display_name":"Machine Learning in Materials Science","score":0.2793999910354614,"subfield":{"id":"https://openalex.org/subfields/2505","display_name":"Materials Chemistry"},"field":{"id":"https://openalex.org/fields/25","display_name":"Materials Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.1290999948978424,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11636","display_name":"Artificial Intelligence in Healthcare and Education","score":0.07150000333786011,"subfield":{"id":"https://openalex.org/subfields/2718","display_name":"Health Informatics"},"field":{"id":"https://openalex.org/fields/27","display_name":"Medicine"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/leverage","display_name":"Leverage (statistics)","score":0.6858999729156494},{"id":"https://openalex.org/keywords/visual-reasoning","display_name":"Visual reasoning","score":0.5942000150680542},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.5356000065803528},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.4507000148296356},{"id":"https://openalex.org/keywords/pairwise-comparison","display_name":"Pairwise comparison","score":0.4221000075340271},{"id":"https://openalex.org/keywords/python","display_name":"Python (programming language)","score":0.3984000086784363},{"id":"https://openalex.org/keywords/automated-reasoning","display_name":"Automated reasoning","score":0.3734000027179718},{"id":"https://openalex.org/keywords/security-token","display_name":"Security token","score":0.3416000008583069}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.791700005531311},{"id":"https://openalex.org/C153083717","wikidata":"https://www.wikidata.org/wiki/Q6535263","display_name":"Leverage (statistics)","level":2,"score":0.6858999729156494},{"id":"https://openalex.org/C2777508537","wikidata":"https://www.wikidata.org/wiki/Q7936620","display_name":"Visual reasoning","level":2,"score":0.5942000150680542},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.5356000065803528},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5113000273704529},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.4507000148296356},{"id":"https://openalex.org/C184898388","wikidata":"https://www.wikidata.org/wiki/Q1435712","display_name":"Pairwise comparison","level":2,"score":0.4221000075340271},{"id":"https://openalex.org/C519991488","wikidata":"https://www.wikidata.org/wiki/Q28865","display_name":"Python (programming language)","level":2,"score":0.3984000086784363},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3944000005722046},{"id":"https://openalex.org/C195344581","wikidata":"https://www.wikidata.org/wiki/Q2555318","display_name":"Automated reasoning","level":2,"score":0.3734000027179718},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.36090001463890076},{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.3416000008583069},{"id":"https://openalex.org/C189430467","wikidata":"https://www.wikidata.org/wiki/Q7293293","display_name":"Ranking (information retrieval)","level":2,"score":0.3280999958515167},{"id":"https://openalex.org/C92548554","wikidata":"https://www.wikidata.org/wiki/Q2262868","display_name":"Domain model","level":3,"score":0.3118000030517578},{"id":"https://openalex.org/C14036430","wikidata":"https://www.wikidata.org/wiki/Q3736076","display_name":"Function (biology)","level":2,"score":0.29409998655319214},{"id":"https://openalex.org/C36503486","wikidata":"https://www.wikidata.org/wiki/Q11235244","display_name":"Domain (mathematical analysis)","level":2,"score":0.2919999957084656},{"id":"https://openalex.org/C2779439875","wikidata":"https://www.wikidata.org/wiki/Q1078276","display_name":"Natural language understanding","level":3,"score":0.2833999991416931},{"id":"https://openalex.org/C44291984","wikidata":"https://www.wikidata.org/wiki/Q1074173","display_name":"Question answering","level":2,"score":0.28299999237060547},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.27900001406669617},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.2768000066280365},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.2515999972820282}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:doi:10.48550/arxiv.2602.17831","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},{"id":"doi:10.48550/arxiv.2602.17831","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2602.17831","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:doi:10.48550/arxiv.2602.17831","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"sustainable_development_goals":[{"score":0.7402163743972778,"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Evaluating":[0],"the":[1,32,88,147],"reasoning":[2,49,194],"capabilities":[3],"of":[4,16,90],"Large":[5],"Language":[6],"Models":[7],"is":[8,19,39,172],"increasingly":[9],"challenging":[10,176],"as":[11,153],"models":[12,77,132,141,205],"improve.":[13],"Human":[14],"curation":[15],"hard":[17],"questions":[18,46],"highly":[20,175],"expensive,":[21],"especially":[22],"in":[23,162],"recent":[24],"benchmarks":[25,151],"using":[26],"PhD-level":[27],"domain":[28],"knowledge":[29],"to":[30,67,110,130,134],"challenge":[31,78],"most":[33],"capable":[34],"models.":[35],"Even":[36],"then,":[37],"there":[38],"always":[40],"a":[41,95,100,174],"concern":[42],"about":[43],"whether":[44],"these":[45],"test":[47],"genuine":[48],"or":[50],"if":[51],"similar":[52],"problems":[53,113],"have":[54],"been":[55],"seen":[56],"during":[57],"training.":[58],"Here,":[59],"we":[60,123],"take":[61],"inspiration":[62],"from":[63,120,149],"16th-century":[64],"mathematical":[65],"duels":[66],"design":[68],"The":[69],"Token":[70],"Games":[71],"(TTG):":[72],"an":[73],"evaluation":[74],"framework":[75],"where":[76],"each":[79,135],"other":[80,207],"by":[81,183,199],"creating":[82,163,169],"their":[83],"own":[84],"puzzles.":[85,164],"We":[86,137,165],"leverage":[87],"format":[89],"Programming":[91],"Puzzles":[92],"-":[93,109],"given":[94],"Python":[96],"function":[97],"that":[98,104,168,195,202],"returns":[99],"boolean,":[101],"find":[102,167],"inputs":[103],"make":[105],"it":[106],"return":[107],"True":[108],"flexibly":[111],"represent":[112],"and":[114,144,201,211],"enable":[115],"verifying":[116],"solutions.":[117],"Using":[118],"results":[119],"pairwise":[121],"duels,":[122],"then":[124],"compute":[125],"Elo":[126],"ratings,":[127],"allowing":[128],"us":[129],"compare":[131],"relative":[133],"other.":[136],"evaluate":[138],"10":[139],"frontier":[140],"on":[142],"TTG,":[143],"closely":[145],"match":[146],"ranking":[148],"existing":[150],"such":[152],"Humanity's":[154],"Last":[155],"Exam,":[156],"without":[157],"involving":[158],"any":[159],"human":[160],"effort":[161],"also":[166],"good":[170],"puzzles":[171],"still":[173],"task":[177,212],"for":[178,192,206],"current":[179],"models,":[180],"not":[181],"measured":[182],"previous":[184],"benchmarks.":[185],"Overall,":[186],"our":[187],"work":[188],"suggests":[189],"new":[190],"paradigms":[191],"evaluating":[193],"cannot":[196],"be":[197],"saturated":[198],"design,":[200],"allow":[203],"testing":[204],"skills":[208],"like":[209],"creativity":[210],"creation":[213],"alongside":[214],"problem":[215],"solving.":[216]},"counts_by_year":[],"updated_date":"2026-04-04T16:13:02.066488","created_date":"2026-02-24T00:00:00"}
