{"id":"https://openalex.org/W7134915325","doi":"https://doi.org/10.48550/arxiv.2603.09678","title":"EsoLang-Bench: Evaluating Genuine Reasoning in Large Language Models via Esoteric Programming Languages","display_name":"EsoLang-Bench: Evaluating Genuine Reasoning in Large Language Models via Esoteric Programming Languages","publication_year":2026,"publication_date":"2026-03-10","ids":{"openalex":"https://openalex.org/W7134915325","doi":"https://doi.org/10.48550/arxiv.2603.09678"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.09678","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.09678","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.09678","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5128726619","display_name":"Aman Sharma","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Sharma, Aman","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5128756557","display_name":"Paras Chopra","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chopra, Paras","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":2,"corresponding_author_ids":["https://openalex.org/A5128726619"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11948","display_name":"Machine Learning in Materials Science","score":0.19009999930858612,"subfield":{"id":"https://openalex.org/subfields/2505","display_name":"Materials Chemistry"},"field":{"id":"https://openalex.org/fields/25","display_name":"Materials Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11948","display_name":"Machine Learning in Materials Science","score":0.19009999930858612,"subfield":{"id":"https://openalex.org/subfields/2505","display_name":"Materials Chemistry"},"field":{"id":"https://openalex.org/fields/25","display_name":"Materials Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.14980000257492065,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.09989999979734421,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/exploit","display_name":"Exploit","score":0.5620999932289124},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.46939998865127563},{"id":"https://openalex.org/keywords/python","display_name":"Python (programming language)","score":0.44679999351501465},{"id":"https://openalex.org/keywords/second-generation-programming-language","display_name":"Second-generation programming language","score":0.4081000089645386},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.3831000030040741},{"id":"https://openalex.org/keywords/programming-paradigm","display_name":"Programming paradigm","score":0.3659999966621399},{"id":"https://openalex.org/keywords/sketch","display_name":"Sketch","score":0.35420000553131104},{"id":"https://openalex.org/keywords/lexicographical-order","display_name":"Lexicographical order","score":0.33500000834465027}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7781999707221985},{"id":"https://openalex.org/C165696696","wikidata":"https://www.wikidata.org/wiki/Q11287","display_name":"Exploit","level":2,"score":0.5620999932289124},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5475000143051147},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.49459999799728394},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.46939998865127563},{"id":"https://openalex.org/C519991488","wikidata":"https://www.wikidata.org/wiki/Q28865","display_name":"Python (programming language)","level":2,"score":0.44679999351501465},{"id":"https://openalex.org/C11164408","wikidata":"https://www.wikidata.org/wiki/Q18657800","display_name":"Second-generation programming language","level":4,"score":0.4081000089645386},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.3831000030040741},{"id":"https://openalex.org/C34165917","wikidata":"https://www.wikidata.org/wiki/Q188267","display_name":"Programming paradigm","level":2,"score":0.3659999966621399},{"id":"https://openalex.org/C2779231336","wikidata":"https://www.wikidata.org/wiki/Q7534724","display_name":"Sketch","level":2,"score":0.35420000553131104},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3418000042438507},{"id":"https://openalex.org/C159254197","wikidata":"https://www.wikidata.org/wiki/Q1144915","display_name":"Lexicographical order","level":2,"score":0.33500000834465027},{"id":"https://openalex.org/C199305712","wikidata":"https://www.wikidata.org/wiki/Q3205914","display_name":"Fifth-generation programming language","level":3,"score":0.3287000060081482},{"id":"https://openalex.org/C122783720","wikidata":"https://www.wikidata.org/wiki/Q183065","display_name":"Interpreter","level":2,"score":0.3084000051021576},{"id":"https://openalex.org/C61423126","wikidata":"https://www.wikidata.org/wiki/Q187432","display_name":"Scripting language","level":2,"score":0.30790001153945923},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3066999912261963},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.3043000102043152},{"id":"https://openalex.org/C39890363","wikidata":"https://www.wikidata.org/wiki/Q36108","display_name":"Generative grammar","level":2,"score":0.30070000886917114},{"id":"https://openalex.org/C94922259","wikidata":"https://www.wikidata.org/wiki/Q33215","display_name":"Constructed language","level":2,"score":0.29649999737739563},{"id":"https://openalex.org/C66024118","wikidata":"https://www.wikidata.org/wiki/Q1122506","display_name":"Computational model","level":2,"score":0.2757999897003174},{"id":"https://openalex.org/C2777601683","wikidata":"https://www.wikidata.org/wiki/Q6499736","display_name":"Vocabulary","level":2,"score":0.26980000734329224},{"id":"https://openalex.org/C75608658","wikidata":"https://www.wikidata.org/wiki/Q44395","display_name":"Pascal (unit)","level":2,"score":0.26100000739097595},{"id":"https://openalex.org/C43126263","wikidata":"https://www.wikidata.org/wiki/Q128751","display_name":"Source code","level":2,"score":0.2572999894618988},{"id":"https://openalex.org/C86251818","wikidata":"https://www.wikidata.org/wiki/Q816754","display_name":"Benchmarking","level":2,"score":0.2529999911785126},{"id":"https://openalex.org/C195344581","wikidata":"https://www.wikidata.org/wiki/Q2555318","display_name":"Automated reasoning","level":2,"score":0.25099998712539673}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.09678","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.09678","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.09678","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.09678","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","score":0.783233106136322,"display_name":"Quality Education"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Large":[0],"language":[1],"models":[2,161,217],"achieve":[3],"near-ceiling":[4],"performance":[5,47],"on":[6,48,184],"code":[7],"generation":[8],"benchmarks,":[9],"yet":[10],"most":[11],"of":[12,70,95,115],"the":[13,78,121,125,189],"programming":[14,50,60,222],"languages":[15,61,74,117,223],"used":[16],"by":[17],"popular":[18],"benchmarks":[19],"such":[20],"as":[21],"SWE-bench":[22],"and":[23,38,66,166,198],"HumanEval":[24],"(e.g.":[25],"Python,":[26],"JavaScript)":[27],"are":[28,39,75,83,89,99],"squarely":[29],"in-distribution.":[30],"They":[31],"appear":[32],"at":[33],"scale":[34],"in":[35,85,90,93,131,152,177],"pre-training":[36,132],"corpora":[37,133],"heavily":[40],"reinforced":[41],"during":[42],"post-training.":[43],"To":[44],"study":[45],"LLM":[46],"unfamiliar":[49,100],"languages,":[51],"we":[52],"introduce":[53],"EsoLang-Bench,":[54],"a":[55,106,168,209],"benchmark":[56],"using":[57],"five":[58,69,159,163],"esoteric":[59,73,116,191],"(Brainfuck,":[62],"Befunge-98,":[63],"Whitespace,":[64],"Unlambda,":[65],"Shakespeare).":[67],"All":[68],"our":[71],"chosen":[72],"Turing-complete,":[76],"so":[77],"same":[79,173],"algorithmic":[80,219],"problems":[81,175],"that":[82],"solvable":[84,92],"Python":[86,178],"or":[87,179],"JavaScript":[88,180],"principle":[91],"each":[94],"them.":[96],"Yet,":[97],"they":[98],"to":[101,135,202,221],"LLMs":[102],"which":[103,148],"makes":[104,149],"them":[105],"good":[107],"proxy":[108],"for":[109,212],"evaluating":[110],"out-of-distribution":[111],"performance.":[112],"The":[113,172],"unfamiliarity":[114],"comprises":[118],"of:":[119],"(i)":[120],"hard-by-design":[122],"primitives":[123],"comprising":[124],"language;":[126],"(ii)":[127],"substantially":[128],"less":[129],"representation":[130],"(340x":[134],"over":[136],"60,000x":[137],"fewer":[138],"public":[139],"GitHub":[140],"repositories":[141],"than":[142],"Python);":[143],"(iii)":[144],"negligible":[145],"deployment":[146],"value,":[147],"targeted":[150],"inclusion":[151],"post-training":[153],"data":[154],"economically":[155],"irrational.":[156],"We":[157],"evaluate":[158],"frontier":[160,186,216],"across":[162],"prompting":[164],"strategies":[165],"find":[167],"dramatic":[169],"capability":[170],"gap.":[171,205],"80":[174],"expressed":[176],"reach":[181],"100%":[182],"accuracy":[183],"top":[185],"models,":[187],"while":[188],"equivalent":[190],"versions":[192],"score":[193],"only":[194],"0-11%.":[195],"Few-shot":[196],"learning":[197],"self-reflection":[199],"also":[200],"fail":[201],"close":[203],"this":[204],"EsoLang-Bench":[206],"therefore":[207],"provides":[208],"contamination-resistant":[210],"testbed":[211],"measuring":[213],"how":[214],"well":[215],"generalise":[218],"problem-solving":[220],"outside":[224],"their":[225],"training":[226],"distribution.":[227]},"counts_by_year":[],"updated_date":"2026-05-14T06:09:40.864956","created_date":"2026-03-12T00:00:00"}
