{"id":"https://openalex.org/W7137233892","doi":"https://doi.org/10.48550/arxiv.2603.12744","title":"TaoBench: Do Automated Theorem Prover LLMs Generalize Beyond MathLib?","display_name":"TaoBench: Do Automated Theorem Prover LLMs Generalize Beyond MathLib?","publication_year":2026,"publication_date":"2026-03-13","ids":{"openalex":"https://openalex.org/W7137233892","doi":"https://doi.org/10.48550/arxiv.2603.12744"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.12744","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.12744","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.12744","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5129564386","display_name":"Alexander K Taylor","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Taylor, Alexander K","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100342745","display_name":"Junyi Zhang","orcid":"https://orcid.org/0000-0001-8986-6588"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Junyi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5120683660","display_name":"Ethan Ji","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ji, Ethan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129634251","display_name":"Vigyan Sahai","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sahai, Vigyan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129506858","display_name":"Haikang Deng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Deng, Haikang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129425364","display_name":"Yuanzhou Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Yuanzhou","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100566962","display_name":"Yifan Yuan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yuan, Yifan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129629158","display_name":"Di Wu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wu, Di","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129553007","display_name":"Jia-Chen Gu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gu, Jia-Chen","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129441575","display_name":"Kai-Wei Chang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chang, Kai-Wei","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5030248499","display_name":"Nanyun Peng","orcid":"https://orcid.org/0000-0002-8509-6595"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Peng, Nanyun","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5124969487","display_name":"Amit Sahai","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sahai, Amit","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5129452672","display_name":"Wang, Wei Neng; Chang, Juin Jen; Shieh, Jhy Yuan; Chen, Jhy Hwa","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Wei","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":13,"corresponding_author_ids":["https://openalex.org/A5129564386"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T13523","display_name":"Mathematics, Computing, and Information Processing","score":0.5751000046730042,"subfield":{"id":"https://openalex.org/subfields/1703","display_name":"Computational Theory and Mathematics"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T13523","display_name":"Mathematics, Computing, and Information Processing","score":0.5751000046730042,"subfield":{"id":"https://openalex.org/subfields/1703","display_name":"Computational Theory and Mathematics"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10126","display_name":"Logic, programming, and type systems","score":0.125900000333786,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11435","display_name":"Polynomial and algebraic computation","score":0.08190000057220459,"subfield":{"id":"https://openalex.org/subfields/1703","display_name":"Computational Theory and Mathematics"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/bespoke","display_name":"Bespoke","score":0.8787000179290771},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.6122999787330627},{"id":"https://openalex.org/keywords/bottleneck","display_name":"Bottleneck","score":0.557200014591217},{"id":"https://openalex.org/keywords/automated-reasoning","display_name":"Automated reasoning","score":0.5333999991416931},{"id":"https://openalex.org/keywords/robustness","display_name":"Robustness (evolution)","score":0.513700008392334},{"id":"https://openalex.org/keywords/automated-theorem-proving","display_name":"Automated theorem proving","score":0.4731000065803528},{"id":"https://openalex.org/keywords/generalization","display_name":"Generalization","score":0.45989999175071716},{"id":"https://openalex.org/keywords/pipeline","display_name":"Pipeline (software)","score":0.4228000044822693}],"concepts":[{"id":"https://openalex.org/C44210515","wikidata":"https://www.wikidata.org/wiki/Q16968978","display_name":"Bespoke","level":2,"score":0.8787000179290771},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.6122999787330627},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5774999856948853},{"id":"https://openalex.org/C2780513914","wikidata":"https://www.wikidata.org/wiki/Q18210350","display_name":"Bottleneck","level":2,"score":0.557200014591217},{"id":"https://openalex.org/C195344581","wikidata":"https://www.wikidata.org/wiki/Q2555318","display_name":"Automated reasoning","level":2,"score":0.5333999991416931},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.513700008392334},{"id":"https://openalex.org/C206880738","wikidata":"https://www.wikidata.org/wiki/Q431667","display_name":"Automated theorem proving","level":2,"score":0.4731000065803528},{"id":"https://openalex.org/C177148314","wikidata":"https://www.wikidata.org/wiki/Q170084","display_name":"Generalization","level":2,"score":0.45989999175071716},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.4228000044822693},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.41819998621940613},{"id":"https://openalex.org/C2164484","wikidata":"https://www.wikidata.org/wiki/Q5170150","display_name":"Core (optical fiber)","level":2,"score":0.3237999975681305},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.3215000033378601},{"id":"https://openalex.org/C159718280","wikidata":"https://www.wikidata.org/wiki/Q5526353","display_name":"Gas meter prover","level":3,"score":0.3012000024318695},{"id":"https://openalex.org/C127705205","wikidata":"https://www.wikidata.org/wiki/Q5748245","display_name":"Heuristics","level":2,"score":0.30090001225471497},{"id":"https://openalex.org/C202269582","wikidata":"https://www.wikidata.org/wiki/Q2644277","display_name":"Complementarity (molecular biology)","level":2,"score":0.29660001397132874},{"id":"https://openalex.org/C129916263","wikidata":"https://www.wikidata.org/wiki/Q1141183","display_name":"Backward chaining","level":4,"score":0.29600000381469727},{"id":"https://openalex.org/C2780366209","wikidata":"https://www.wikidata.org/wiki/Q5170200","display_name":"Core model","level":2,"score":0.2906999886035919},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.2815999984741211},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.2766000032424927},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.2759000062942505},{"id":"https://openalex.org/C153083717","wikidata":"https://www.wikidata.org/wiki/Q6535263","display_name":"Leverage (statistics)","level":2,"score":0.26829999685287476},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.259799987077713},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.25459998846054077},{"id":"https://openalex.org/C122770356","wikidata":"https://www.wikidata.org/wiki/Q1656753","display_name":"Identifiability","level":2,"score":0.2515999972820282}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.12744","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.12744","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.12744","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.12744","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Automated":[0],"theorem":[1],"proving":[2],"(ATP)":[3],"benchmarks":[4],"largely":[5],"consist":[6],"of":[7,49,133,169],"problems":[8,68],"formalized":[9],"in":[10],"MathLib,":[11],"so":[12],"current":[13,50],"ATP":[14,51,156],"training":[15],"and":[16,31,69,107,202,204,211],"evaluation":[17],"are":[18],"heavily":[19],"biased":[20],"toward":[21],"MathLib's":[22],"definitional":[23,58,134,188],"framework.":[24],"However,":[25],"frontier":[26],"mathematics":[27],"is":[28,184],"often":[29],"exploratory":[30],"prototype-heavy,":[32],"relying":[33,96],"on":[34,97,172],"bespoke":[35,70],"constructions":[36],"that":[37,118,180],"deviate":[38],"from":[39,80,93],"standard":[40,66,98],"libraries.":[41],"In":[42],"this":[43],"work,":[44],"we":[45,113,136],"evaluate":[46],"the":[47,62,131,161,173,181],"robustness":[48],"systems":[52],"when":[53],"applied":[54],"to":[55],"a":[56,121,142,197,206],"novel":[57],"framework,":[59,163],"specifically":[60],"examining":[61],"performance":[63,164,201],"gap":[64,198],"between":[65,199],"library":[67],"mathematical":[71,91],"constructions.":[72,109],"We":[73],"introduce":[74],"TaoBench,":[75],"an":[76,115,167],"undergraduate-level":[77],"benchmark":[78,200],"derived":[79],"Terence":[81],"Tao's":[82],"Analysis":[83],"I,":[84],"which":[85],"formalizes":[86],"analysis":[87],"by":[88,104,166],"constructing":[89],"core":[90],"concepts":[92],"scratch,":[94],"without":[95],"Mathlib":[99,145],"definitions,":[100],"as":[101,103],"well":[102],"mixing":[105],"from-scratch":[106],"MathLib":[108,162],"For":[110],"fair":[111],"evaluation,":[112],"build":[114],"agentic":[116],"pipeline":[117],"automatically":[119],"extracts":[120],"compilable,":[122],"self-contained":[123],"local":[124],"environment":[125],"for":[126,151,209],"each":[127],"problem.":[128],"To":[129],"isolate":[130],"effect":[132],"frameworks,":[135],"additionally":[137],"translate":[138],"every":[139],"problem":[140],"into":[141],"mathematically":[143],"equivalent":[144,175],"formulation,":[146],"yielding":[147],"paired":[148],"TaoBench-Mathlib":[149],"statements":[150],"direct":[152],"comparison.":[153],"While":[154],"state-of-the-art":[155],"models":[157],"perform":[158],"capably":[159],"within":[160],"drops":[165],"average":[168],"roughly":[170],"26%":[171],"definitionally":[174],"Tao":[176],"formulation.":[177],"This":[178],"indicates":[179],"main":[182],"bottleneck":[183],"limited":[185],"generalization":[186],"across":[187],"frameworks":[189],"rather":[190],"than":[191],"task":[192],"difficulty.":[193],"TaoBench":[194],"thus":[195],"highlights":[196],"applicability,":[203],"provides":[205],"concrete":[207],"foundation":[208],"developing":[210],"testing":[212],"provers":[213],"better":[214],"aligned":[215],"with":[216],"research":[217],"mathematics.":[218]},"counts_by_year":[],"updated_date":"2026-03-17T07:05:13.627479","created_date":"2026-03-17T00:00:00"}
