{"id":"https://openalex.org/W7161721389","doi":"https://doi.org/10.48550/arxiv.2605.17278","title":"A2RBench: An Automatic Paradigm for Formally Verifiable Abstract Reasoning Benchmark Generation","display_name":"A2RBench: An Automatic Paradigm for Formally Verifiable Abstract Reasoning Benchmark Generation","publication_year":2026,"publication_date":"2026-05-17","ids":{"openalex":"https://openalex.org/W7161721389","doi":"https://doi.org/10.48550/arxiv.2605.17278"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.17278","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.17278","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.17278","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5084762321","display_name":"Qingchuan Ma","orcid":"https://orcid.org/0000-0001-8061-6320"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ma, Qingchuan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136466128","display_name":"Yuexiao Ma","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ma, Yuexiao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133170242","display_name":"Yongkang Xie","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xie, Yongkang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136491857","display_name":"Tianyu Xie","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xie, Tianyu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5054226277","display_name":"Xiawu Zheng","orcid":"https://orcid.org/0000-0002-6855-5403"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zheng, Xiawu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5136492270","display_name":"Rongrong Ji","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ji, Rongrong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":6,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.13989999890327454,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.13989999890327454,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10906","display_name":"AI-based Problem Solving and Planning","score":0.07209999859333038,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.06260000169277191,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/pipeline","display_name":"Pipeline (software)","score":0.5443000197410583},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.5249999761581421},{"id":"https://openalex.org/keywords/process","display_name":"Process (computing)","score":0.5006999969482422},{"id":"https://openalex.org/keywords/limiting","display_name":"Limiting","score":0.4846999943256378},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.45080000162124634},{"id":"https://openalex.org/keywords/generalization","display_name":"Generalization","score":0.4447000026702881},{"id":"https://openalex.org/keywords/mainstream","display_name":"Mainstream","score":0.41589999198913574}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5989999771118164},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.5443000197410583},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.5249999761581421},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.5006999969482422},{"id":"https://openalex.org/C188198153","wikidata":"https://www.wikidata.org/wiki/Q1613840","display_name":"Limiting","level":2,"score":0.4846999943256378},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.45080000162124634},{"id":"https://openalex.org/C177148314","wikidata":"https://www.wikidata.org/wiki/Q170084","display_name":"Generalization","level":2,"score":0.4447000026702881},{"id":"https://openalex.org/C2777617010","wikidata":"https://www.wikidata.org/wiki/Q18957","display_name":"Mainstream","level":2,"score":0.41589999198913574},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.39719998836517334},{"id":"https://openalex.org/C85847156","wikidata":"https://www.wikidata.org/wiki/Q59015987","display_name":"Verifiable secret sharing","level":3,"score":0.3767000138759613},{"id":"https://openalex.org/C30038468","wikidata":"https://www.wikidata.org/wiki/Q4354775","display_name":"Memorization","level":2,"score":0.37389999628067017},{"id":"https://openalex.org/C112930515","wikidata":"https://www.wikidata.org/wiki/Q4389547","display_name":"Risk analysis (engineering)","level":1,"score":0.34310001134872437},{"id":"https://openalex.org/C44725695","wikidata":"https://www.wikidata.org/wiki/Q288156","display_name":"Normative","level":2,"score":0.3312000036239624},{"id":"https://openalex.org/C195344581","wikidata":"https://www.wikidata.org/wiki/Q2555318","display_name":"Automated reasoning","level":2,"score":0.31380000710487366},{"id":"https://openalex.org/C159032336","wikidata":"https://www.wikidata.org/wiki/Q2488768","display_name":"Non-monotonic logic","level":2,"score":0.30149999260902405},{"id":"https://openalex.org/C206588197","wikidata":"https://www.wikidata.org/wiki/Q846574","display_name":"Reuse","level":2,"score":0.29989999532699585},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.2533999979496002}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.17278","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.17278","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.17278","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.17278","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Abstract":[0],"reasoning":[1,190],"ability":[2,21],"reflects":[3],"the":[4,61,72,112,117,166,189],"intelligence":[5],"and":[6,13,57,79,106,163],"generalization":[7],"capacity":[8],"of":[9,161,168,175,177],"LLMs":[10,64,75,135,157],"to":[11,84],"extract":[12],"apply":[14],"abstract":[15,140],"rules.":[16],"However,":[17,90],"accurately":[18],"measuring":[19,37],"this":[20],"remains":[22],"challenging:":[23],"existing":[24],"benchmarks":[25],"either":[26],"rely":[27],"on":[28,128,148],"expensive":[29],"manual":[30],"annotation,":[31],"limiting":[32],"their":[33,173],"scale,":[34],"or":[35],"risk":[36],"memorization":[38],"rather":[39],"than":[40],"genuine":[41,69],"reasoning.":[42],"To":[43,97],"address":[44],"this,":[45],"we":[46,100,131],"introduce":[47],"an":[48],"automated":[49],"pipeline":[50],"named":[51],"A2RBench,":[52],"encompassing":[53],"generation,":[54],"expansion,":[55],"evaluation,":[56],"analysis.":[58],"Specifically,":[59],"in":[60,71,139,165],"generation":[62],"stage,":[63,74],"create":[65],"diverse":[66],"tasks":[67],"demanding":[68],"reasoning;":[70],"expansion":[73],"reuse":[76],"validated":[77],"rules":[78],"expand":[80],"new":[81],"input":[82],"spaces":[83],"generate":[85],"task":[86],"variations,":[87],"achieving":[88],"scaling.":[89],"such":[91],"a":[92,103,122,149],"process":[93],"may":[94],"cause":[95],"hallucinations.":[96],"eliminate":[98],"it,":[99],"further":[101],"establish":[102],"theoretical":[104],"framework":[105],"prove":[107],"that":[108],"programmatic":[109],"verification--testing":[110],"whether":[111],"inverse":[113],"operation":[114,119],"perfectly":[115],"reverses":[116],"forward":[118],"(cycle":[120],"consistency)--guarantees":[121],"unique":[123],"solution.":[124],"Through":[125],"extensive":[126],"evaluations":[127],"mainstream":[129],"LLMs,":[130],"find:":[132],"(1)":[133],"Current":[134,156],"exhibit":[136],"fundamental":[137],"deficiencies":[138],"reasoning,":[141],"with":[142,183],"top":[143],"models":[144],"significantly":[145],"underperforming":[146],"humans":[147],"representative":[150],"subset":[151],"(39.8%":[152],"vs.":[153],"68.5%).":[154],"(2)":[155],"fall":[158],"far":[159],"short":[160],"2D":[162],"1D":[164],"complexity":[167,186],"generated":[169],"3D":[170],"tasks,":[171],"revealing":[172],"lack":[174],"understanding":[176],"high-dimensional":[178],"tasks.":[179],"(3)":[180],"Counterintuitively,":[181],"inputs":[182],"higher":[184],"information":[185],"can":[187],"simplify":[188],"process.":[191]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-05-20T00:00:00"}
