{"id":"https://openalex.org/W7154475718","doi":"https://doi.org/10.48550/arxiv.2604.11996","title":"Filtered Reasoning Score: Evaluating Reasoning Quality on a Model's Most-Confident Traces","display_name":"Filtered Reasoning Score: Evaluating Reasoning Quality on a Model's Most-Confident Traces","publication_year":2026,"publication_date":"2026-04-13","ids":{"openalex":"https://openalex.org/W7154475718","doi":"https://doi.org/10.48550/arxiv.2604.11996"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.11996","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.11996","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.11996","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5133700270","display_name":"Manas Pathak","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Pathak, Manas","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133646165","display_name":"Xingyao Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Xingyao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5079494654","display_name":"Shuozhe Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Shuozhe","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133647789","display_name":"Amy Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Amy","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5133717321","display_name":"Liu Leqi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Leqi, Liu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":5,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.32100000977516174,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.32100000977516174,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12026","display_name":"Explainable Artificial Intelligence (XAI)","score":0.13840000331401825,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13910","display_name":"Computational and Text Analysis Methods","score":0.0869000032544136,"subfield":{"id":"https://openalex.org/subfields/3300","display_name":"General Social Sciences"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/deductive-reasoning","display_name":"Deductive reasoning","score":0.6407999992370605},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.6251999735832214},{"id":"https://openalex.org/keywords/case-based-reasoning","display_name":"Case-based reasoning","score":0.6205999851226807},{"id":"https://openalex.org/keywords/correctness","display_name":"Correctness","score":0.6141999959945679},{"id":"https://openalex.org/keywords/reasoning-system","display_name":"Reasoning system","score":0.5468000173568726},{"id":"https://openalex.org/keywords/quality","display_name":"Quality (philosophy)","score":0.5461999773979187},{"id":"https://openalex.org/keywords/qualitative-reasoning","display_name":"Qualitative reasoning","score":0.5271999835968018},{"id":"https://openalex.org/keywords/model-based-reasoning","display_name":"Model-based reasoning","score":0.5248000025749207},{"id":"https://openalex.org/keywords/automated-reasoning","display_name":"Automated reasoning","score":0.4740000069141388},{"id":"https://openalex.org/keywords/verbal-reasoning","display_name":"Verbal reasoning","score":0.46299999952316284}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.660099983215332},{"id":"https://openalex.org/C97364631","wikidata":"https://www.wikidata.org/wiki/Q484284","display_name":"Deductive reasoning","level":2,"score":0.6407999992370605},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.6251999735832214},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6223999857902527},{"id":"https://openalex.org/C20162079","wikidata":"https://www.wikidata.org/wiki/Q1151406","display_name":"Case-based reasoning","level":2,"score":0.6205999851226807},{"id":"https://openalex.org/C55439883","wikidata":"https://www.wikidata.org/wiki/Q360812","display_name":"Correctness","level":2,"score":0.6141999959945679},{"id":"https://openalex.org/C89288958","wikidata":"https://www.wikidata.org/wiki/Q7301504","display_name":"Reasoning system","level":2,"score":0.5468000173568726},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.5461999773979187},{"id":"https://openalex.org/C83725634","wikidata":"https://www.wikidata.org/wiki/Q7268699","display_name":"Qualitative reasoning","level":2,"score":0.5271999835968018},{"id":"https://openalex.org/C37335422","wikidata":"https://www.wikidata.org/wiki/Q6888134","display_name":"Model-based reasoning","level":3,"score":0.5248000025749207},{"id":"https://openalex.org/C195344581","wikidata":"https://www.wikidata.org/wiki/Q2555318","display_name":"Automated reasoning","level":2,"score":0.4740000069141388},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.4690999984741211},{"id":"https://openalex.org/C36964233","wikidata":"https://www.wikidata.org/wiki/Q7920942","display_name":"Verbal reasoning","level":3,"score":0.46299999952316284},{"id":"https://openalex.org/C193221554","wikidata":"https://www.wikidata.org/wiki/Q5153664","display_name":"Commonsense reasoning","level":2,"score":0.4555000066757202},{"id":"https://openalex.org/C103057564","wikidata":"https://www.wikidata.org/wiki/Q4751139","display_name":"Analytic reasoning","level":3,"score":0.4480000138282776},{"id":"https://openalex.org/C4679612","wikidata":"https://www.wikidata.org/wiki/Q866298","display_name":"Aggregate (composite)","level":2,"score":0.44749999046325684},{"id":"https://openalex.org/C2777508537","wikidata":"https://www.wikidata.org/wiki/Q7936620","display_name":"Visual reasoning","level":2,"score":0.43799999356269836},{"id":"https://openalex.org/C30038468","wikidata":"https://www.wikidata.org/wiki/Q4354775","display_name":"Memorization","level":2,"score":0.41269999742507935},{"id":"https://openalex.org/C159032336","wikidata":"https://www.wikidata.org/wiki/Q2488768","display_name":"Non-monotonic logic","level":2,"score":0.40470001101493835},{"id":"https://openalex.org/C86827895","wikidata":"https://www.wikidata.org/wiki/Q7098582","display_name":"Opportunistic reasoning","level":4,"score":0.3939000070095062},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.35899999737739563},{"id":"https://openalex.org/C2985612853","wikidata":"https://www.wikidata.org/wiki/Q185816","display_name":"Analogical reasoning","level":3,"score":0.3564999997615814},{"id":"https://openalex.org/C183521366","wikidata":"https://www.wikidata.org/wiki/Q7256422","display_name":"Psychology of reasoning","level":4,"score":0.31150001287460327},{"id":"https://openalex.org/C2779346075","wikidata":"https://www.wikidata.org/wiki/Q7268763","display_name":"Quality Score","level":3,"score":0.2987000048160553},{"id":"https://openalex.org/C2781067378","wikidata":"https://www.wikidata.org/wiki/Q17027399","display_name":"Interpretability","level":2,"score":0.29490000009536743},{"id":"https://openalex.org/C43971567","wikidata":"https://www.wikidata.org/wiki/Q3142865","display_name":"Logical reasoning","level":2,"score":0.2928999960422516},{"id":"https://openalex.org/C2776325391","wikidata":"https://www.wikidata.org/wiki/Q6917865","display_name":"Motivated reasoning","level":3,"score":0.2847999930381775},{"id":"https://openalex.org/C115086926","wikidata":"https://www.wikidata.org/wiki/Q17004651","display_name":"Causal reasoning","level":3,"score":0.26809999346733093},{"id":"https://openalex.org/C86251818","wikidata":"https://www.wikidata.org/wiki/Q816754","display_name":"Benchmarking","level":2,"score":0.2648000121116638},{"id":"https://openalex.org/C107848011","wikidata":"https://www.wikidata.org/wiki/Q4680756","display_name":"Adaptive reasoning","level":4,"score":0.2615000009536743},{"id":"https://openalex.org/C97931131","wikidata":"https://www.wikidata.org/wiki/Q5282087","display_name":"Discriminative model","level":2,"score":0.2590000033378601},{"id":"https://openalex.org/C196083921","wikidata":"https://www.wikidata.org/wiki/Q7915758","display_name":"Variance (accounting)","level":2,"score":0.25429999828338623},{"id":"https://openalex.org/C156201811","wikidata":"https://www.wikidata.org/wiki/Q5418360","display_name":"Evidential reasoning approach","level":4,"score":0.2526000142097473}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.11996","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.11996","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.11996","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.11996","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Should":[0],"we":[1,72,78,115,176],"trust":[2],"Large":[3],"Language":[4],"Models":[5],"(LLMs)":[6],"with":[7,51,97,195,212],"high":[8,12],"accuracy?":[9],"LLMs":[10],"achieve":[11],"accuracy":[13,99,203,228,239],"on":[14,215,222],"reasoning":[15,27,54,88,118,122,185,208,224,230,245],"benchmarks,":[16,76,225],"but":[17],"correctness":[18],"alone":[19],"does":[20],"not":[21],"reveal":[22],"the":[23,26,85,156,178,189],"quality":[24,86,186],"of":[25,37,87,158],"used":[28],"to":[29,65,83,104,138,170,219],"produce":[30],"it.":[31],"This":[32],"highlights":[33],"a":[34,117,242],"fundamental":[35],"limitation":[36],"outcome-based":[38,81],"evaluation:":[39],"models":[40,50,96,197,211],"may":[41],"arrive":[42],"at":[43],"correct":[44,165],"answers":[45],"through":[46],"flawed":[47],"reasoning,":[48],"and":[49,100,109,131,163,229],"substantially":[52],"different":[53],"capabilities":[55],"can":[56,77],"nevertheless":[57],"exhibit":[58,204],"similar":[59,98],"benchmark":[60,217],"accuracy,":[61],"for":[62],"example":[63],"due":[64],"memorization":[66],"or":[67],"over-optimization.":[68],"In":[69],"this":[70,113,140],"paper,":[71],"ask:":[73],"given":[74],"existing":[75],"move":[79],"beyond":[80],"evaluation":[82,251],"assess":[84],"itself?":[89],"We":[90,247],"seek":[91],"metrics":[92],"that":[93,120,198,236],"(1)":[94],"differentiate":[95],"(2)":[101],"are":[102,167,199],"robust":[103],"variations":[105],"in":[106,152,207,226],"input":[107],"prompts":[108],"generation":[110],"configurations.":[111],"To":[112,173],"end,":[114],"propose":[116],"score":[119,141],"evaluates":[121],"traces":[123,166],"along":[124],"dimensions":[125],"such":[126],"as":[127],"faithfulness,":[128],"coherence,":[129],"utility,":[130],"factuality.":[132],"A":[133],"remaining":[134],"question":[135],"is":[136,149],"how":[137],"aggregate":[139],"across":[142],"multiple":[143],"sampled":[144],"traces.":[145,193],"Naively":[146],"averaging":[147],"them":[148],"undesirable,":[150],"particularly":[151],"long-horizon":[153],"settings,":[154],"where":[155],"number":[157],"possible":[159],"trajectories":[160],"grows":[161],"rapidly,":[162],"low-confidence":[164],"more":[168],"likely":[169],"be":[171],"coincidental.":[172],"address":[174],"this,":[175],"introduce":[177],"Filtered":[179],"Reasoning":[180],"Score":[181],"(FRS),":[182],"which":[183],"computes":[184],"using":[187],"only":[188],"top-K%":[190],"most":[191],"confident":[192],"Evaluating":[194],"FRS,":[196],"indistinguishable":[200],"under":[201],"standard":[202],"significant":[205],"differences":[206],"quality.":[209,231],"Moreover,":[210],"higher":[213],"FRS":[214,237],"one":[216],"tend":[218],"perform":[220],"better":[221],"other":[223],"both":[227],"Together,":[232],"these":[233],"findings":[234],"suggest":[235],"complements":[238],"by":[240],"capturing":[241],"model's":[243],"transferable":[244],"capabilities.":[246],"open":[248],"source":[249],"our":[250],"codebase:":[252],"https://github.com/Manas2006/benchmark_reproducibility.":[253]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-04-16T00:00:00"}
