{"id":"https://openalex.org/W7135048763","doi":"https://doi.org/10.48550/arxiv.2603.10960","title":"Ranking Reasoning LLMs under Test-Time Scaling","display_name":"Ranking Reasoning LLMs under Test-Time Scaling","publication_year":2026,"publication_date":"2026-03-11","ids":{"openalex":"https://openalex.org/W7135048763","doi":"https://doi.org/10.48550/arxiv.2603.10960"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.10960","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.10960","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.10960","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5113375240","display_name":"Mohsen Hariri","orcid":"https://orcid.org/0000-0003-0731-4275"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Hariri, Mohsen","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5051732988","display_name":"Michael Hinczewski","orcid":"https://orcid.org/0000-0003-2837-7697"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hinczewski, Michael","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128806157","display_name":"Jing Ma","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ma, Jing","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5128874911","display_name":"Vipin Chaudhary","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chaudhary, Vipin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5113375240"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10467","display_name":"Psychometric Methodologies and Testing","score":0.2531000077724457,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T10467","display_name":"Psychometric Methodologies and Testing","score":0.2531000077724457,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.12770000100135803,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11273","display_name":"Advanced Graph Neural Networks","score":0.0934000015258789,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/ranking","display_name":"Ranking (information retrieval)","score":0.8206999897956848},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.6424000263214111},{"id":"https://openalex.org/keywords/bayesian-probability","display_name":"Bayesian probability","score":0.5533000230789185},{"id":"https://openalex.org/keywords/voting","display_name":"Voting","score":0.4999000132083893},{"id":"https://openalex.org/keywords/scaling","display_name":"Scaling","score":0.4909000098705292},{"id":"https://openalex.org/keywords/sampling","display_name":"Sampling (signal processing)","score":0.4758000075817108},{"id":"https://openalex.org/keywords/variance","display_name":"Variance (accounting)","score":0.4408999979496002},{"id":"https://openalex.org/keywords/greedy-algorithm","display_name":"Greedy algorithm","score":0.3978999853134155}],"concepts":[{"id":"https://openalex.org/C189430467","wikidata":"https://www.wikidata.org/wiki/Q7293293","display_name":"Ranking (information retrieval)","level":2,"score":0.8206999897956848},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.6424000263214111},{"id":"https://openalex.org/C107673813","wikidata":"https://www.wikidata.org/wiki/Q812534","display_name":"Bayesian probability","level":2,"score":0.5533000230789185},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5205000042915344},{"id":"https://openalex.org/C520049643","wikidata":"https://www.wikidata.org/wiki/Q189760","display_name":"Voting","level":3,"score":0.4999000132083893},{"id":"https://openalex.org/C99844830","wikidata":"https://www.wikidata.org/wiki/Q102441924","display_name":"Scaling","level":2,"score":0.4909000098705292},{"id":"https://openalex.org/C140779682","wikidata":"https://www.wikidata.org/wiki/Q210868","display_name":"Sampling (signal processing)","level":3,"score":0.4758000075817108},{"id":"https://openalex.org/C196083921","wikidata":"https://www.wikidata.org/wiki/Q7915758","display_name":"Variance (accounting)","level":2,"score":0.4408999979496002},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4011000096797943},{"id":"https://openalex.org/C51823790","wikidata":"https://www.wikidata.org/wiki/Q504353","display_name":"Greedy algorithm","level":2,"score":0.3978999853134155},{"id":"https://openalex.org/C149782125","wikidata":"https://www.wikidata.org/wiki/Q160039","display_name":"Econometrics","level":1,"score":0.3921000063419342},{"id":"https://openalex.org/C2781249084","wikidata":"https://www.wikidata.org/wiki/Q908656","display_name":"Preference","level":2,"score":0.37220001220703125},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3653999865055084},{"id":"https://openalex.org/C49937458","wikidata":"https://www.wikidata.org/wiki/Q2599292","display_name":"Probabilistic logic","level":2,"score":0.35740000009536743},{"id":"https://openalex.org/C160234255","wikidata":"https://www.wikidata.org/wiki/Q812535","display_name":"Bayesian inference","level":3,"score":0.3564999997615814},{"id":"https://openalex.org/C105795698","wikidata":"https://www.wikidata.org/wiki/Q12483","display_name":"Statistics","level":1,"score":0.34200000762939453},{"id":"https://openalex.org/C198531522","wikidata":"https://www.wikidata.org/wiki/Q485146","display_name":"Sample (material)","level":2,"score":0.3377000093460083},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.31360000371932983},{"id":"https://openalex.org/C124975894","wikidata":"https://www.wikidata.org/wiki/Q7293290","display_name":"Ranking SVM","level":3,"score":0.3084000051021576},{"id":"https://openalex.org/C101112237","wikidata":"https://www.wikidata.org/wiki/Q4874481","display_name":"Bayesian statistics","level":4,"score":0.3034999966621399},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.2955000102519989},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.29420000314712524},{"id":"https://openalex.org/C57273362","wikidata":"https://www.wikidata.org/wiki/Q576722","display_name":"Decoding methods","level":2,"score":0.28859999775886536},{"id":"https://openalex.org/C120936955","wikidata":"https://www.wikidata.org/wiki/Q2155640","display_name":"Empirical research","level":2,"score":0.2822999954223633},{"id":"https://openalex.org/C150921843","wikidata":"https://www.wikidata.org/wiki/Q1170431","display_name":"Resampling","level":2,"score":0.2696000039577484},{"id":"https://openalex.org/C129848803","wikidata":"https://www.wikidata.org/wiki/Q2564360","display_name":"Sample size determination","level":2,"score":0.26019999384880066}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.10960","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.10960","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.10960","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.10960","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"display_name":"Decent work and economic growth","id":"https://metadata.un.org/sdg/8","score":0.452282190322876}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Test-time":[0],"scaling":[1,26],"evaluates":[2],"reasoning":[3,55],"LLMs":[4],"by":[5,118],"sampling":[6,128],"multiple":[7],"outputs":[8],"per":[9],"prompt,":[10],"but":[11,120],"ranking":[12,23,35,134],"models":[13,56],"in":[14],"this":[15],"regime":[16],"remains":[17],"underexplored.":[18],"We":[19,143],"formalize":[20],"dense":[21],"benchmark":[22],"under":[24],"test-time":[25,141],"and":[27,48,50,65,87,126,139],"introduce":[28],"Scorio,":[29],"a":[30],"library":[31,149],"that":[32],"implements":[33],"statistical":[34],"methods":[36,89,101,135],"such":[37],"as":[38,109,146],"paired-comparison":[39],"models,":[40,45],"item":[41],"response":[42],"theory":[43],"(IRT)":[44],"voting":[46],"rules,":[47],"graph-":[49],"spectral-based":[51],"methods.":[52],"Across":[53],"$20$":[54],"on":[57],"four":[58],"Olympiad-style":[59],"math":[60],"benchmarks":[61],"(AIME'24,":[62],"AIME'25,":[63],"HMMT'25,":[64],"BrUMO'25;":[66],"up":[67],"to":[68],"$N=80$":[69],"trials),":[70],"most":[71],"full-trial":[72],"rankings":[73,123],"agree":[74],"closely":[75],"with":[76],"the":[77,92,96,99],"Bayesian":[78],"gold":[79],"standard":[80],"$\\mathrm{Bayes}_{\\mathcal{U}}@80$":[81],"(mean":[82],"Kendall's":[83],"$\u03c4_b":[84,103],"=":[85],"0.93$--$0.95$),":[86],"$19$--$34$":[88],"recover":[90],"exactly":[91],"same":[93],"ordering.":[94],"In":[95],"single-trial":[97],"regime,":[98],"best":[100],"reach":[102],"\\approx":[104],"0.86$.":[105],"Using":[106],"greedy":[107,125],"decoding":[108],"an":[110,147],"empirical":[111],"prior":[112],"($\\mathrm{Bayes}_{\\mathbf{R}_0}@N$)":[113],"reduces":[114],"variance":[115],"at":[116,150],"$N=1$":[117],"$16$--$52\\%$,":[119],"can":[121],"bias":[122],"when":[124],"stochastic":[127],"disagree.":[129],"These":[130],"results":[131],"identify":[132],"reliable":[133],"for":[136],"both":[137],"high-":[138],"low-budget":[140],"scaling.":[142],"release":[144],"Scorio":[145],"open-source":[148],"https://github.com/mohsenhariri/scorio.":[151]},"counts_by_year":[],"updated_date":"2026-05-05T08:41:31.759640","created_date":"2026-03-13T00:00:00"}
