{"id":"https://openalex.org/W7133338800","doi":"https://doi.org/10.48550/arxiv.2603.02203","title":"Tool Verification for Test-Time Reinforcement Learning","display_name":"Tool Verification for Test-Time Reinforcement Learning","publication_year":2026,"publication_date":"2026-03-02","ids":{"openalex":"https://openalex.org/W7133338800","doi":"https://doi.org/10.48550/arxiv.2603.02203"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.02203","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.02203","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.02203","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5127990952","display_name":"Ruotong Liao","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Liao, Ruotong","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5119990135","display_name":"Nikolai R\u00f6hrich","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"R\u00f6hrich, Nikolai","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5127947596","display_name":"Xiaohan Wang (691917)","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Xiaohan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5127939370","display_name":"Yuhui Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Yuhui","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5127908709","display_name":"Yasaman Samadzadeh","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Samadzadeh, Yasaman","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128001805","display_name":"Volker Tresp","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tresp, Volker","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5093429554","display_name":"Serena Yeung-Levy","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yeung-Levy, Serena","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5127990952"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10462","display_name":"Reinforcement Learning in Robotics","score":0.2522999942302704,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10462","display_name":"Reinforcement Learning in Robotics","score":0.2522999942302704,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11975","display_name":"Evolutionary Algorithms and Applications","score":0.14630000293254852,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.07980000227689743,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/reinforcement-learning","display_name":"Reinforcement learning","score":0.8400999903678894},{"id":"https://openalex.org/keywords/spurious-relationship","display_name":"Spurious relationship","score":0.7646999955177307},{"id":"https://openalex.org/keywords/adaptation","display_name":"Adaptation (eye)","score":0.5367000102996826},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.5055000185966492},{"id":"https://openalex.org/keywords/code","display_name":"Code (set theory)","score":0.47749999165534973},{"id":"https://openalex.org/keywords/reinforcement","display_name":"Reinforcement","score":0.47290000319480896}],"concepts":[{"id":"https://openalex.org/C97541855","wikidata":"https://www.wikidata.org/wiki/Q830687","display_name":"Reinforcement learning","level":2,"score":0.8400999903678894},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7717000246047974},{"id":"https://openalex.org/C97256817","wikidata":"https://www.wikidata.org/wiki/Q1462316","display_name":"Spurious relationship","level":2,"score":0.7646999955177307},{"id":"https://openalex.org/C139807058","wikidata":"https://www.wikidata.org/wiki/Q352374","display_name":"Adaptation (eye)","level":2,"score":0.5367000102996826},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5325000286102295},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.5055000185966492},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.47749999165534973},{"id":"https://openalex.org/C67203356","wikidata":"https://www.wikidata.org/wiki/Q1321905","display_name":"Reinforcement","level":2,"score":0.47290000319480896},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.41499999165534973},{"id":"https://openalex.org/C48677424","wikidata":"https://www.wikidata.org/wiki/Q6888088","display_name":"Mode (computer interface)","level":2,"score":0.41429999470710754},{"id":"https://openalex.org/C89611455","wikidata":"https://www.wikidata.org/wiki/Q6804646","display_name":"Mechanism (biology)","level":2,"score":0.38019999861717224},{"id":"https://openalex.org/C66283442","wikidata":"https://www.wikidata.org/wiki/Q1389268","display_name":"Failure mode and effects analysis","level":2,"score":0.35019999742507935},{"id":"https://openalex.org/C43126263","wikidata":"https://www.wikidata.org/wiki/Q128751","display_name":"Source code","level":2,"score":0.3264000117778778},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.2989000082015991},{"id":"https://openalex.org/C2777267654","wikidata":"https://www.wikidata.org/wiki/Q3519023","display_name":"Test (biology)","level":2,"score":0.28290000557899475}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.02203","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.02203","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.02203","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.02203","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Test-time":[0],"reinforcement":[1],"learning":[2],"(TTRL)":[3],"has":[4],"emerged":[5],"as":[6,76,126,135],"a":[7,30,38,70,87,136],"promising":[8],"paradigm":[9],"for":[10,57,94,139],"self-evolving":[11],"large":[12],"reasoning":[13],"models":[14],"(LRMs),":[15],"enabling":[16],"online":[17,128],"adaptation":[18],"on":[19,117],"unlabeled":[20],"test":[21],"inputs":[22],"via":[23],"self-induced":[24],"rewards":[25],"through":[26],"majority":[27],"voting.":[28],"However,":[29],"spurious":[31],"yet":[32],"high-frequency":[33],"unverified":[34],"consensus":[35],"can":[36,123],"become":[37],"biased":[39],"and":[40,102,105],"reinforced":[41],"reward":[42,67],"signal,":[43],"leading":[44],"to":[45,82],"incorrect":[46],"mode":[47,53],"collapse.":[48],"We":[49],"address":[50],"this":[51],"failure":[52],"with":[54,114],"T^3RL":[55,109,122],"(Tool-Verification":[56],"Test-Time":[58],"Reinforcement":[59],"Learning),":[60],"which":[61],"introduces":[62],"test-time":[63,132],"tool":[64,75,133],"verification":[65,134],"into":[66],"estimation.":[68],"Concretely,":[69],"verifier":[71],"uses":[72],"an":[73],"external":[74],"evidence":[77],"(e.g.,":[78],"from":[79],"code":[80],"execution)":[81],"upweight":[83],"verified":[84,127],"rollouts":[85],"in":[86],"verification-aware":[88],"voting,":[89],"producing":[90],"more":[91],"reliable":[92],"pseudo-labels":[93],"training.":[95],"Across":[96],"various":[97],"math":[98],"difficulties":[99],"(MATH-500,":[100],"AMC,":[101],"AIME":[103],"2024)":[104],"diverse":[106],"backbone":[107],"types,":[108],"significantly":[110],"improves":[111],"over":[112],"TTRL,":[113],"larger":[115],"gains":[116],"harder":[118],"problems.":[119],"More":[120],"broadly,":[121],"be":[124],"viewed":[125],"data":[129],"synthesis,":[130],"highlighting":[131],"key":[137],"mechanism":[138],"stabilizing":[140],"self-evolution.":[141]},"counts_by_year":[],"updated_date":"2026-03-04T07:09:34.246503","created_date":"2026-03-04T00:00:00"}
