{"id":"https://openalex.org/W7134065747","doi":"https://doi.org/10.48550/arxiv.2603.04738","title":"IF-RewardBench: Benchmarking Judge Models for Instruction-Following Evaluation","display_name":"IF-RewardBench: Benchmarking Judge Models for Instruction-Following Evaluation","publication_year":2026,"publication_date":"2026-03-05","ids":{"openalex":"https://openalex.org/W7134065747","doi":"https://doi.org/10.48550/arxiv.2603.04738"},"language":null,"primary_location":{"id":"pmh:doi:10.48550/arxiv.2603.04738","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":null,"any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5127941245","display_name":"Bosi Wen","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Wen, Bosi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128240640","display_name":"Yilin Niu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Niu, Yilin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128252368","display_name":"Cunxiang Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Cunxiang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128220610","display_name":"Xiaoying Ling","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ling, Xiaoying","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128272035","display_name":"Ying Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Ying","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5104345548","display_name":"Pei Ke","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ke, Pei","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128264772","display_name":"Hongning Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Hongning","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5128264261","display_name":"Minlie Huang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Huang, Minlie","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":8,"corresponding_author_ids":["https://openalex.org/A5127941245"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.23010000586509705,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.23010000586509705,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.19910000264644623,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11902","display_name":"Intelligent Tutoring Systems and Adaptive Learning","score":0.09650000184774399,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/benchmarking","display_name":"Benchmarking","score":0.8047999739646912},{"id":"https://openalex.org/keywords/pairwise-comparison","display_name":"Pairwise comparison","score":0.7793999910354614},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.6359000205993652},{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.4794999957084656},{"id":"https://openalex.org/keywords/matching","display_name":"Matching (statistics)","score":0.448199987411499},{"id":"https://openalex.org/keywords/constraint","display_name":"Constraint (computer-aided design)","score":0.435699999332428},{"id":"https://openalex.org/keywords/construct","display_name":"Construct (python library)","score":0.4341000020503998},{"id":"https://openalex.org/keywords/reliability","display_name":"Reliability (semiconductor)","score":0.4115999937057495}],"concepts":[{"id":"https://openalex.org/C86251818","wikidata":"https://www.wikidata.org/wiki/Q816754","display_name":"Benchmarking","level":2,"score":0.8047999739646912},{"id":"https://openalex.org/C184898388","wikidata":"https://www.wikidata.org/wiki/Q1435712","display_name":"Pairwise comparison","level":2,"score":0.7793999910354614},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7253999710083008},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.6359000205993652},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.5533000230789185},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.4794999957084656},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.4749999940395355},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.448199987411499},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.44110000133514404},{"id":"https://openalex.org/C2776036281","wikidata":"https://www.wikidata.org/wiki/Q48769818","display_name":"Constraint (computer-aided design)","level":2,"score":0.435699999332428},{"id":"https://openalex.org/C2780801425","wikidata":"https://www.wikidata.org/wiki/Q5164392","display_name":"Construct (python library)","level":2,"score":0.4341000020503998},{"id":"https://openalex.org/C43214815","wikidata":"https://www.wikidata.org/wiki/Q7310987","display_name":"Reliability (semiconductor)","level":3,"score":0.4115999937057495},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.4108000099658966},{"id":"https://openalex.org/C132525143","wikidata":"https://www.wikidata.org/wiki/Q141488","display_name":"Graph","level":2,"score":0.37540000677108765},{"id":"https://openalex.org/C67186912","wikidata":"https://www.wikidata.org/wiki/Q367664","display_name":"Data modeling","level":2,"score":0.34450000524520874},{"id":"https://openalex.org/C164226766","wikidata":"https://www.wikidata.org/wiki/Q7293202","display_name":"Rank (graph theory)","level":2,"score":0.319599986076355},{"id":"https://openalex.org/C49937458","wikidata":"https://www.wikidata.org/wiki/Q2599292","display_name":"Probabilistic logic","level":2,"score":0.3057999908924103},{"id":"https://openalex.org/C2781249084","wikidata":"https://www.wikidata.org/wiki/Q908656","display_name":"Preference","level":2,"score":0.29429998993873596},{"id":"https://openalex.org/C31266012","wikidata":"https://www.wikidata.org/wiki/Q6554340","display_name":"Linkage (software)","level":3,"score":0.2892000079154968},{"id":"https://openalex.org/C197947376","wikidata":"https://www.wikidata.org/wiki/Q5155608","display_name":"Comparability","level":2,"score":0.2694999873638153},{"id":"https://openalex.org/C152124472","wikidata":"https://www.wikidata.org/wiki/Q1204361","display_name":"Redundancy (engineering)","level":2,"score":0.2540000081062317}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:doi:10.48550/arxiv.2603.04738","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},{"id":"doi:10.48550/arxiv.2603.04738","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.04738","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:doi:10.48550/arxiv.2603.04738","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"sustainable_development_goals":[{"display_name":"Quality Education","score":0.6092081665992737,"id":"https://metadata.un.org/sdg/4"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Instruction-following":[0],"is":[1,115],"a":[2,64,82,99,138],"foundational":[3],"capability":[4],"of":[5,25,37,107],"large":[6],"language":[7],"models":[8,28,109,131],"(LLMs),":[9],"with":[10,54,142],"its":[11],"improvement":[12],"hinging":[13],"on":[14,93,123],"scalable":[15],"and":[16,47,74,132,152],"accurate":[17],"feedback":[18],"from":[19],"judge":[20,27,108,130],"models.":[21],"However,":[22],"the":[23,105],"reliability":[24],"current":[26,129],"in":[29,117,128],"instruction-following":[30,69,94],"remains":[31],"underexplored":[32],"due":[33],"to":[34,110,147],"several":[35],"deficiencies":[36,127],"existing":[38,148],"meta-evaluation":[39,66],"benchmarks,":[40],"such":[41],"as":[42],"their":[43],"insufficient":[44],"data":[45,153],"coverage":[46],"oversimplified":[48],"pairwise":[49,87],"evaluation":[50,101],"paradigms":[51],"that":[52,70,103,134],"misalign":[53],"model":[55,119],"optimization":[56],"scenarios.":[57],"To":[58],"this":[59],"end,":[60],"we":[61,80],"propose":[62],"IF-RewardBench,":[63],"comprehensive":[65],"benchmark":[67,136],"for":[68],"covers":[71],"diverse":[72],"instruction":[73],"constraint":[75],"types.":[76],"For":[77],"each":[78],"instruction,":[79],"construct":[81],"preference":[83],"graph":[84],"containing":[85],"all":[86],"preferences":[88],"among":[89],"multiple":[90,112],"responses":[91],"based":[92],"quality.":[95],"This":[96],"design":[97],"enables":[98],"listwise":[100],"paradigm":[102],"assesses":[104],"capabilities":[106],"rank":[111],"responses,":[113],"which":[114],"essential":[116],"guiding":[118],"alignment.":[120],"Extensive":[121],"experiments":[122],"IF-RewardBench":[124],"reveal":[125],"significant":[126],"demonstrate":[133],"our":[135],"achieves":[137],"stronger":[139],"positive":[140],"correlation":[141],"downstream":[143],"task":[144],"performance":[145],"compared":[146],"benchmarks.":[149],"Our":[150],"codes":[151],"are":[154],"available":[155],"at":[156],"https://github.com/thu-coai/IF-RewardBench.":[157]},"counts_by_year":[],"updated_date":"2026-04-04T16:13:02.066488","created_date":"2026-03-07T00:00:00"}
