{"id":"https://openalex.org/W4400435090","doi":"https://doi.org/10.48550/arxiv.2407.04622","title":"On scalable oversight with weak LLMs judging strong LLMs","display_name":"On scalable oversight with weak LLMs judging strong LLMs","publication_year":2024,"publication_date":"2024-07-05","ids":{"openalex":"https://openalex.org/W4400435090","doi":"https://doi.org/10.48550/arxiv.2407.04622"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2407.04622","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2407.04622","pdf_url":"https://arxiv.org/pdf/2407.04622","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2407.04622","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5069438696","display_name":"Zachary Kenton","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Kenton, Zachary","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100258054","display_name":"Noah Y. Siegel","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Siegel, Noah Y.","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5027979882","display_name":"J\u00e1nos Kram\u00e1r","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kram\u00e1r, J\u00e1nos","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100258055","display_name":"Jonah Brown-Cohen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Brown-Cohen, Jonah","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100258056","display_name":"Samuel Albanie","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Albanie, Samuel","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100258057","display_name":"Jannis Bulian","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bulian, Jannis","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100258058","display_name":"Rishabh Agarwal","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Agarwal, Rishabh","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5059597480","display_name":"David Lindner","orcid":"https://orcid.org/0000-0001-7051-7433"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lindner, David","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100258059","display_name":"Yunhao Tang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tang, Yunhao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100258053","display_name":"Noah D. Goodman","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Goodman, Noah D.","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5012971694","display_name":"Rohin Shah","orcid":"https://orcid.org/0000-0002-0656-2800"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shah, Rohin","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":11,"corresponding_author_ids":["https://openalex.org/A5069438696"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":true,"cited_by_count":6,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10456","display_name":"Multi-Agent Systems and Negotiation","score":0.9304999709129333,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10456","display_name":"Multi-Agent Systems and Negotiation","score":0.9304999709129333,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11182","display_name":"Auction Theory and Applications","score":0.9271000027656555,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T10927","display_name":"Access Control and Trust","score":0.9071000218391418,"subfield":{"id":"https://openalex.org/subfields/3312","display_name":"Sociology and Political Science"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/business","display_name":"Business","score":0.37640172243118286},{"id":"https://openalex.org/keywords/political-science","display_name":"Political science","score":0.35674577951431274}],"concepts":[{"id":"https://openalex.org/C144133560","wikidata":"https://www.wikidata.org/wiki/Q4830453","display_name":"Business","level":0,"score":0.37640172243118286},{"id":"https://openalex.org/C17744445","wikidata":"https://www.wikidata.org/wiki/Q36442","display_name":"Political science","level":0,"score":0.35674577951431274}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2407.04622","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2407.04622","pdf_url":"https://arxiv.org/pdf/2407.04622","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},{"id":"doi:10.48550/arxiv.2407.04622","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2407.04622","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2407.04622","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2407.04622","pdf_url":"https://arxiv.org/pdf/2407.04622","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":true,"grobid_xml":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4400435090.pdf","grobid_xml":"https://content.openalex.org/works/W4400435090.grobid-xml"},"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2748952813","https://openalex.org/W2949263084","https://openalex.org/W2743539335","https://openalex.org/W594353338","https://openalex.org/W2765153054","https://openalex.org/W2922049016","https://openalex.org/W4390697879","https://openalex.org/W2596173151","https://openalex.org/W2070214669"],"abstract_inverted_index":{"Scalable":[0],"oversight":[1],"protocols":[2],"aim":[3],"to":[4,7,22,32,41,76,106,131,139,180,187,192],"enable":[5],"humans":[6],"accurately":[8],"supervise":[9],"superhuman":[10],"AI.":[11,55],"In":[12],"this":[13],"paper":[14],"we":[15,184,195,212],"study":[16],"debate,":[17],"where":[18,27,47],"two":[19],"AI's":[20],"compete":[21],"convince":[23,33],"a":[24,28,34,42,85,98],"judge;":[25],"consultancy,":[26],"single":[29,99],"AI":[30,64],"tries":[31],"judge":[35,49,74,219],"that":[36,118,214],"asks":[37],"questions;":[38],"and":[39,66,92,112],"compare":[40],"baseline":[43],"of":[44,88,149],"direct":[45,140,160],"question-answering,":[46],"the":[48,54,73,126,134,143,147,170,203],"just":[50],"answers":[51],"outright":[52],"without":[53,167],"We":[56,82,116],"use":[57],"large":[58],"language":[59],"models":[60,75,217],"(LLMs)":[61],"as":[62,67],"both":[63],"agents":[65],"stand-ins":[68],"for":[69,133],"human":[70],"judges,":[71],"taking":[72],"be":[77],"weaker":[78],"than":[79,208,224],"agent":[80],"models.":[81],"benchmark":[83],"on":[84,97,146],"diverse":[86],"range":[87],"asymmetries":[89],"between":[90],"judges":[91,197],"agents,":[93],"extending":[94],"previous":[95,226],"work":[96,175],"extractive":[100,152],"QA":[101,153],"task":[102],"with":[103,155],"information":[104,156,168],"asymmetry,":[105],"also":[107],"include":[108],"mathematics,":[109],"coding,":[110],"logic":[111],"multimodal":[113],"reasoning":[114],"asymmetries.":[115],"find":[117,196,213],"debate":[119,138,158,207],"outperforms":[120,159],"consultancy":[121],"across":[122],"all":[123],"tasks":[124,154,166],"when":[125],"consultant":[127],"is":[128],"randomly":[129],"assigned":[130,176],"argue":[132,181,193],"correct/incorrect":[135],"answer.":[136],"Comparing":[137],"question":[141,161],"answering,":[142,162],"results":[144,171],"depend":[145],"type":[148],"task:":[150],"in":[151,164,206,209,225],"asymmetry":[157,169],"but":[163],"other":[165],"are":[172,198],"mixed.":[173],"Previous":[174],"debaters/consultants":[177],"an":[178],"answer":[179,191,205],"for.":[182],"When":[183],"allow":[185],"them":[186],"instead":[188],"choose":[189],"which":[190],"for,":[194],"less":[199],"frequently":[200],"convinced":[201],"by":[202],"wrong":[204],"consultancy.":[210],"Further,":[211],"stronger":[215],"debater":[216],"increase":[218],"accuracy,":[220],"though":[221],"more":[222],"modestly":[223],"studies.":[227]},"counts_by_year":[{"year":2025,"cited_by_count":4},{"year":2024,"cited_by_count":2}],"updated_date":"2026-03-13T16:22:10.518609","created_date":"2025-10-10T00:00:00"}
