{"id":"https://openalex.org/W4296413526","doi":"https://doi.org/10.48550/arxiv.2209.07858","title":"Red Teaming Language Models to Reduce Harms: Methods, Scaling Behaviors, and Lessons Learned","display_name":"Red Teaming Language Models to Reduce Harms: Methods, Scaling Behaviors, and Lessons Learned","publication_year":2022,"publication_date":"2022-08-23","ids":{"openalex":"https://openalex.org/W4296413526","doi":"https://doi.org/10.48550/arxiv.2209.07858"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2209.07858","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2209.07858","pdf_url":"https://arxiv.org/pdf/2209.07858","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2209.07858","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5006294201","display_name":"Deep Ganguli","orcid":"https://orcid.org/0009-0007-9435-3817"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Ganguli, Deep","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5012135186","display_name":"Liane Lovitt","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lovitt, Liane","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5074760713","display_name":"Jackson Kernion","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kernion, Jackson","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5030305998","display_name":"Amanda Askell","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Askell, Amanda","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5091860006","display_name":"Yuntao Bai","orcid":"https://orcid.org/0000-0003-3998-7837"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bai, Yuntao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5061506488","display_name":"Saurav Kadavath","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kadavath, Saurav","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5110949689","display_name":"Ben Mann","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Mann, Ben","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5091112967","display_name":"Ethan Perez","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Perez, Ethan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5050348824","display_name":"Nicholas Schiefer","orcid":"https://orcid.org/0000-0002-3065-6399"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Schiefer, Nicholas","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5028970835","display_name":"Kamal Ndousse","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ndousse, Kamal","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5011636042","display_name":"Andy Jones","orcid":"https://orcid.org/0000-0002-3130-9313"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jones, Andy","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5018067914","display_name":"Sam Bowman","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bowman, Sam","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5056436767","display_name":"Anna Chen","orcid":"https://orcid.org/0000-0002-5946-6572"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Anna","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5010209743","display_name":"Tom Conerly","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Conerly, Tom","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5002341550","display_name":"Nova DasSarma","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"DasSarma, Nova","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5059021264","display_name":"Dawn Drain","orcid":"https://orcid.org/0000-0002-6606-4141"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Drain, Dawn","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5020683620","display_name":"Nelson Elhage","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Elhage, Nelson","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5049068767","display_name":"Sheer El-Showk","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"El-Showk, Sheer","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5082696325","display_name":"Stanislav Fort","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Fort, Stanislav","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5043478841","display_name":"Zac Hatfield Dodds","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hatfield-Dodds, Zac","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5049786610","display_name":"Tom Henighan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Henighan, Tom","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5046599292","display_name":"Danny Hernandez","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hernandez, Danny","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5026832201","display_name":"Tristan Hume","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hume, Tristan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5083600526","display_name":"Josh Jacobson","orcid":"https://orcid.org/0000-0003-4418-2208"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jacobson, Josh","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5041888075","display_name":"Scott G. Johnston","orcid":"https://orcid.org/0000-0002-5826-5613"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Johnston, Scott","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5009112681","display_name":"Shauna Kravec","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kravec, Shauna","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5108225540","display_name":"Catherine Olsson","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Olsson, Catherine","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5017610025","display_name":"Sam Ringer","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ringer, Sam","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5026326973","display_name":"Eli Tran-Johnson","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tran-Johnson, Eli","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5066197394","display_name":"Dario Amodei","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Amodei, Dario","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5074565235","display_name":"Tom Brown","orcid":"https://orcid.org/0000-0002-6538-3036"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Brown, Tom","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5032088236","display_name":"Nicholas Joseph","orcid":"https://orcid.org/0000-0002-1972-0783"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Joseph, Nicholas","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5054887773","display_name":"Sam McCandlish","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"McCandlish, Sam","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5039751155","display_name":"Chris Olah","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Olah, Chris","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5053213601","display_name":"Jared Kaplan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kaplan, Jared","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5012713248","display_name":"Jack A. Clark","orcid":"https://orcid.org/0000-0002-7424-1670"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Clark, Jack","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":36,"corresponding_author_ids":["https://openalex.org/A5006294201"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":98,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12262","display_name":"Hate Speech and Cyberbullying Detection","score":0.9861000180244446,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12262","display_name":"Hate Speech and Cyberbullying Detection","score":0.9861000180244446,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10994","display_name":"Terrorism, Counterterrorism, and Political Violence","score":0.9151999950408936,"subfield":{"id":"https://openalex.org/subfields/3312","display_name":"Sociology and Political Science"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/offensive","display_name":"Offensive","score":0.7943724989891052},{"id":"https://openalex.org/keywords/variety","display_name":"Variety (cybernetics)","score":0.7473785281181335},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6858421564102173},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.5833725929260254},{"id":"https://openalex.org/keywords/scale","display_name":"Scale (ratio)","score":0.5578718781471252},{"id":"https://openalex.org/keywords/transparency","display_name":"Transparency (behavior)","score":0.5129925608634949},{"id":"https://openalex.org/keywords/reinforcement-learning","display_name":"Reinforcement learning","score":0.454934686422348},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.3897583782672882},{"id":"https://openalex.org/keywords/operations-research","display_name":"Operations research","score":0.26645588874816895},{"id":"https://openalex.org/keywords/computer-security","display_name":"Computer security","score":0.20644089579582214},{"id":"https://openalex.org/keywords/engineering","display_name":"Engineering","score":0.11776718497276306}],"concepts":[{"id":"https://openalex.org/C176856949","wikidata":"https://www.wikidata.org/wiki/Q2001676","display_name":"Offensive","level":2,"score":0.7943724989891052},{"id":"https://openalex.org/C136197465","wikidata":"https://www.wikidata.org/wiki/Q1729295","display_name":"Variety (cybernetics)","level":2,"score":0.7473785281181335},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6858421564102173},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.5833725929260254},{"id":"https://openalex.org/C2778755073","wikidata":"https://www.wikidata.org/wiki/Q10858537","display_name":"Scale (ratio)","level":2,"score":0.5578718781471252},{"id":"https://openalex.org/C2780233690","wikidata":"https://www.wikidata.org/wiki/Q535347","display_name":"Transparency (behavior)","level":2,"score":0.5129925608634949},{"id":"https://openalex.org/C97541855","wikidata":"https://www.wikidata.org/wiki/Q830687","display_name":"Reinforcement learning","level":2,"score":0.454934686422348},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3897583782672882},{"id":"https://openalex.org/C42475967","wikidata":"https://www.wikidata.org/wiki/Q194292","display_name":"Operations research","level":1,"score":0.26645588874816895},{"id":"https://openalex.org/C38652104","wikidata":"https://www.wikidata.org/wiki/Q3510521","display_name":"Computer security","level":1,"score":0.20644089579582214},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.11776718497276306},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2209.07858","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2209.07858","pdf_url":"https://arxiv.org/pdf/2209.07858","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},{"id":"doi:10.48550/arxiv.2209.07858","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2209.07858","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2209.07858","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2209.07858","pdf_url":"https://arxiv.org/pdf/2209.07858","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"sustainable_development_goals":[{"score":0.699999988079071,"display_name":"Peace, Justice and strong institutions","id":"https://metadata.un.org/sdg/16"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W1568520348","https://openalex.org/W3214407891","https://openalex.org/W3194113117","https://openalex.org/W4287020359","https://openalex.org/W3213194066","https://openalex.org/W268355439","https://openalex.org/W2967125893","https://openalex.org/W4385323698","https://openalex.org/W2385362579","https://openalex.org/W2380993274"],"abstract_inverted_index":{"We":[0,24,85,130,171],"describe":[1,160],"our":[2,116,132,161,177],"early":[3],"efforts":[4],"to":[5,12,18,58,73,94,125,150,179,187,197],"red":[6,35,95,120,169,198],"team":[7,96,121,199],"language":[8,52,149,200],"models":[9,90],"in":[10,185],"order":[11,186],"simultaneously":[13],"discover,":[14],"measure,":[15],"and":[16,43,46,62,69,76,100,127,138,166,192],"attempt":[17],"reduce":[19],"their":[20],"potentially":[21],"harmful":[22,143,153],"outputs.":[23,156],"make":[25],"three":[26],"main":[27],"contributions.":[28],"First,":[29],"we":[30,101,114,158],"investigate":[31],"scaling":[32],"behaviors":[33],"for":[34,108,123,195],"teaming":[36],"across":[37],"3":[38],"model":[39,48,53,71,111],"sizes":[40],"(2.7B,":[41],"13B,":[42],"52B":[44],"parameters)":[45],"4":[47],"types:":[49],"a":[50,70,103,140,183],"plain":[51],"(LM);":[54],"an":[55,64],"LM":[56,65],"prompted":[57],"be":[59,74],"helpful,":[60],"honest,":[61],"harmless;":[63],"with":[66,106],"rejection":[67],"sampling;":[68],"trained":[72],"helpful":[75],"harmless":[77],"using":[78],"reinforcement":[79],"learning":[80],"from":[81,147],"human":[82],"feedback":[83],"(RLHF).":[84],"find":[86,102,139],"that":[87,173],"the":[88,109,136],"RLHF":[89],"are":[91],"increasingly":[92],"difficult":[93],"as":[97,182],"they":[98],"scale,":[99],"flat":[104],"trend":[105],"scale":[107],"other":[110],"types.":[112],"Second,":[113],"release":[115],"dataset":[117],"of":[118,135,142],"38,961":[119],"attacks":[122],"others":[124],"analyze":[126],"learn":[128],"from.":[129],"provide":[131],"own":[133],"analysis":[134],"data":[137],"variety":[141],"outputs,":[144],"which":[145],"range":[146],"offensive":[148],"more":[151],"subtly":[152],"non-violent":[154],"unethical":[155],"Third,":[157],"exhaustively":[159],"instructions,":[162],"processes,":[163],"statistical":[164],"methodologies,":[165],"uncertainty":[167],"about":[168],"teaming.":[170],"hope":[172],"this":[174],"transparency":[175],"accelerates":[176],"ability":[178],"work":[180],"together":[181],"community":[184],"develop":[188],"shared":[189],"norms,":[190],"practices,":[191],"technical":[193],"standards":[194],"how":[196],"models.":[201]},"counts_by_year":[{"year":2026,"cited_by_count":5},{"year":2025,"cited_by_count":18},{"year":2024,"cited_by_count":34},{"year":2023,"cited_by_count":41}],"updated_date":"2026-02-13T15:27:49.765798","created_date":"2022-09-20T00:00:00"}
