{"id":"https://openalex.org/W3199369541","doi":"https://doi.org/10.18653/v1/2021.findings-emnlp.210","title":"Challenges in Detoxifying Language Models","display_name":"Challenges in Detoxifying Language Models","publication_year":2021,"publication_date":"2021-01-01","ids":{"openalex":"https://openalex.org/W3199369541","doi":"https://doi.org/10.18653/v1/2021.findings-emnlp.210","mag":"3199369541"},"language":"en","primary_location":{"id":"doi:10.18653/v1/2021.findings-emnlp.210","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2021.findings-emnlp.210","pdf_url":"https://aclanthology.org/2021.findings-emnlp.210.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Findings of the Association for Computational Linguistics: EMNLP 2021","raw_type":"proceedings-article"},"type":"preprint","indexed_in":["arxiv","crossref","datacite"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://aclanthology.org/2021.findings-emnlp.210.pdf","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5067022550","display_name":"Johannes Welbl","orcid":null},"institutions":[{"id":"https://openalex.org/I45129253","display_name":"University College London","ror":"https://ror.org/02jx3x895","country_code":"GB","type":"education","lineage":["https://openalex.org/I124357947","https://openalex.org/I45129253"]}],"countries":["GB"],"is_corresponding":true,"raw_author_name":"Johannes Welbl","raw_affiliation_strings":["University College London, London, United Kingdom"],"affiliations":[{"raw_affiliation_string":"University College London, London, United Kingdom","institution_ids":["https://openalex.org/I45129253"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5005117726","display_name":"Amelia Glaese","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Amelia Glaese","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5059226057","display_name":"Jonathan Uesato","orcid":null},"institutions":[{"id":"https://openalex.org/I1291425158","display_name":"Google (United States)","ror":"https://ror.org/00njsd438","country_code":"US","type":"company","lineage":["https://openalex.org/I1291425158","https://openalex.org/I4210128969"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Jonathan Uesato","raw_affiliation_strings":["Google (United States), Mountain View, United States"],"affiliations":[{"raw_affiliation_string":"Google (United States), Mountain View, United States","institution_ids":["https://openalex.org/I1291425158"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5049998479","display_name":"Sumanth Dathathri","orcid":null},"institutions":[{"id":"https://openalex.org/I122411786","display_name":"California Institute of Technology","ror":"https://ror.org/05dxps055","country_code":"US","type":"education","lineage":["https://openalex.org/I122411786"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Sumanth Dathathri","raw_affiliation_strings":["California Institute of Technology, Pasadena, United States"],"affiliations":[{"raw_affiliation_string":"California Institute of Technology, Pasadena, United States","institution_ids":["https://openalex.org/I122411786"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5111981498","display_name":"John Mellor","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"John Mellor","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5020758501","display_name":"Lisa Anne Hendricks","orcid":"https://orcid.org/0000-0001-9340-5143"},"institutions":[{"id":"https://openalex.org/I95457486","display_name":"University of California, Berkeley","ror":"https://ror.org/01an7q238","country_code":"US","type":"education","lineage":["https://openalex.org/I95457486"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Lisa Anne Hendricks","raw_affiliation_strings":["University of California, Berkeley, Berkeley, United States"],"affiliations":[{"raw_affiliation_string":"University of California, Berkeley, Berkeley, United States","institution_ids":["https://openalex.org/I95457486"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5069590583","display_name":"Kirsty Anderson","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kirsty Anderson","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5013834379","display_name":"Pushmeet Kohli","orcid":"https://orcid.org/0000-0002-7466-7997"},"institutions":[{"id":"https://openalex.org/I1290206253","display_name":"Microsoft (United States)","ror":"https://ror.org/00d0nc645","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Pushmeet Kohli","raw_affiliation_strings":["Microsoft (United States), Redmond, United States"],"affiliations":[{"raw_affiliation_string":"Microsoft (United States), Redmond, United States","institution_ids":["https://openalex.org/I1290206253"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5089424256","display_name":"Ben Coppin","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ben Coppin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5076474156","display_name":"Po-Sen Huang","orcid":"https://orcid.org/0000-0003-1470-0991"},"institutions":[{"id":"https://openalex.org/I1291425158","display_name":"Google (United States)","ror":"https://ror.org/00njsd438","country_code":"US","type":"company","lineage":["https://openalex.org/I1291425158","https://openalex.org/I4210128969"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Po-Sen Huang","raw_affiliation_strings":["Google (United States), Mountain View, United States"],"affiliations":[{"raw_affiliation_string":"Google (United States), Mountain View, United States","institution_ids":["https://openalex.org/I1291425158"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":10,"corresponding_author_ids":["https://openalex.org/A5067022550"],"corresponding_institution_ids":["https://openalex.org/I45129253"],"apc_list":null,"apc_paid":null,"fwci":0.28220715,"has_fulltext":true,"cited_by_count":3,"citation_normalized_percentile":{"value":0.63487858,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":91,"max":95},"biblio":{"volume":null,"issue":null,"first_page":"2447","last_page":"2469"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12262","display_name":"Hate Speech and Cyberbullying Detection","score":0.9940999746322632,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12262","display_name":"Hate Speech and Cyberbullying Detection","score":0.9940999746322632,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9843000173568726,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.975600004196167,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.685120701789856},{"id":"https://openalex.org/keywords/quality","display_name":"Quality (philosophy)","score":0.5215492248535156},{"id":"https://openalex.org/keywords/work","display_name":"Work (physics)","score":0.44547465443611145},{"id":"https://openalex.org/keywords/toxicity","display_name":"Toxicity","score":0.4172626733779907},{"id":"https://openalex.org/keywords/risk-analysis","display_name":"Risk analysis (engineering)","score":0.38280007243156433},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.33282142877578735},{"id":"https://openalex.org/keywords/engineering","display_name":"Engineering","score":0.11674511432647705},{"id":"https://openalex.org/keywords/business","display_name":"Business","score":0.08991670608520508},{"id":"https://openalex.org/keywords/chemistry","display_name":"Chemistry","score":0.07546713948249817}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.685120701789856},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.5215492248535156},{"id":"https://openalex.org/C18762648","wikidata":"https://www.wikidata.org/wiki/Q42213","display_name":"Work (physics)","level":2,"score":0.44547465443611145},{"id":"https://openalex.org/C29730261","wikidata":"https://www.wikidata.org/wiki/Q274160","display_name":"Toxicity","level":2,"score":0.4172626733779907},{"id":"https://openalex.org/C112930515","wikidata":"https://www.wikidata.org/wiki/Q4389547","display_name":"Risk analysis (engineering)","level":1,"score":0.38280007243156433},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.33282142877578735},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.11674511432647705},{"id":"https://openalex.org/C144133560","wikidata":"https://www.wikidata.org/wiki/Q4830453","display_name":"Business","level":0,"score":0.08991670608520508},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.07546713948249817},{"id":"https://openalex.org/C178790620","wikidata":"https://www.wikidata.org/wiki/Q11351","display_name":"Organic chemistry","level":1,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C111472728","wikidata":"https://www.wikidata.org/wiki/Q9471","display_name":"Epistemology","level":1,"score":0.0},{"id":"https://openalex.org/C78519656","wikidata":"https://www.wikidata.org/wiki/Q101333","display_name":"Mechanical engineering","level":1,"score":0.0}],"mesh":[],"locations_count":4,"locations":[{"id":"doi:10.18653/v1/2021.findings-emnlp.210","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2021.findings-emnlp.210","pdf_url":"https://aclanthology.org/2021.findings-emnlp.210.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Findings of the Association for Computational Linguistics: EMNLP 2021","raw_type":"proceedings-article"},{"id":"pmh:oai:arXiv.org:2109.07445","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2109.07445","pdf_url":"https://arxiv.org/pdf/2109.07445","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},{"id":"mag:3199369541","is_oa":true,"landing_page_url":"https://arxiv.org/pdf/2109.07445.pdf","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"arXiv (Cornell University)","raw_type":null},{"id":"doi:10.48550/arxiv.2109.07445","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2109.07445","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.18653/v1/2021.findings-emnlp.210","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2021.findings-emnlp.210","pdf_url":"https://aclanthology.org/2021.findings-emnlp.210.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Findings of the Association for Computational Linguistics: EMNLP 2021","raw_type":"proceedings-article"},"sustainable_development_goals":[{"score":0.800000011920929,"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education"}],"awards":[],"funders":[],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W3199369541.pdf","grobid_xml":"https://content.openalex.org/works/W3199369541.grobid-xml"},"referenced_works_count":51,"referenced_works":["https://openalex.org/W78136081","https://openalex.org/W80056832","https://openalex.org/W1964164866","https://openalex.org/W2099057450","https://openalex.org/W2473344385","https://openalex.org/W2473555522","https://openalex.org/W2540646130","https://openalex.org/W2585712495","https://openalex.org/W2595653137","https://openalex.org/W2791170418","https://openalex.org/W2803517648","https://openalex.org/W2920807444","https://openalex.org/W2922580172","https://openalex.org/W2942370121","https://openalex.org/W2948223045","https://openalex.org/W2949678053","https://openalex.org/W2962937198","https://openalex.org/W2962990575","https://openalex.org/W2963206148","https://openalex.org/W2963250244","https://openalex.org/W2963341956","https://openalex.org/W2963494889","https://openalex.org/W2964110616","https://openalex.org/W2964121744","https://openalex.org/W2964235839","https://openalex.org/W2971307358","https://openalex.org/W2972735048","https://openalex.org/W2982756474","https://openalex.org/W2993398598","https://openalex.org/W2996287690","https://openalex.org/W3017311573","https://openalex.org/W3034238904","https://openalex.org/W3034937117","https://openalex.org/W3082274269","https://openalex.org/W3093233911","https://openalex.org/W3100355250","https://openalex.org/W3101767999","https://openalex.org/W3102924767","https://openalex.org/W3115772171","https://openalex.org/W3124120397","https://openalex.org/W3134354193","https://openalex.org/W3135734416","https://openalex.org/W3135773605","https://openalex.org/W3153490941","https://openalex.org/W3153611199","https://openalex.org/W3155742828","https://openalex.org/W3156216837","https://openalex.org/W3184144760","https://openalex.org/W3185376810","https://openalex.org/W3190860428","https://openalex.org/W3216852152"],"related_works":["https://openalex.org/W3212281239","https://openalex.org/W2992347006","https://openalex.org/W2950032996","https://openalex.org/W3201086525","https://openalex.org/W2981977471","https://openalex.org/W2947662160","https://openalex.org/W3155909748","https://openalex.org/W3107855336","https://openalex.org/W2890132602","https://openalex.org/W2072992969","https://openalex.org/W3016061304","https://openalex.org/W3178267322","https://openalex.org/W3156230701","https://openalex.org/W3188395949","https://openalex.org/W2990814745","https://openalex.org/W3092149645","https://openalex.org/W3136204722","https://openalex.org/W2992108934","https://openalex.org/W1775247540","https://openalex.org/W2785540181"],"abstract_inverted_index":{"Large":[0],"language":[1],"models":[2],"(LM)":[3],"generate":[4],"remarkably":[5],"fluent":[6],"text":[7,23],"and":[8,17,65,68,79,113],"can":[9,89],"be":[10],"efficiently":[11],"adapted":[12],"across":[13],"NLP":[14],"tasks.":[15],"Measuring":[16],"guaranteeing":[18],"the":[19,34,97,103,139],"quality":[20],"of":[21,26,47,71,76,105,145],"generated":[22],"in":[24,33,74,142],"terms":[25,75],"safety":[27],"is":[28],"imperative":[29],"for":[30,109],"deploying":[31],"LMs":[32],"real":[35],"world;":[36],"to":[37,62],"this":[38,53,100],"end,":[39],"prior":[40],"work":[41],"often":[42,124],"relies":[43],"on":[44,96],"automatic":[45,64,94,128],"evaluation":[46,144],"LM":[48,80,107,146],"toxicity.":[49,147],"We":[50,82],"critically":[51],"discuss":[52],"approach,":[54],"evaluate":[55],"several":[56],"toxicity":[57,72,129,133],"mitigation":[58,73],"strategies":[59,88],"with":[60,126],"respect":[61],"both":[63,110],"human":[66,122],"evaluation,":[67],"analyze":[69],"consequences":[70],"model":[77],"bias":[78],"quality.":[81],"demonstrate":[83],"that":[84,121],"while":[85],"basic":[86],"intervention":[87],"effectively":[90],"optimize":[91],"previously":[92],"established":[93],"metrics":[95],"RealToxicityPrompts":[98],"dataset,":[99],"comes":[101],"at":[102],"cost":[104],"reduced":[106],"coverage":[108],"texts":[111],"about,":[112],"dialects":[114],"of,":[115],"marginalized":[116],"groups.":[117],"Additionally,":[118],"we":[119],"find":[120],"raters":[123],"disagree":[125],"high":[127],"scores":[130],"after":[131],"strong":[132],"reduction":[134],"interventions":[135],"--":[136],"highlighting":[137],"further":[138],"nuances":[140],"involved":[141],"careful":[143]},"counts_by_year":[{"year":2025,"cited_by_count":1},{"year":2021,"cited_by_count":2}],"updated_date":"2026-02-09T09:26:11.010843","created_date":"2025-10-10T00:00:00"}
