{"id":"https://openalex.org/W4387928541","doi":"https://doi.org/10.48550/arxiv.2310.13798","title":"Specific versus General Principles for Constitutional AI","display_name":"Specific versus General Principles for Constitutional AI","publication_year":2023,"publication_date":"2023-10-20","ids":{"openalex":"https://openalex.org/W4387928541","doi":"https://doi.org/10.48550/arxiv.2310.13798"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2310.13798","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2310.13798","pdf_url":"https://arxiv.org/pdf/2310.13798","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2310.13798","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5010007563","display_name":"Sandipan Kundu","orcid":"https://orcid.org/0000-0002-0754-0072"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Kundu, Sandipan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5091860006","display_name":"Yuntao Bai","orcid":"https://orcid.org/0000-0003-3998-7837"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bai, Yuntao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5061506488","display_name":"Saurav Kadavath","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kadavath, Saurav","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5030305998","display_name":"Amanda Askell","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Askell, Amanda","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5031192921","display_name":"A. Callahan","orcid":"https://orcid.org/0009-0009-8752-4221"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Callahan, Andrew","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5056436767","display_name":"Anna Chen","orcid":"https://orcid.org/0000-0002-5946-6572"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Anna","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5088885573","display_name":"Anna Goldie","orcid":"https://orcid.org/0000-0002-4887-6293"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Goldie, Anna","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5093123090","display_name":"Avital Balwit","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Balwit, Avital","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5070731184","display_name":"Azalia Mirhoseini","orcid":"https://orcid.org/0000-0002-2440-0944"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Mirhoseini, Azalia","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5113030770","display_name":"B. T. McLean","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"McLean, Brayden","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5108225540","display_name":"Catherine Olsson","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Olsson, Catherine","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5093123089","display_name":"Cassie Evraets","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Evraets, Cassie","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5072334146","display_name":"Eli Tran-Johnson","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tran-Johnson, Eli","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5001994692","display_name":"Esin Durmus","orcid":"https://orcid.org/0009-0009-7331-8160"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Durmus, Esin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5091112967","display_name":"Ethan Perez","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Perez, Ethan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5057116059","display_name":"Jackson Kernion","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kernion, Jackson","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5081035645","display_name":"Jamie Kerr","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kerr, Jamie","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5028970835","display_name":"Kamal Ndousse","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ndousse, Kamal","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5032805177","display_name":"Karina Nguyen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Nguyen, Karina","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5020683620","display_name":"Nelson Elhage","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Elhage, Nelson","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5058959806","display_name":"Newton Cheng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cheng, Newton","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5050348824","display_name":"Nicholas Schiefer","orcid":"https://orcid.org/0000-0002-3065-6399"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Schiefer, Nicholas","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5030783855","display_name":"Nova DasSarma","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"DasSarma, Nova","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5053939329","display_name":"Oliver Rausch","orcid":"https://orcid.org/0000-0003-4074-3848"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Rausch, Oliver","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5051064337","display_name":"Robin J. Larson","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Larson, Robin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5037725211","display_name":"Shannon Yang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Shannon","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5009112681","display_name":"Shauna Kravec","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kravec, Shauna","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5001194068","display_name":"Timothy Telleen-Lawton","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Telleen-Lawton, Timothy","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5058639265","display_name":"Thomas I. Liao","orcid":"https://orcid.org/0009-0002-3936-1391"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liao, Thomas I.","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5049786610","display_name":"Tom Henighan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Henighan, Tom","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5079539910","display_name":"Tristan Hume","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hume, Tristan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5070402589","display_name":"Zac Hatfield-Dodds","orcid":"https://orcid.org/0000-0002-8646-8362"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hatfield-Dodds, Zac","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5012960540","display_name":"S\u00f6ren Mindermann","orcid":"https://orcid.org/0000-0002-0315-9821"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Mindermann, S\u00f6ren","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5032088236","display_name":"Nicholas Joseph","orcid":"https://orcid.org/0000-0002-1972-0783"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Joseph, Nicholas","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5054887773","display_name":"Sam McCandlish","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"McCandlish, Sam","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5053213601","display_name":"Jared Kaplan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kaplan, Jared","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":36,"corresponding_author_ids":["https://openalex.org/A5010007563"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":true,"cited_by_count":7,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10883","display_name":"Ethics and Social Impacts of AI","score":0.9854000210762024,"subfield":{"id":"https://openalex.org/subfields/3311","display_name":"Safety Research"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T10883","display_name":"Ethics and Social Impacts of AI","score":0.9854000210762024,"subfield":{"id":"https://openalex.org/subfields/3311","display_name":"Safety Research"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T12026","display_name":"Explainable Artificial Intelligence (XAI)","score":0.9142000079154968,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6000856161117554},{"id":"https://openalex.org/keywords/simple","display_name":"Simple (philosophy)","score":0.5781172513961792},{"id":"https://openalex.org/keywords/humanity","display_name":"Humanity","score":0.5593786835670471},{"id":"https://openalex.org/keywords/constitution","display_name":"Constitution","score":0.5578398704528809},{"id":"https://openalex.org/keywords/value","display_name":"Value (mathematics)","score":0.5419313907623291},{"id":"https://openalex.org/keywords/control","display_name":"Control (management)","score":0.47518861293792725},{"id":"https://openalex.org/keywords/power","display_name":"Power (physics)","score":0.4392608106136322},{"id":"https://openalex.org/keywords/expression","display_name":"Expression (computer science)","score":0.42889201641082764},{"id":"https://openalex.org/keywords/law-and-economics","display_name":"Law and economics","score":0.3559364676475525},{"id":"https://openalex.org/keywords/psychology","display_name":"Psychology","score":0.3422447443008423},{"id":"https://openalex.org/keywords/epistemology","display_name":"Epistemology","score":0.3057098984718323},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.27992749214172363},{"id":"https://openalex.org/keywords/law","display_name":"Law","score":0.23886993527412415},{"id":"https://openalex.org/keywords/sociology","display_name":"Sociology","score":0.20337402820587158},{"id":"https://openalex.org/keywords/political-science","display_name":"Political science","score":0.17915818095207214},{"id":"https://openalex.org/keywords/machine-learning","display_name":"Machine learning","score":0.0842808187007904},{"id":"https://openalex.org/keywords/philosophy","display_name":"Philosophy","score":0.06824475526809692}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6000856161117554},{"id":"https://openalex.org/C2780586882","wikidata":"https://www.wikidata.org/wiki/Q7520643","display_name":"Simple (philosophy)","level":2,"score":0.5781172513961792},{"id":"https://openalex.org/C2780422510","wikidata":"https://www.wikidata.org/wiki/Q17027938","display_name":"Humanity","level":2,"score":0.5593786835670471},{"id":"https://openalex.org/C2776154427","wikidata":"https://www.wikidata.org/wiki/Q7755","display_name":"Constitution","level":2,"score":0.5578398704528809},{"id":"https://openalex.org/C2776291640","wikidata":"https://www.wikidata.org/wiki/Q2912517","display_name":"Value (mathematics)","level":2,"score":0.5419313907623291},{"id":"https://openalex.org/C2775924081","wikidata":"https://www.wikidata.org/wiki/Q55608371","display_name":"Control (management)","level":2,"score":0.47518861293792725},{"id":"https://openalex.org/C163258240","wikidata":"https://www.wikidata.org/wiki/Q25342","display_name":"Power (physics)","level":2,"score":0.4392608106136322},{"id":"https://openalex.org/C90559484","wikidata":"https://www.wikidata.org/wiki/Q778379","display_name":"Expression (computer science)","level":2,"score":0.42889201641082764},{"id":"https://openalex.org/C190253527","wikidata":"https://www.wikidata.org/wiki/Q295354","display_name":"Law and economics","level":1,"score":0.3559364676475525},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.3422447443008423},{"id":"https://openalex.org/C111472728","wikidata":"https://www.wikidata.org/wiki/Q9471","display_name":"Epistemology","level":1,"score":0.3057098984718323},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.27992749214172363},{"id":"https://openalex.org/C199539241","wikidata":"https://www.wikidata.org/wiki/Q7748","display_name":"Law","level":1,"score":0.23886993527412415},{"id":"https://openalex.org/C144024400","wikidata":"https://www.wikidata.org/wiki/Q21201","display_name":"Sociology","level":0,"score":0.20337402820587158},{"id":"https://openalex.org/C17744445","wikidata":"https://www.wikidata.org/wiki/Q36442","display_name":"Political science","level":0,"score":0.17915818095207214},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.0842808187007904},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.06824475526809692},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2310.13798","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2310.13798","pdf_url":"https://arxiv.org/pdf/2310.13798","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},{"id":"doi:10.48550/arxiv.2310.13798","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2310.13798","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2310.13798","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2310.13798","pdf_url":"https://arxiv.org/pdf/2310.13798","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"sustainable_development_goals":[{"score":0.7599999904632568,"display_name":"Peace, Justice and strong institutions","id":"https://metadata.un.org/sdg/16"}],"awards":[],"funders":[],"has_content":{"pdf":true,"grobid_xml":false},"content_urls":{"pdf":"https://content.openalex.org/works/W4387928541.pdf"},"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W2383552774","https://openalex.org/W2369613780","https://openalex.org/W2390120949","https://openalex.org/W2385858818","https://openalex.org/W2384237832","https://openalex.org/W2358247393","https://openalex.org/W2381199182","https://openalex.org/W2361406624","https://openalex.org/W2390750281","https://openalex.org/W2377568280"],"abstract_inverted_index":{"Human":[0],"feedback":[1,34,36],"can":[2,68,104],"prevent":[3],"overtly":[4],"harmful":[5,140],"utterances":[6],"in":[7,111,118],"conversational":[8],"models,":[9],"but":[10],"may":[11,126],"not":[12],"automatically":[13],"mitigate":[14],"subtle":[15],"problematic":[16],"behaviors":[17,73],"such":[18,57],"as":[19,91],"a":[20,43,76,87,133],"stated":[21,90,116],"desire":[22],"for":[23,95,132,164],"self-preservation":[24],"or":[25],"power.":[26,122],"Constitutional":[27],"AI":[28,38,166],"offers":[29],"an":[30],"alternative,":[31],"replacing":[32],"human":[33],"with":[35,114],"from":[37,74,106],"models":[39,69,103],"conditioned":[40],"only":[41,75],"on":[42],"list":[44,135],"of":[45,56,61,136,153],"written":[46,78],"principles.":[47],"We":[48,97],"find":[49,98],"this":[50,107],"approach":[51],"effectively":[52],"prevents":[53],"the":[54,100,130],"expression":[55],"behaviors.":[58,141],"The":[59],"success":[60],"simple":[62],"principles":[63,161],"motivates":[64],"us":[65],"to":[66],"ask:":[67],"learn":[70],"general":[71,124,158],"ethical":[72],"single":[77],"principle?":[79],"To":[80],"test":[81],"this,":[82],"we":[83],"run":[84],"experiments":[85],"using":[86],"principle":[88,125],"roughly":[89],"\"do":[92],"what's":[93],"best":[94],"humanity\".":[96],"that":[99],"largest":[101],"dialogue":[102],"generalize":[105],"short":[108],"constitution,":[109],"resulting":[110],"harmless":[112],"assistants":[113],"no":[115],"interest":[117],"specific":[119,151,160],"motivations":[120],"like":[121],"A":[123],"thus":[127],"partially":[128],"avoid":[129],"need":[131],"long":[134],"constitutions":[137,145],"targeting":[138],"potentially":[139],"However,":[142],"more":[143],"detailed":[144],"still":[146],"improve":[147],"fine-grained":[148],"control":[149],"over":[150],"types":[152],"harms.":[154],"This":[155],"suggests":[156],"both":[157],"and":[159],"have":[162],"value":[163],"steering":[165],"safely.":[167]},"counts_by_year":[{"year":2025,"cited_by_count":3},{"year":2024,"cited_by_count":3},{"year":2023,"cited_by_count":1}],"updated_date":"2026-03-11T14:59:36.786465","created_date":"2025-10-10T00:00:00"}
