{"id":"https://openalex.org/W4411039658","doi":"https://doi.org/10.1007/s10676-025-09837-2","title":"Helpful, harmless, honest? Sociotechnical limits of AI alignment and safety through Reinforcement Learning from Human Feedback","display_name":"Helpful, harmless, honest? Sociotechnical limits of AI alignment and safety through Reinforcement Learning from Human Feedback","publication_year":2025,"publication_date":"2025-06-01","ids":{"openalex":"https://openalex.org/W4411039658","doi":"https://doi.org/10.1007/s10676-025-09837-2","pmid":"https://pubmed.ncbi.nlm.nih.gov/40486676"},"language":"en","primary_location":{"id":"doi:10.1007/s10676-025-09837-2","is_oa":true,"landing_page_url":"https://doi.org/10.1007/s10676-025-09837-2","pdf_url":"https://link.springer.com/content/pdf/10.1007/s10676-025-09837-2.pdf","source":{"id":"https://openalex.org/S13096939","display_name":"Ethics and Information Technology","issn_l":"1388-1957","issn":["1388-1957","1572-8439"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319900","host_organization_name":"Springer Science+Business Media","host_organization_lineage":["https://openalex.org/P4310319900","https://openalex.org/P4310319965"],"host_organization_lineage_names":["Springer Science+Business Media","Springer Nature"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Ethics and Information Technology","raw_type":"journal-article"},"type":"article","indexed_in":["crossref","pubmed"],"open_access":{"is_oa":true,"oa_status":"hybrid","oa_url":"https://link.springer.com/content/pdf/10.1007/s10676-025-09837-2.pdf","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5038451945","display_name":"Adam Dahlgren Lindstr\u00f6m","orcid":"https://orcid.org/0000-0002-1112-2981"},"institutions":[{"id":"https://openalex.org/I90267481","display_name":"Ume\u00e5 University","ror":"https://ror.org/05kb8h459","country_code":"SE","type":"education","lineage":["https://openalex.org/I90267481"]}],"countries":["SE"],"is_corresponding":true,"raw_author_name":"Adam Dahlgren Lindstr\u00f6m","raw_affiliation_strings":["Department of Computing Science, Ume\u00e5 University, Ume\u00e5, 90187 Sweden","Department of Computing Science, Ume\u00e5 University, Ume\u00e5, 90187, Sweden"],"affiliations":[{"raw_affiliation_string":"Department of Computing Science, Ume\u00e5 University, Ume\u00e5, 90187 Sweden","institution_ids":[]},{"raw_affiliation_string":"Department of Computing Science, Ume\u00e5 University, Ume\u00e5, 90187, Sweden","institution_ids":["https://openalex.org/I90267481"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5064353168","display_name":"Leila Methnani","orcid":"https://orcid.org/0000-0002-9808-2037"},"institutions":[{"id":"https://openalex.org/I90267481","display_name":"Ume\u00e5 University","ror":"https://ror.org/05kb8h459","country_code":"SE","type":"education","lineage":["https://openalex.org/I90267481"]}],"countries":["SE"],"is_corresponding":false,"raw_author_name":"Leila Methnani","raw_affiliation_strings":["Department of Computing Science, Ume\u00e5 University, Ume\u00e5, 90187 Sweden","Department of Computing Science, Ume\u00e5 University, Ume\u00e5, 90187, Sweden"],"affiliations":[{"raw_affiliation_string":"Department of Computing Science, Ume\u00e5 University, Ume\u00e5, 90187 Sweden","institution_ids":[]},{"raw_affiliation_string":"Department of Computing Science, Ume\u00e5 University, Ume\u00e5, 90187, Sweden","institution_ids":["https://openalex.org/I90267481"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5063696941","display_name":"Lea Krause","orcid":null},"institutions":[{"id":"https://openalex.org/I865915315","display_name":"Vrije Universiteit Amsterdam","ror":"https://ror.org/008xxew50","country_code":"NL","type":"education","lineage":["https://openalex.org/I865915315"]}],"countries":["NL"],"is_corresponding":false,"raw_author_name":"Lea Krause","raw_affiliation_strings":["Department of Computing Science, Vrije Universiteit Amsterdam, Amsterdam, 1081 De Boelelaan 1105, Netherlands","Department of Computing Science, Vrije Universiteit Amsterdam, Amsterdam, 1081, De Boelelaan 1105, Netherlands"],"affiliations":[{"raw_affiliation_string":"Department of Computing Science, Vrije Universiteit Amsterdam, Amsterdam, 1081 De Boelelaan 1105, Netherlands","institution_ids":[]},{"raw_affiliation_string":"Department of Computing Science, Vrije Universiteit Amsterdam, Amsterdam, 1081, De Boelelaan 1105, Netherlands","institution_ids":["https://openalex.org/I865915315"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5006944889","display_name":"Petter Ericson","orcid":"https://orcid.org/0000-0002-8722-5661"},"institutions":[{"id":"https://openalex.org/I90267481","display_name":"Ume\u00e5 University","ror":"https://ror.org/05kb8h459","country_code":"SE","type":"education","lineage":["https://openalex.org/I90267481"]}],"countries":["SE"],"is_corresponding":false,"raw_author_name":"Petter Ericson","raw_affiliation_strings":["Department of Computing Science, Ume\u00e5 University, Ume\u00e5, 90187 Sweden","Department of Computing Science, Ume\u00e5 University, Ume\u00e5, 90187, Sweden"],"affiliations":[{"raw_affiliation_string":"Department of Computing Science, Ume\u00e5 University, Ume\u00e5, 90187 Sweden","institution_ids":[]},{"raw_affiliation_string":"Department of Computing Science, Ume\u00e5 University, Ume\u00e5, 90187, Sweden","institution_ids":["https://openalex.org/I90267481"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5033809033","display_name":"\u00cd\u00f1igo Mart\u00ednez de Rituerto de Troya","orcid":"https://orcid.org/0000-0003-4054-7943"},"institutions":[{"id":"https://openalex.org/I98358874","display_name":"Delft University of Technology","ror":"https://ror.org/02e2c7k09","country_code":"NL","type":"education","lineage":["https://openalex.org/I98358874"]}],"countries":["NL"],"is_corresponding":false,"raw_author_name":"\u00cd\u00f1igo Mart\u00ednez de Rituerto de Troya","raw_affiliation_strings":["Department of Engineering, Systems and Services, Delft University of Technology, Delft, 2600 Netherlands","Department of Engineering, Systems and Services, Delft University of Technology, Delft, 2600, Netherlands"],"affiliations":[{"raw_affiliation_string":"Department of Engineering, Systems and Services, Delft University of Technology, Delft, 2600 Netherlands","institution_ids":[]},{"raw_affiliation_string":"Department of Engineering, Systems and Services, Delft University of Technology, Delft, 2600, Netherlands","institution_ids":["https://openalex.org/I98358874"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5028262474","display_name":"Dimitri Coelho Mollo","orcid":"https://orcid.org/0000-0002-0464-3535"},"institutions":[{"id":"https://openalex.org/I90267481","display_name":"Ume\u00e5 University","ror":"https://ror.org/05kb8h459","country_code":"SE","type":"education","lineage":["https://openalex.org/I90267481"]}],"countries":["SE"],"is_corresponding":false,"raw_author_name":"Dimitri Coelho Mollo","raw_affiliation_strings":["Department of Historical, Philosophical, and Religious Studies, Ume\u00e5 University, Ume\u00e5, 90187 Sweden","Department of Historical, Philosophical, and Religious Studies, Ume\u00e5 University, Ume\u00e5, 90187, Sweden"],"affiliations":[{"raw_affiliation_string":"Department of Historical, Philosophical, and Religious Studies, Ume\u00e5 University, Ume\u00e5, 90187 Sweden","institution_ids":[]},{"raw_affiliation_string":"Department of Historical, Philosophical, and Religious Studies, Ume\u00e5 University, Ume\u00e5, 90187, Sweden","institution_ids":["https://openalex.org/I90267481"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5070294446","display_name":"Roel Dobbe","orcid":"https://orcid.org/0000-0003-4633-7023"},"institutions":[{"id":"https://openalex.org/I98358874","display_name":"Delft University of Technology","ror":"https://ror.org/02e2c7k09","country_code":"NL","type":"education","lineage":["https://openalex.org/I98358874"]}],"countries":["NL"],"is_corresponding":false,"raw_author_name":"Roel Dobbe","raw_affiliation_strings":["Department of Engineering, Systems and Services, Delft University of Technology, Delft, 2600 Netherlands","Department of Engineering, Systems and Services, Delft University of Technology, Delft, 2600, Netherlands"],"affiliations":[{"raw_affiliation_string":"Department of Engineering, Systems and Services, Delft University of Technology, Delft, 2600 Netherlands","institution_ids":[]},{"raw_affiliation_string":"Department of Engineering, Systems and Services, Delft University of Technology, Delft, 2600, Netherlands","institution_ids":["https://openalex.org/I98358874"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5038451945"],"corresponding_institution_ids":["https://openalex.org/I90267481"],"apc_list":{"value":2290,"currency":"EUR","value_usd":2890},"apc_paid":{"value":2290,"currency":"EUR","value_usd":2890},"fwci":38.9533,"has_fulltext":true,"cited_by_count":19,"citation_normalized_percentile":{"value":0.99779551,"is_in_top_1_percent":true,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":99,"max":100},"biblio":{"volume":"27","issue":"2","first_page":"28","last_page":"28"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10883","display_name":"Ethics and Social Impacts of AI","score":0.9972000122070312,"subfield":{"id":"https://openalex.org/subfields/3311","display_name":"Safety Research"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T10883","display_name":"Ethics and Social Impacts of AI","score":0.9972000122070312,"subfield":{"id":"https://openalex.org/subfields/3311","display_name":"Safety Research"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T11636","display_name":"Artificial Intelligence in Healthcare and Education","score":0.9021000266075134,"subfield":{"id":"https://openalex.org/subfields/2718","display_name":"Health Informatics"},"field":{"id":"https://openalex.org/fields/27","display_name":"Medicine"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/sociotechnical-system","display_name":"Sociotechnical system","score":0.9173941612243652},{"id":"https://openalex.org/keywords/interpretability","display_name":"Interpretability","score":0.5625353455543518},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.5428831577301025},{"id":"https://openalex.org/keywords/context","display_name":"Context (archaeology)","score":0.4973337948322296},{"id":"https://openalex.org/keywords/reinforcement-learning","display_name":"Reinforcement learning","score":0.48450157046318054},{"id":"https://openalex.org/keywords/normative","display_name":"Normative","score":0.4403054416179657},{"id":"https://openalex.org/keywords/implementation","display_name":"Implementation","score":0.42733579874038696},{"id":"https://openalex.org/keywords/deception","display_name":"Deception","score":0.42104077339172363},{"id":"https://openalex.org/keywords/engineering-ethics","display_name":"Engineering ethics","score":0.36057281494140625},{"id":"https://openalex.org/keywords/knowledge-management","display_name":"Knowledge management","score":0.3481558561325073},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.3422490060329437},{"id":"https://openalex.org/keywords/management-science","display_name":"Management science","score":0.3396932780742645},{"id":"https://openalex.org/keywords/psychology","display_name":"Psychology","score":0.2309558391571045},{"id":"https://openalex.org/keywords/social-psychology","display_name":"Social psychology","score":0.17242804169654846},{"id":"https://openalex.org/keywords/engineering","display_name":"Engineering","score":0.16111546754837036},{"id":"https://openalex.org/keywords/law","display_name":"Law","score":0.10934367775917053},{"id":"https://openalex.org/keywords/political-science","display_name":"Political science","score":0.10357663035392761},{"id":"https://openalex.org/keywords/software-engineering","display_name":"Software engineering","score":0.10151869058609009}],"concepts":[{"id":"https://openalex.org/C127627568","wikidata":"https://www.wikidata.org/wiki/Q1639361","display_name":"Sociotechnical system","level":2,"score":0.9173941612243652},{"id":"https://openalex.org/C2781067378","wikidata":"https://www.wikidata.org/wiki/Q17027399","display_name":"Interpretability","level":2,"score":0.5625353455543518},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5428831577301025},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.4973337948322296},{"id":"https://openalex.org/C97541855","wikidata":"https://www.wikidata.org/wiki/Q830687","display_name":"Reinforcement learning","level":2,"score":0.48450157046318054},{"id":"https://openalex.org/C44725695","wikidata":"https://www.wikidata.org/wiki/Q288156","display_name":"Normative","level":2,"score":0.4403054416179657},{"id":"https://openalex.org/C26713055","wikidata":"https://www.wikidata.org/wiki/Q245962","display_name":"Implementation","level":2,"score":0.42733579874038696},{"id":"https://openalex.org/C2779267917","wikidata":"https://www.wikidata.org/wiki/Q170028","display_name":"Deception","level":2,"score":0.42104077339172363},{"id":"https://openalex.org/C55587333","wikidata":"https://www.wikidata.org/wiki/Q1133029","display_name":"Engineering ethics","level":1,"score":0.36057281494140625},{"id":"https://openalex.org/C56739046","wikidata":"https://www.wikidata.org/wiki/Q192060","display_name":"Knowledge management","level":1,"score":0.3481558561325073},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3422490060329437},{"id":"https://openalex.org/C539667460","wikidata":"https://www.wikidata.org/wiki/Q2414942","display_name":"Management science","level":1,"score":0.3396932780742645},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.2309558391571045},{"id":"https://openalex.org/C77805123","wikidata":"https://www.wikidata.org/wiki/Q161272","display_name":"Social psychology","level":1,"score":0.17242804169654846},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.16111546754837036},{"id":"https://openalex.org/C199539241","wikidata":"https://www.wikidata.org/wiki/Q7748","display_name":"Law","level":1,"score":0.10934367775917053},{"id":"https://openalex.org/C17744445","wikidata":"https://www.wikidata.org/wiki/Q36442","display_name":"Political science","level":0,"score":0.10357663035392761},{"id":"https://openalex.org/C115903868","wikidata":"https://www.wikidata.org/wiki/Q80993","display_name":"Software engineering","level":1,"score":0.10151869058609009},{"id":"https://openalex.org/C86803240","wikidata":"https://www.wikidata.org/wiki/Q420","display_name":"Biology","level":0,"score":0.0},{"id":"https://openalex.org/C151730666","wikidata":"https://www.wikidata.org/wiki/Q7205","display_name":"Paleontology","level":1,"score":0.0}],"mesh":[],"locations_count":5,"locations":[{"id":"doi:10.1007/s10676-025-09837-2","is_oa":true,"landing_page_url":"https://doi.org/10.1007/s10676-025-09837-2","pdf_url":"https://link.springer.com/content/pdf/10.1007/s10676-025-09837-2.pdf","source":{"id":"https://openalex.org/S13096939","display_name":"Ethics and Information Technology","issn_l":"1388-1957","issn":["1388-1957","1572-8439"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319900","host_organization_name":"Springer Science+Business Media","host_organization_lineage":["https://openalex.org/P4310319900","https://openalex.org/P4310319965"],"host_organization_lineage_names":["Springer Science+Business Media","Springer Nature"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Ethics and Information Technology","raw_type":"journal-article"},{"id":"pmid:40486676","is_oa":false,"landing_page_url":"https://pubmed.ncbi.nlm.nih.gov/40486676","pdf_url":null,"source":{"id":"https://openalex.org/S4306525036","display_name":"PubMed","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I1299303238","host_organization_name":"National Institutes of Health","host_organization_lineage":["https://openalex.org/I1299303238"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Ethics and information technology","raw_type":null},{"id":"pmh:oai:research.vu.nl:openaire/9abfb3ba-f0d2-4d79-bded-3c14f9c512a3","is_oa":true,"landing_page_url":"https://research.vu.nl/en/publications/9abfb3ba-f0d2-4d79-bded-3c14f9c512a3","pdf_url":null,"source":{"id":"https://openalex.org/S4306401107","display_name":"VU Research Portal","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I865915315","host_organization_name":"Vrije Universiteit Amsterdam","host_organization_lineage":["https://openalex.org/I865915315"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Dahlgren Lindstr\u00f6m, A, Methnani, L, Krause, L, Ericson, P, de Rituerto de Troya, \u00cd M, Coelho Mollo, D & Dobbe, R 2025, 'Helpful, harmless, honest? Sociotechnical limits of AI alignment and safety through Reinforcement Learning from Human Feedback', Ethics and Information Technology, vol. 27, no. 2, 28, pp. 1-13. https://doi.org/10.1007/s10676-025-09837-2","raw_type":"info:eu-repo/semantics/publishedVersion"},{"id":"pmh:oai:pubmedcentral.nih.gov:12137480","is_oa":true,"landing_page_url":"https://www.ncbi.nlm.nih.gov/pmc/articles/12137480","pdf_url":"https://pmc.ncbi.nlm.nih.gov/articles/PMC12137480/pdf/10676_2025_Article_9837.pdf","source":{"id":"https://openalex.org/S2764455111","display_name":"PubMed Central","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I1299303238","host_organization_name":"National Institutes of Health","host_organization_lineage":["https://openalex.org/I1299303238"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"Ethics Inf Technol","raw_type":"Text"},{"id":"pmh:oai:DiVA.org:umu-239637","is_oa":true,"landing_page_url":"http://urn.kb.se/resolve?urn=urn:nbn:se:umu:diva-239637","pdf_url":null,"source":{"id":"https://openalex.org/S4306400361","display_name":"DiVA at Ume\u00e5 University (Ume\u00e5 University)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I90267481","host_organization_name":"Ume\u00e5 University","host_organization_lineage":["https://openalex.org/I90267481"],"host_organization_lineage_names":[],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"}],"best_oa_location":{"id":"doi:10.1007/s10676-025-09837-2","is_oa":true,"landing_page_url":"https://doi.org/10.1007/s10676-025-09837-2","pdf_url":"https://link.springer.com/content/pdf/10.1007/s10676-025-09837-2.pdf","source":{"id":"https://openalex.org/S13096939","display_name":"Ethics and Information Technology","issn_l":"1388-1957","issn":["1388-1957","1572-8439"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319900","host_organization_name":"Springer Science+Business Media","host_organization_lineage":["https://openalex.org/P4310319900","https://openalex.org/P4310319965"],"host_organization_lineage_names":["Springer Science+Business Media","Springer Nature"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Ethics and Information Technology","raw_type":"journal-article"},"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G2124936668","display_name":null,"funder_award_id":"101120237","funder_id":"https://openalex.org/F4320320300","funder_display_name":"European Commission"},{"id":"https://openalex.org/G5550786340","display_name":null,"funder_award_id":"024.004.022","funder_id":"https://openalex.org/F4320321800","funder_display_name":"Nederlandse Organisatie voor Wetenschappelijk Onderzoek"}],"funders":[{"id":"https://openalex.org/F4320320300","display_name":"European Commission","ror":"https://ror.org/00k4n6c32"},{"id":"https://openalex.org/F4320321800","display_name":"Nederlandse Organisatie voor Wetenschappelijk Onderzoek","ror":"https://ror.org/04jsz6e67"}],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4411039658.pdf","grobid_xml":"https://content.openalex.org/works/W4411039658.grobid-xml"},"referenced_works_count":54,"referenced_works":["https://openalex.org/W1994833890","https://openalex.org/W2044008035","https://openalex.org/W2264742718","https://openalex.org/W2626804490","https://openalex.org/W2896457183","https://openalex.org/W2948947170","https://openalex.org/W2957654274","https://openalex.org/W2990138404","https://openalex.org/W3013207429","https://openalex.org/W3035032094","https://openalex.org/W3040416015","https://openalex.org/W3040472729","https://openalex.org/W3133702157","https://openalex.org/W3144277077","https://openalex.org/W3181893318","https://openalex.org/W4226278401","https://openalex.org/W4245587674","https://openalex.org/W4245987285","https://openalex.org/W4283157705","https://openalex.org/W4285155368","https://openalex.org/W4296638419","https://openalex.org/W4307697543","https://openalex.org/W4312050653","https://openalex.org/W4375958700","https://openalex.org/W4378044967","https://openalex.org/W4378771755","https://openalex.org/W4380354660","https://openalex.org/W4383665181","https://openalex.org/W4383752090","https://openalex.org/W4383993628","https://openalex.org/W4385774833","https://openalex.org/W4386242333","https://openalex.org/W4386249234","https://openalex.org/W4386981810","https://openalex.org/W4387389810","https://openalex.org/W4387892136","https://openalex.org/W4388481711","https://openalex.org/W4389523709","https://openalex.org/W4390619488","https://openalex.org/W4391282676","https://openalex.org/W4393160410","https://openalex.org/W4395025834","https://openalex.org/W4396796749","https://openalex.org/W4397002468","https://openalex.org/W4401768995","https://openalex.org/W4404342287","https://openalex.org/W4405473235","https://openalex.org/W4406482940","https://openalex.org/W4406975817","https://openalex.org/W6838542245","https://openalex.org/W6860760705","https://openalex.org/W6871015819","https://openalex.org/W6928748004","https://openalex.org/W7036906873"],"related_works":["https://openalex.org/W2905433371","https://openalex.org/W1980714815","https://openalex.org/W3133630643","https://openalex.org/W608490485","https://openalex.org/W2888392564","https://openalex.org/W2481729736","https://openalex.org/W4389636114","https://openalex.org/W4310278675","https://openalex.org/W2236208621","https://openalex.org/W2967475239"],"abstract_inverted_index":{"This":[0],"paper":[1],"critically":[2],"evaluates":[3],"the":[4,40,43,61,78,93,100,126,167,180],"attempts":[5],"to":[6,76,85,115,179],"align":[7],"Artificial":[8],"Intelligence":[9],"(AI)":[10],"systems,":[11,164],"especially":[12],"Large":[13],"Language":[14],"Models":[15],"(LLMs),":[16],"with":[17],"human":[18,30,81],"values":[19],"and":[20,51,64,83,105,122,130,133,135,146,162,165,182],"intentions":[21],"through":[22],"Reinforcement":[23],"Learning":[24],"from":[25],"Feedback":[26],"methods,":[27],"involving":[28],"either":[29],"feedback":[31,35],"(RLHF)":[32],"or":[33],"AI":[34,86,144,170],"(RLAIF).":[36],"Specifically,":[37],"we":[38,58,109],"show":[39],"shortcomings":[41],"of":[42,48,67,80,95,156,169,185],"broadly":[44],"pursued":[45],"alignment":[46,121],"goals":[47,94],"honesty,":[49],"harmlessness,":[50],"helpfulness.":[52],"Through":[53],"a":[54,153,173],"multidisciplinary":[55],"sociotechnical":[56,174],"critique,":[57],"examine":[59],"both":[60],"theoretical":[62],"underpinnings":[63],"practical":[65],"implementations":[66],"RLHF":[68,150],"techniques,":[69],"revealing":[70],"significant":[71],"limitations":[72],"in":[73,92,99,118],"their":[74],"approach":[75],"capturing":[77],"complexities":[79],"ethics,":[82],"contributing":[84],"safety.":[87,137],"We":[88,138],"highlight":[89],"tensions":[90],"inherent":[91],"RLHF,":[96,123],"as":[97,172],"captured":[98],"HHH":[101],"principle":[102],"(helpful,":[103],"harmless":[104],"honest).":[106],"In":[107],"addition,":[108],"discuss":[110],"ethically-relevant":[111],"issues":[112],"that":[113,176],"tend":[114],"be":[116],"neglected":[117],"discussions":[119],"about":[120],"among":[124],"which":[125,148],"trade-offs":[127],"between":[128],"user-friendliness":[129],"deception,":[131],"flexibility":[132],"interpretability,":[134],"system":[136],"offer":[139],"an":[140],"alternative":[141],"vision":[142],"for":[143],"safety":[145,171],"ethics":[147],"positions":[149],"approaches":[151],"within":[152],"broader":[154],"context":[155],"comprehensive":[157],"design":[158],"across":[159],"institutions,":[160],"processes":[161],"technological":[163],"suggest":[166],"establishment":[168],"discipline":[175],"is":[177],"open":[178],"normative":[181],"political":[183],"dimensions":[184],"artificial":[186],"intelligence.":[187]},"counts_by_year":[{"year":2026,"cited_by_count":12},{"year":2025,"cited_by_count":7}],"updated_date":"2026-04-23T09:07:50.710637","created_date":"2025-10-10T00:00:00"}
