{"id":"https://openalex.org/W7148324062","doi":"https://doi.org/10.48550/arxiv.2604.00021","title":"How Do Language Models Process Ethical Instructions? Deliberation, Consistency, and Other-Recognition Across Four Models","display_name":"How Do Language Models Process Ethical Instructions? Deliberation, Consistency, and Other-Recognition Across Four Models","publication_year":2026,"publication_date":"2026-03-11","ids":{"openalex":"https://openalex.org/W7148324062","doi":"https://doi.org/10.48550/arxiv.2604.00021"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.00021","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.00021","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.00021","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5108248436","display_name":"Hiroki Fukui","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Fukui, Hiroki","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":1,"corresponding_author_ids":["https://openalex.org/A5108248436"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10565","display_name":"Psychopathy, Forensic Psychiatry, Sexual Offending","score":0.4311000108718872,"subfield":{"id":"https://openalex.org/subfields/3203","display_name":"Clinical Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T10565","display_name":"Psychopathy, Forensic Psychiatry, Sexual Offending","score":0.4311000108718872,"subfield":{"id":"https://openalex.org/subfields/3203","display_name":"Clinical Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T12520","display_name":"Psychology of Moral and Emotional Judgment","score":0.07540000230073929,"subfield":{"id":"https://openalex.org/subfields/2805","display_name":"Cognitive Neuroscience"},"field":{"id":"https://openalex.org/fields/28","display_name":"Neuroscience"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},{"id":"https://openalex.org/T11678","display_name":"Healthcare Decision-Making and Restraints","score":0.06939999759197235,"subfield":{"id":"https://openalex.org/subfields/3203","display_name":"Clinical Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/deliberation","display_name":"Deliberation","score":0.49939998984336853},{"id":"https://openalex.org/keywords/consistency","display_name":"Consistency (knowledge bases)","score":0.40540000796318054},{"id":"https://openalex.org/keywords/framing","display_name":"Framing (construction)","score":0.39879998564720154},{"id":"https://openalex.org/keywords/virtue","display_name":"Virtue","score":0.33869999647140503},{"id":"https://openalex.org/keywords/rhetorical-question","display_name":"Rhetorical question","score":0.3305000066757202},{"id":"https://openalex.org/keywords/filter","display_name":"Filter (signal processing)","score":0.2969000041484833},{"id":"https://openalex.org/keywords/comprehension","display_name":"Comprehension","score":0.29440000653266907},{"id":"https://openalex.org/keywords/persuasion","display_name":"Persuasion","score":0.2937999963760376}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5494999885559082},{"id":"https://openalex.org/C2776946740","wikidata":"https://www.wikidata.org/wiki/Q358652","display_name":"Deliberation","level":3,"score":0.49939998984336853},{"id":"https://openalex.org/C2776436953","wikidata":"https://www.wikidata.org/wiki/Q5163215","display_name":"Consistency (knowledge bases)","level":2,"score":0.40540000796318054},{"id":"https://openalex.org/C169087156","wikidata":"https://www.wikidata.org/wiki/Q2131593","display_name":"Framing (construction)","level":2,"score":0.39879998564720154},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.3903999924659729},{"id":"https://openalex.org/C2777239683","wikidata":"https://www.wikidata.org/wiki/Q157811","display_name":"Virtue","level":2,"score":0.33869999647140503},{"id":"https://openalex.org/C192562157","wikidata":"https://www.wikidata.org/wiki/Q316694","display_name":"Rhetorical question","level":2,"score":0.3305000066757202},{"id":"https://openalex.org/C77805123","wikidata":"https://www.wikidata.org/wiki/Q161272","display_name":"Social psychology","level":1,"score":0.3278000056743622},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3142000138759613},{"id":"https://openalex.org/C180747234","wikidata":"https://www.wikidata.org/wiki/Q23373","display_name":"Cognitive psychology","level":1,"score":0.3125999867916107},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.30869999527931213},{"id":"https://openalex.org/C106131492","wikidata":"https://www.wikidata.org/wiki/Q3072260","display_name":"Filter (signal processing)","level":2,"score":0.2969000041484833},{"id":"https://openalex.org/C511192102","wikidata":"https://www.wikidata.org/wiki/Q5156948","display_name":"Comprehension","level":2,"score":0.29440000653266907},{"id":"https://openalex.org/C2781310500","wikidata":"https://www.wikidata.org/wiki/Q1231428","display_name":"Persuasion","level":2,"score":0.2937999963760376},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.29260000586509705},{"id":"https://openalex.org/C2482559","wikidata":"https://www.wikidata.org/wiki/Q206330","display_name":"Objectivity (philosophy)","level":2,"score":0.2892000079154968},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.2888000011444092},{"id":"https://openalex.org/C2777402642","wikidata":"https://www.wikidata.org/wiki/Q2557224","display_name":"Explanatory power","level":2,"score":0.2865999937057495},{"id":"https://openalex.org/C9354725","wikidata":"https://www.wikidata.org/wiki/Q286017","display_name":"Operationalization","level":2,"score":0.27950000762939453},{"id":"https://openalex.org/C2781067378","wikidata":"https://www.wikidata.org/wiki/Q17027399","display_name":"Interpretability","level":2,"score":0.2782000005245209},{"id":"https://openalex.org/C146303308","wikidata":"https://www.wikidata.org/wiki/Q425573","display_name":"Frame analysis","level":3,"score":0.2770000100135803},{"id":"https://openalex.org/C2781181686","wikidata":"https://www.wikidata.org/wiki/Q4226068","display_name":"Coherence (philosophical gambling strategy)","level":2,"score":0.27459999918937683},{"id":"https://openalex.org/C106934330","wikidata":"https://www.wikidata.org/wiki/Q1971873","display_name":"Trait","level":2,"score":0.2563999891281128},{"id":"https://openalex.org/C197947376","wikidata":"https://www.wikidata.org/wiki/Q5155608","display_name":"Comparability","level":2,"score":0.2531999945640564},{"id":"https://openalex.org/C49790547","wikidata":"https://www.wikidata.org/wiki/Q943817","display_name":"Levels-of-processing effect","level":3,"score":0.2531000077724457},{"id":"https://openalex.org/C2778023277","wikidata":"https://www.wikidata.org/wiki/Q321703","display_name":"Premise","level":2,"score":0.25290000438690186},{"id":"https://openalex.org/C26022165","wikidata":"https://www.wikidata.org/wiki/Q8091","display_name":"Grammar","level":2,"score":0.2515000104904175}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.00021","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.00021","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.00021","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.00021","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"score":0.7402723431587219,"id":"https://metadata.un.org/sdg/16","display_name":"Peace, Justice and strong institutions"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Alignment":[0],"safety":[1],"research":[2],"assumes":[3],"that":[4,206],"ethical":[5,38,107,178,210],"instructions":[6,17,179],"improve":[7],"model":[8],"behavior,":[9],"but":[10,73],"how":[11],"language":[12],"models":[13,28,79],"internally":[14],"process":[15],"such":[16],"remains":[18],"unknown.":[19],"We":[20],"conducted":[21],"over":[22],"600":[23],"multi-agent":[24],"simulations":[25],"across":[26],"four":[27,37,105],"(Llama":[29],"3.3":[30],"70B,":[31],"GPT-4o":[32],"mini,":[33],"Qwen3-Next-80B-A3B,":[34],"Sonnet":[35],"4.5),":[36],"instruction":[39,151,156],"formats":[40],"(none,":[41],"minimal":[42],"norm,":[43,45],"reasoned":[44,167],"virtue":[46,170],"framing),":[47],"and":[48,99,132,138,150,169,209],"two":[49],"languages":[50],"(Japanese,":[51],"English).":[52],"Confirmatory":[53],"analysis":[54],"fully":[55],"replicated":[56],"the":[57,76,188],"Llama":[58],"Japanese":[59],"dissociation":[60],"pattern":[61],"from":[62],"a":[63,235],"prior":[64],"study":[65],"($\\mathrm{BF}_{10}":[66],"&gt;":[67,198],"10$":[68],"for":[69],"all":[70,196],"three":[71,78],"hypotheses),":[72],"none":[74],"of":[75],"other":[77],"reproduced":[80],"this":[81],"pattern,":[82],"establishing":[83],"it":[84],"as":[85],"model-specific.":[86],"Three":[87],"new":[88],"metrics":[89],"--":[90,103],"Deliberation":[91],"Depth":[92],"(DD),":[93],"Value":[94],"Consistency":[95,134],"Across":[96],"Dilemmas":[97],"(VCAD),":[98],"Other-Recognition":[100],"Index":[101],"(ORI)":[102],"revealed":[104],"distinct":[106],"processing":[108,148,185,211,216,233],"types:":[109],"Output":[110],"Filter":[111],"(GPT;":[112],"safe":[113],"outputs,":[114],"no":[115,159],"processing),":[116],"Defensive":[117],"Repetition":[118],"(Llama;":[119],"high":[120],"consistency":[121],"through":[122],"formulaic":[123],"repetition),":[124],"Critical":[125],"Internalization":[126],"(Qwen;":[127],"deep":[128],"deliberation,":[129,136],"incomplete":[130],"integration),":[131],"Principled":[133],"(Sonnet;":[135],"consistency,":[137],"other-recognition":[139],"co-occurring).":[140],"The":[141],"central":[142],"finding":[143],"is":[144,234],"an":[145],"interaction":[146],"between":[147],"capacity":[149],"format:":[152],"in":[153,164,224],"low-DD":[154],"models,":[155,166],"format":[157],"has":[158],"effect":[160],"on":[161],"internal":[162,232],"processing;":[163],"high-DD":[165],"norms":[168],"framing":[171],"produce":[172],"opposite":[173],"effects.":[174],"Lexical":[175],"compliance":[176,230],"with":[177,183],"did":[180],"not":[181],"correlate":[182],"any":[184],"metric":[186],"at":[187],"cell":[189],"level":[190],"($r":[191],"=":[192,201],"-0.161$":[193],"to":[194,221],"$+0.256$,":[195],"$p":[197],".22$;":[199],"$N":[200],"24$;":[202],"power":[203],"limited),":[204],"suggesting":[205],"safety,":[207],"compliance,":[208],"are":[212],"largely":[213],"dissociable.":[214],"These":[215],"types":[217],"show":[218],"structural":[219],"correspondence":[220],"patterns":[222],"observed":[223],"clinical":[225],"offender":[226],"treatment,":[227],"where":[228],"formal":[229],"without":[231],"recognized":[236],"risk":[237],"signal.":[238]},"counts_by_year":[],"updated_date":"2026-04-03T16:44:17.987007","created_date":"2026-04-03T00:00:00"}
