{"id":"https://openalex.org/W7140134477","doi":"https://doi.org/10.18653/v1/2026.eacl-long.116","title":"When the Model Said \u2018No Comment\u2019, We Knew Helpfulness Was Dead, Honesty Was Alive, and Safety Was Terrified","display_name":"When the Model Said \u2018No Comment\u2019, We Knew Helpfulness Was Dead, Honesty Was Alive, and Safety Was Terrified","publication_year":2026,"publication_date":"2026-01-01","ids":{"openalex":"https://openalex.org/W7140134477","doi":"https://doi.org/10.18653/v1/2026.eacl-long.116"},"language":null,"primary_location":{"id":"doi:10.18653/v1/2026.eacl-long.116","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2026.eacl-long.116","pdf_url":"https://aclanthology.org/2026.eacl-long.116.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 19th Conference of the European Chapter of the Association for Computational Linguistics (Volume 1: Long Papers)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://aclanthology.org/2026.eacl-long.116.pdf","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Gautam Siddharth Kashyap","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gautam Siddharth Kashyap","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Mark Dras","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Mark Dras","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":null,"display_name":"Usman Naseem","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Usman Naseem","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":3,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":{"value":0.38690564,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"2561","last_page":"2572"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12520","display_name":"Psychology of Moral and Emotional Judgment","score":0.00930000003427267,"subfield":{"id":"https://openalex.org/subfields/2805","display_name":"Cognitive Neuroscience"},"field":{"id":"https://openalex.org/fields/28","display_name":"Neuroscience"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},"topics":[{"id":"https://openalex.org/T12520","display_name":"Psychology of Moral and Emotional Judgment","score":0.00930000003427267,"subfield":{"id":"https://openalex.org/subfields/2805","display_name":"Cognitive Neuroscience"},"field":{"id":"https://openalex.org/fields/28","display_name":"Neuroscience"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},{"id":"https://openalex.org/T13841","display_name":"Psychology of Social Influence","score":0.008700000122189522,"subfield":{"id":"https://openalex.org/subfields/3312","display_name":"Sociology and Political Science"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T11997","display_name":"Free Will and Agency","score":0.00800000037997961,"subfield":{"id":"https://openalex.org/subfields/2805","display_name":"Cognitive Neuroscience"},"field":{"id":"https://openalex.org/fields/28","display_name":"Neuroscience"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/honesty","display_name":"Honesty","score":0.6060000061988831},{"id":"https://openalex.org/keywords/helpfulness","display_name":"Helpfulness","score":0.484499990940094},{"id":"https://openalex.org/keywords/deception","display_name":"Deception","score":0.32350000739097595},{"id":"https://openalex.org/keywords/perception","display_name":"Perception","score":0.271699994802475},{"id":"https://openalex.org/keywords/work","display_name":"Work (physics)","score":0.25200000405311584}],"concepts":[{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.7390000224113464},{"id":"https://openalex.org/C2777293324","wikidata":"https://www.wikidata.org/wiki/Q337349","display_name":"Honesty","level":2,"score":0.6060000061988831},{"id":"https://openalex.org/C77805123","wikidata":"https://www.wikidata.org/wiki/Q161272","display_name":"Social psychology","level":1,"score":0.5275999903678894},{"id":"https://openalex.org/C2781265381","wikidata":"https://www.wikidata.org/wiki/Q5710255","display_name":"Helpfulness","level":2,"score":0.484499990940094},{"id":"https://openalex.org/C2779267917","wikidata":"https://www.wikidata.org/wiki/Q170028","display_name":"Deception","level":2,"score":0.32350000739097595},{"id":"https://openalex.org/C75630572","wikidata":"https://www.wikidata.org/wiki/Q538904","display_name":"Applied psychology","level":1,"score":0.27489998936653137},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.271699994802475},{"id":"https://openalex.org/C18762648","wikidata":"https://www.wikidata.org/wiki/Q42213","display_name":"Work (physics)","level":2,"score":0.25200000405311584},{"id":"https://openalex.org/C2780791683","wikidata":"https://www.wikidata.org/wiki/Q846785","display_name":"Action (physics)","level":2,"score":0.24169999361038208},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.23649999499320984}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.18653/v1/2026.eacl-long.116","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2026.eacl-long.116","pdf_url":"https://aclanthology.org/2026.eacl-long.116.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 19th Conference of the European Chapter of the Association for Computational Linguistics (Volume 1: Long Papers)","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.18653/v1/2026.eacl-long.116","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2026.eacl-long.116","pdf_url":"https://aclanthology.org/2026.eacl-long.116.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 19th Conference of the European Chapter of the Association for Computational Linguistics (Volume 1: Long Papers)","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[],"funders":[{"id":"https://openalex.org/F4320315885","display_name":"Australian Government","ror":"https://ror.org/0314h5y94"},{"id":"https://openalex.org/F4320320591","display_name":"Macquarie University","ror":"https://ror.org/01sf06y89"}],"has_content":{"pdf":true,"grobid_xml":true},"content_urls":{"pdf":"https://content.openalex.org/works/W7140134477.pdf","grobid_xml":"https://content.openalex.org/works/W7140134477.grobid-xml"},"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Large":[0],"Language":[1],"Models":[2],"(LLMs)":[3],"need":[4],"to":[5,29,43,88,146],"be":[6],"in":[7,36,129],"accordance":[8],"with":[9,124],"human":[10],"values-being":[11],"helpful,":[12],"harmless,":[13],"and":[14,26,69,107,121,131,139,155],"honest":[15],"(HHH)-is":[16],"important":[17],"for":[18],"safe":[19],"deployment.Existing":[20],"works":[21,33],"use":[22],"Supervised":[23],"Fine-Tuning":[24],"(SFT)":[25],"Mixtureof-Experts":[27],"(MoE)":[28],"align":[30],"LLMs.However,":[31],"these":[32],"face":[34],"challenges":[35],"multi-objective":[37],"settings,":[38],"such":[39],"as":[40],"SFT":[41],"leading":[42],"interference":[44],"between":[45],"conflicting":[46],"objectives,":[47],"while":[48],"MoEs":[49],"suffer":[50],"from":[51,73],"miscalibrated":[52],"routing.We":[53],"term":[54],"this":[55],"failure":[56],"mode":[57],"Axis":[58],"Collapse,":[59],"marked":[60],"by":[61,142],"(1)":[62],"disjoint":[63],"feature":[64],"spaces":[65],"causing":[66],"catastrophic":[67,94],"forgetting,":[68],"(2)":[70],"unreliable":[71],"inference":[72,111],"misrouted":[74],"experts.To":[75],"resolve":[76],"this,":[77],"we":[78],"propose":[79],"AlignX,":[80],"a":[81,98],"two-stage":[82],"framework.Stage":[83],"1":[84],"uses":[85],"prompt-injected":[86],"fine-tuning":[87],"extract":[89],"axis-specific":[90],"task":[91],"features,":[92],"mitigating":[93],"forgetting.Stage":[95],"2":[96],"deploys":[97],"Mo-CaE":[99],"module":[100],"that":[101],"calibrates":[102],"expert":[103],"routing":[104],"using":[105],"fractal":[106],"natural":[108],"geometry,":[109],"improving":[110],"reliability.AlignX":[112],"achieves":[113],"significant":[114],"gains":[115],"on":[116],"Alpaca":[117],"(Helpfulness),":[118],"Beaver-Tails":[119],"(Harmlessness),":[120],"TruthfulQA":[122],"(Honesty),":[123],"+171.5%":[125],"win":[126],"rate,":[127],"+110.1%":[128],"truthfulness-informativeness,":[130],"4.3%":[132],"fewer":[133],"safety":[134],"violations.It":[135],"also":[136],"reduces":[137],"latency":[138],"memory":[140],"usage":[141],"over":[143],"35%":[144],"compared":[145],"prior":[147],"MoEs.Results":[148],"across":[149],"four":[150],"LLMs":[151],"validate":[152],"its":[153],"generalizability.Code":[154],"data":[156],"are":[157],"available":[158],"at:":[159]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-02-12T00:00:00"}
