{"id":"https://openalex.org/W7147599714","doi":"https://doi.org/10.48550/arxiv.2603.26846","title":"Stable Reasoning, Unstable Responses: Mitigating LLM Deception via Stability Asymmetry","display_name":"Stable Reasoning, Unstable Responses: Mitigating LLM Deception via Stability Asymmetry","publication_year":2026,"publication_date":"2026-03-27","ids":{"openalex":"https://openalex.org/W7147599714","doi":"https://doi.org/10.48550/arxiv.2603.26846"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.26846","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.26846","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.26846","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5132597145","display_name":"Guoxi Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Zhang, Guoxi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132690293","display_name":"Jiawei Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Jiawei","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128222875","display_name":"Tianzhuo Yang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Tianzhuo","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132673904","display_name":"Lang Qin","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Qin, Lang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132639997","display_name":"Juntao Dai","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Dai, Juntao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132579717","display_name":"Yaodong Yang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Yaodong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5107829323","display_name":"Jingwei Yi","orcid":"https://orcid.org/0009-0001-2786-6395"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yi, Jingwei","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5132597145"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12026","display_name":"Explainable Artificial Intelligence (XAI)","score":0.1995999962091446,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12026","display_name":"Explainable Artificial Intelligence (XAI)","score":0.1995999962091446,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.1818999946117401,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11147","display_name":"Misinformation and Its Impacts","score":0.08649999648332596,"subfield":{"id":"https://openalex.org/subfields/3312","display_name":"Sociology and Political Science"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/asymmetry","display_name":"Asymmetry","score":0.6894999742507935},{"id":"https://openalex.org/keywords/stability","display_name":"Stability (learning theory)","score":0.6007999777793884},{"id":"https://openalex.org/keywords/deception","display_name":"Deception","score":0.4828000068664551},{"id":"https://openalex.org/keywords/rendering","display_name":"Rendering (computer graphics)","score":0.4512999951839447},{"id":"https://openalex.org/keywords/trustworthiness","display_name":"Trustworthiness","score":0.3986999988555908}],"concepts":[{"id":"https://openalex.org/C38976095","wikidata":"https://www.wikidata.org/wiki/Q752641","display_name":"Asymmetry","level":2,"score":0.6894999742507935},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6075999736785889},{"id":"https://openalex.org/C112972136","wikidata":"https://www.wikidata.org/wiki/Q7595718","display_name":"Stability (learning theory)","level":2,"score":0.6007999777793884},{"id":"https://openalex.org/C2779267917","wikidata":"https://www.wikidata.org/wiki/Q170028","display_name":"Deception","level":2,"score":0.4828000068664551},{"id":"https://openalex.org/C205711294","wikidata":"https://www.wikidata.org/wiki/Q176953","display_name":"Rendering (computer graphics)","level":2,"score":0.4512999951839447},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.41260001063346863},{"id":"https://openalex.org/C153701036","wikidata":"https://www.wikidata.org/wiki/Q659974","display_name":"Trustworthiness","level":2,"score":0.3986999988555908},{"id":"https://openalex.org/C2776135515","wikidata":"https://www.wikidata.org/wiki/Q17143721","display_name":"Regularization (linguistics)","level":2,"score":0.3269999921321869},{"id":"https://openalex.org/C28901747","wikidata":"https://www.wikidata.org/wiki/Q177571","display_name":"Decision theory","level":2,"score":0.31200000643730164},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3027999997138977},{"id":"https://openalex.org/C2776502983","wikidata":"https://www.wikidata.org/wiki/Q690182","display_name":"Contrast (vision)","level":2,"score":0.25589999556541443}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.26846","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.26846","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.26846","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.26846","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/16","score":0.7187487483024597,"display_name":"Peace, Justice and strong institutions"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"As":[0],"Large":[1],"Language":[2],"Models":[3],"(LLMs)":[4],"expand":[5],"in":[6,60,74],"capability":[7],"and":[8,91,102,159],"application":[9],"scope,":[10],"their":[11,28],"trustworthiness":[12],"becomes":[13],"critical.":[14],"A":[15],"vital":[16],"risk":[17],"is":[18],"intrinsic":[19,164],"deception,":[20],"wherein":[21],"models":[22,47],"strategically":[23],"mislead":[24],"users":[25],"to":[26,50,146],"achieve":[27],"own":[29],"objectives.":[30],"Existing":[31],"alignment":[32,122],"approaches":[33],"based":[34],"on":[35,109],"chain-of-thought":[36],"(CoT)":[37],"monitoring":[38],"supervise":[39],"explicit":[40],"reasoning":[41],"traces.":[42],"However,":[43],"under":[44,83,106],"optimization":[45],"pressure,":[46],"are":[48],"incentivized":[49],"conceal":[51],"deceptive":[52,67,157],"reasoning,":[53],"rendering":[54,143],"semantic":[55,147],"supervision":[56],"fundamentally":[57],"unreliable.":[58],"Grounded":[59],"cognitive":[61],"psychology,":[62],"we":[63,113],"hypothesize":[64],"that":[65,124,152,160],"a":[66,70,120],"LLM":[68],"maintains":[69],"stable":[71],"internal":[72,99],"belief":[73],"its":[75,78],"CoT":[76,100,133],"while":[77],"external":[79,103],"response":[80,104],"remains":[81],"fragile":[82],"perturbation.":[84,107],"We":[85],"term":[86],"this":[87,110,126],"phenomenon":[88],"stability":[89,101,105,153],"asymmetry":[90,128,154],"quantify":[92],"it":[93,144],"by":[94],"measuring":[95],"the":[96,115,137],"contrast":[97],"between":[98],"Building":[108],"structural":[111],"signature,":[112],"propose":[114],"Stability":[116],"Asymmetry":[117],"Regularization":[118],"(SAR),":[119],"novel":[121],"objective":[123],"penalizes":[125],"distributional":[127],"during":[129],"reinforcement":[130],"learning.":[131],"Unlike":[132],"monitoring,":[134],"SAR":[135,161],"targets":[136],"statistical":[138],"structure":[139],"of":[140],"model":[141,169],"outputs,":[142],"robust":[145],"concealment.":[148],"Extensive":[149],"experiments":[150],"confirm":[151],"reliably":[155],"identifies":[156],"behavior,":[158],"effectively":[162],"suppresses":[163],"deception":[165],"without":[166],"degrading":[167],"general":[168],"capability.":[170]},"counts_by_year":[],"updated_date":"2026-05-05T08:41:31.759640","created_date":"2026-04-02T00:00:00"}
