{"id":"https://openalex.org/W7148435653","doi":"https://doi.org/10.48550/arxiv.2604.00228","title":"Do Language Models Know When They'll Refuse? Probing Introspective Awareness of Safety Boundaries","display_name":"Do Language Models Know When They'll Refuse? Probing Introspective Awareness of Safety Boundaries","publication_year":2026,"publication_date":"2026-03-31","ids":{"openalex":"https://openalex.org/W7148435653","doi":"https://doi.org/10.48550/arxiv.2604.00228"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.00228","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.00228","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.00228","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5132798598","display_name":"Tanay Gondil","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Gondil, Tanay","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":1,"corresponding_author_ids":["https://openalex.org/A5132798598"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.5407999753952026,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.5407999753952026,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10525","display_name":"Human-Automation Interaction and Safety","score":0.054999999701976776,"subfield":{"id":"https://openalex.org/subfields/3207","display_name":"Social Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T12026","display_name":"Explainable Artificial Intelligence (XAI)","score":0.03280000016093254,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/introspection","display_name":"Introspection","score":0.8849999904632568},{"id":"https://openalex.org/keywords/sonnet","display_name":"Sonnet","score":0.7177000045776367},{"id":"https://openalex.org/keywords/sensitivity","display_name":"Sensitivity (control systems)","score":0.6055999994277954},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.5092999935150146},{"id":"https://openalex.org/keywords/flagging","display_name":"Flagging","score":0.42750000953674316},{"id":"https://openalex.org/keywords/signal","display_name":"SIGNAL (programming language)","score":0.337799996137619}],"concepts":[{"id":"https://openalex.org/C129671850","wikidata":"https://www.wikidata.org/wiki/Q210501","display_name":"Introspection","level":2,"score":0.8849999904632568},{"id":"https://openalex.org/C38721330","wikidata":"https://www.wikidata.org/wiki/Q80056","display_name":"Sonnet","level":3,"score":0.7177000045776367},{"id":"https://openalex.org/C21200559","wikidata":"https://www.wikidata.org/wiki/Q7451068","display_name":"Sensitivity (control systems)","level":2,"score":0.6055999994277954},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5677000284194946},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.5092999935150146},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4781000018119812},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.4417000114917755},{"id":"https://openalex.org/C2777548347","wikidata":"https://www.wikidata.org/wiki/Q5456937","display_name":"Flagging","level":2,"score":0.42750000953674316},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.4059000015258789},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.40450000762939453},{"id":"https://openalex.org/C180747234","wikidata":"https://www.wikidata.org/wiki/Q23373","display_name":"Cognitive psychology","level":1,"score":0.3707999885082245},{"id":"https://openalex.org/C2779843651","wikidata":"https://www.wikidata.org/wiki/Q7390335","display_name":"SIGNAL (programming language)","level":2,"score":0.337799996137619},{"id":"https://openalex.org/C2778571376","wikidata":"https://www.wikidata.org/wiki/Q1355821","display_name":"Frontier","level":2,"score":0.3043999969959259},{"id":"https://openalex.org/C198531522","wikidata":"https://www.wikidata.org/wiki/Q485146","display_name":"Sample (material)","level":2,"score":0.2962999939918518},{"id":"https://openalex.org/C2777146004","wikidata":"https://www.wikidata.org/wiki/Q14949826","display_name":"CLARITY","level":2,"score":0.28700000047683716},{"id":"https://openalex.org/C149782125","wikidata":"https://www.wikidata.org/wiki/Q160039","display_name":"Econometrics","level":1,"score":0.2818000018596649},{"id":"https://openalex.org/C77805123","wikidata":"https://www.wikidata.org/wiki/Q161272","display_name":"Social psychology","level":1,"score":0.2775999903678894},{"id":"https://openalex.org/C182365436","wikidata":"https://www.wikidata.org/wiki/Q50701","display_name":"Variable (mathematics)","level":2,"score":0.27709999680519104},{"id":"https://openalex.org/C105795698","wikidata":"https://www.wikidata.org/wiki/Q12483","display_name":"Statistics","level":1,"score":0.274399995803833},{"id":"https://openalex.org/C151201525","wikidata":"https://www.wikidata.org/wiki/Q177239","display_name":"Limit (mathematics)","level":2,"score":0.26030001044273376}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.00228","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.00228","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.00228","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.00228","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Large":[0],"language":[1],"models":[2,29,72],"are":[3,139],"trained":[4],"to":[5,151],"refuse":[6,17],"harmful":[7],"requests,":[8,46],"but":[9,80,119],"can":[10],"they":[11,15],"accurately":[12],"predict":[13,31],"when":[14],"will":[16],"before":[18],"responding?":[19],"We":[20,87],"investigate":[21],"this":[22],"question":[23],"through":[24],"a":[25,38],"systematic":[26],"study":[27],"where":[28],"first":[30],"their":[32],"refusal":[33,122],"behavior,":[34],"then":[35],"respond":[36],"in":[37,128],"fresh":[39],"context.":[40],"Across":[41],"3754":[42],"datapoints":[43],"spanning":[44],"300":[45],"we":[47,68],"evaluate":[48],"four":[49],"frontier":[50],"models:":[51],"Claude":[52,55,92],"Sonnet":[53,56,99],"4,":[54],"4.5,":[57],"GPT-5.2,":[58],"and":[59,124],"Llama":[60,114],"3.1":[61],"405B.":[62],"Using":[63],"signal":[64],"detection":[65],"theory":[66],"(SDT),":[67],"find":[69],"that":[70],"all":[71],"exhibit":[73],"high":[74,117],"introspective":[75],"sensitivity":[76,81,118],"(d'":[77],"=":[78],"2.4-3.5),":[79],"drops":[82],"substantially":[83],"at":[84],"safety":[85],"boundaries.":[86],"observe":[88],"generational":[89],"improvement":[90],"within":[91],"(Sonnet":[93],"4.5:":[94],"95.7":[95],"percent":[96,156],"accuracy":[97,107,131,157],"vs":[98],"4:":[100],"93.0":[101],"percent),":[102],"while":[103],"GPT-5.2":[104],"shows":[105],"lower":[106,129],"(88.9":[108],"percent)":[109],"with":[110],"more":[111],"variable":[112],"behavior.":[113],"405B":[115],"achieves":[116],"exhibits":[120],"strong":[121],"bias":[123],"poor":[125],"calibration,":[126],"resulting":[127],"overall":[130],"(80.0":[132],"percent).":[133],"Topic-wise":[134],"analysis":[135],"reveals":[136],"weapons-related":[137],"queries":[138],"consistently":[140],"hardest":[141],"for":[142,158,165],"introspection.":[143],"Critically,":[144],"confidence":[145],"scores":[146],"provide":[147],"actionable":[148],"signal:":[149],"restricting":[150],"high-confidence":[152],"predictions":[153],"yields":[154],"98.3":[155],"well-calibrated":[159],"models,":[160],"enabling":[161],"practical":[162],"confidence-based":[163],"routing":[164],"safety-critical":[166],"deployments.":[167]},"counts_by_year":[],"updated_date":"2026-04-03T16:44:17.987007","created_date":"2026-04-03T00:00:00"}
