{"id":"https://openalex.org/W7161006585","doi":"https://doi.org/10.48550/arxiv.2605.11664","title":"Safety Context Injection: Inference-Time Safety Alignment via Static Filtering and Agentic Analysis","display_name":"Safety Context Injection: Inference-Time Safety Alignment via Static Filtering and Agentic Analysis","publication_year":2026,"publication_date":"2026-05-12","ids":{"openalex":"https://openalex.org/W7161006585","doi":"https://doi.org/10.48550/arxiv.2605.11664"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.11664","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.11664","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.11664","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5101950132","display_name":"Zhenhao Xu","orcid":"https://orcid.org/0000-0002-0276-2805"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xu, Zhenhao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136016605","display_name":"Wenhan Chang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chang, Wenhan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5059583546","display_name":"Yichuan Chen","orcid":"https://orcid.org/0009-0009-2034-7267"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Yichuan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136066617","display_name":"Yuxin Fang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Fang, Yuxin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136069300","display_name":"Junhao Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Junhao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5136080283","display_name":"Tianqing Zhu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhu, Tianqing","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":6,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.9793000221252441,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.9793000221252441,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10917","display_name":"Smart Grid Security and Resilience","score":0.002899999963119626,"subfield":{"id":"https://openalex.org/subfields/2207","display_name":"Control and Systems Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13295","display_name":"Safety Systems Engineering in Autonomy","score":0.002899999963119626,"subfield":{"id":"https://openalex.org/subfields/2213","display_name":"Safety, Risk, Reliability and Quality"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/guard","display_name":"Guard (computer science)","score":0.6832000017166138},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.6335999965667725},{"id":"https://openalex.org/keywords/context","display_name":"Context (archaeology)","score":0.6323999762535095},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.579800009727478},{"id":"https://openalex.org/keywords/adversarial-system","display_name":"Adversarial system","score":0.5314000248908997},{"id":"https://openalex.org/keywords/software-deployment","display_name":"Software deployment","score":0.4668000042438507},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.42969998717308044},{"id":"https://openalex.org/keywords/task-analysis","display_name":"Task analysis","score":0.39629998803138733}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7555000185966492},{"id":"https://openalex.org/C141141315","wikidata":"https://www.wikidata.org/wiki/Q2379942","display_name":"Guard (computer science)","level":2,"score":0.6832000017166138},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.6335999965667725},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.6323999762535095},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.579800009727478},{"id":"https://openalex.org/C37736160","wikidata":"https://www.wikidata.org/wiki/Q1801315","display_name":"Adversarial system","level":2,"score":0.5314000248908997},{"id":"https://openalex.org/C105339364","wikidata":"https://www.wikidata.org/wiki/Q2297740","display_name":"Software deployment","level":2,"score":0.4668000042438507},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.42969998717308044},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.39629998803138733},{"id":"https://openalex.org/C132835097","wikidata":"https://www.wikidata.org/wiki/Q7663745","display_name":"System safety","level":2,"score":0.37770000100135803},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3756999969482422},{"id":"https://openalex.org/C112930515","wikidata":"https://www.wikidata.org/wiki/Q4389547","display_name":"Risk analysis (engineering)","level":1,"score":0.34790000319480896},{"id":"https://openalex.org/C183322885","wikidata":"https://www.wikidata.org/wiki/Q17007702","display_name":"Context model","level":3,"score":0.33570000529289246},{"id":"https://openalex.org/C168167062","wikidata":"https://www.wikidata.org/wiki/Q1117970","display_name":"Component (thermodynamics)","level":2,"score":0.33219999074935913},{"id":"https://openalex.org/C38652104","wikidata":"https://www.wikidata.org/wiki/Q3510521","display_name":"Computer security","level":1,"score":0.32019999623298645},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3154999911785126},{"id":"https://openalex.org/C4554734","wikidata":"https://www.wikidata.org/wiki/Q593744","display_name":"Knowledge base","level":2,"score":0.2971999943256378},{"id":"https://openalex.org/C2775924081","wikidata":"https://www.wikidata.org/wiki/Q55608371","display_name":"Control (management)","level":2,"score":0.29190000891685486},{"id":"https://openalex.org/C97686452","wikidata":"https://www.wikidata.org/wiki/Q7604153","display_name":"Static analysis","level":2,"score":0.2863999903202057},{"id":"https://openalex.org/C106131492","wikidata":"https://www.wikidata.org/wiki/Q3072260","display_name":"Filter (signal processing)","level":2,"score":0.273499995470047},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.26989999413490295},{"id":"https://openalex.org/C42058472","wikidata":"https://www.wikidata.org/wiki/Q810214","display_name":"Base (topology)","level":2,"score":0.2574000060558319}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.11664","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.11664","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.11664","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.11664","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Large":[0],"Reasoning":[1],"Models":[2],"(LRMs)":[3],"improve":[4],"performance":[5],"on":[6],"complex":[7],"tasks,":[8],"but":[9,86],"they":[10],"also":[11],"make":[12],"safety":[13,51,108,122],"control":[14],"harder":[15],"at":[16,31],"deployment":[17],"time.":[18,33],"In":[19],"black-box":[20],"settings,":[21],"defenders":[22],"cannot":[23],"modify":[24],"model":[25,81],"weights":[26],"and":[27,57,113,147,158,168,172,185],"must":[28],"instead":[29],"intervene":[30],"inference":[32],"This":[34],"setting":[35],"creates":[36],"three":[37],"practical":[38],"challenges:":[39],"harmful":[40,203],"intent":[41,204],"may":[42],"be":[43],"hidden":[44],"by":[45],"educational":[46],"or":[47,163,208],"role-play":[48],"framing,":[49],"deep":[50],"analysis":[52],"can":[53,61,73],"introduce":[54],"non-trivial":[55],"latency,":[56],"long":[58,211],"adversarial":[59],"contexts":[60],"dilute":[62],"the":[63,80,125,188],"local":[64],"cues":[65],"that":[66,106,155],"simpler":[67],"filters":[68],"rely":[69],"on.":[70],"These":[71],"challenges":[72],"expose":[74],"an":[75,89,103,152,193],"apparent":[76],"thinking--output":[77],"gap,":[78],"where":[79],"appears":[82],"cautious":[83],"during":[84],"reasoning":[85,173],"still":[87],"produces":[88],"unsafe":[90],"final":[91],"answer.":[92],"To":[93],"address":[94],"this":[95],"problem,":[96],"we":[97],"propose":[98],"Safety":[99],"Context":[100],"Injection":[101],"(SCI),":[102],"inference-time":[104],"framework":[105,129],"separates":[107],"assessment":[109],"from":[110],"task":[111],"generation":[112],"prepends":[114],"a":[115,140],"structured":[116],"external":[117],"risk":[118],"report":[119],"as":[120],"injected":[121],"context":[123],"for":[124,144,161],"protected":[126],"model.":[127],"The":[128],"is":[130,199,205],"instantiated":[131],"in":[132,187],"two":[133],"complementary":[134],"variants:":[135],"Static":[136],"Model":[137],"Filtering":[138,150],"(SMF),":[139],"lightweight":[141],"one-pass":[142],"guard":[143],"fast":[145],"deployment,":[146],"Dynamic":[148],"Agents":[149],"(DAF),":[151],"agentic-loop-based":[153],"analyzer":[154],"iteratively":[156],"gathers":[157],"synthesizes":[159],"evidence":[160],"ambiguous":[162],"long-context":[164],"attacks.":[165],"Across":[166],"AdvBench":[167],"GPTFuzz,":[169],"spanning":[170],"base":[171],"models":[174],"under":[175],"five":[176],"jailbreak":[177],"families,":[178],"both":[179],"variants":[180],"reduce":[181],"attack":[182],"success":[183],"rate":[184],"toxicity":[186],"evaluated":[189],"settings.":[190],"SMF":[191],"offers":[192],"efficient":[194],"low-latency":[195],"option,":[196],"while":[197],"DAF":[198],"more":[200],"effective":[201],"when":[202],"semantically":[206],"disguised":[207],"dispersed":[209],"across":[210],"contexts.":[212]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-05-14T00:00:00"}
