{"id":"https://openalex.org/W7155532861","doi":"https://doi.org/10.48550/arxiv.2604.20930","title":"SafeRedirect: Defeating Internal Safety Collapse via Task-Completion Redirection in Frontier LLMs","display_name":"SafeRedirect: Defeating Internal Safety Collapse via Task-Completion Redirection in Frontier LLMs","publication_year":2026,"publication_date":"2026-04-22","ids":{"openalex":"https://openalex.org/W7155532861","doi":"https://doi.org/10.48550/arxiv.2604.20930"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.20930","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.20930","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.20930","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5134522933","display_name":"Chao Pan","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Pan, Chao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134488676","display_name":"Yu Wu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wu, Yu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5134502042","display_name":"Xin Yao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yao, Xin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5134522933"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11424","display_name":"Security and Verification in Computing","score":0.4113999903202057,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11424","display_name":"Security and Verification in Computing","score":0.4113999903202057,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.39890000224113464,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10734","display_name":"Information and Cyber Security","score":0.03759999945759773,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/permission","display_name":"Permission","score":0.631600022315979},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.5859000086784363},{"id":"https://openalex.org/keywords/frontier","display_name":"Frontier","score":0.5619000196456909},{"id":"https://openalex.org/keywords/code","display_name":"Code (set theory)","score":0.37049999833106995},{"id":"https://openalex.org/keywords/failure-mode-and-effects-analysis","display_name":"Failure mode and effects analysis","score":0.35179999470710754},{"id":"https://openalex.org/keywords/component","display_name":"Component (thermodynamics)","score":0.34619998931884766},{"id":"https://openalex.org/keywords/catastrophic-failure","display_name":"Catastrophic failure","score":0.3395000100135803}],"concepts":[{"id":"https://openalex.org/C2779089604","wikidata":"https://www.wikidata.org/wiki/Q7169333","display_name":"Permission","level":2,"score":0.631600022315979},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.5859000086784363},{"id":"https://openalex.org/C2778571376","wikidata":"https://www.wikidata.org/wiki/Q1355821","display_name":"Frontier","level":2,"score":0.5619000196456909},{"id":"https://openalex.org/C38652104","wikidata":"https://www.wikidata.org/wiki/Q3510521","display_name":"Computer security","level":1,"score":0.5134000182151794},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.3984000086784363},{"id":"https://openalex.org/C162324750","wikidata":"https://www.wikidata.org/wiki/Q8134","display_name":"Economics","level":0,"score":0.3912999927997589},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.37049999833106995},{"id":"https://openalex.org/C112930515","wikidata":"https://www.wikidata.org/wiki/Q4389547","display_name":"Risk analysis (engineering)","level":1,"score":0.36160001158714294},{"id":"https://openalex.org/C66283442","wikidata":"https://www.wikidata.org/wiki/Q1389268","display_name":"Failure mode and effects analysis","level":2,"score":0.35179999470710754},{"id":"https://openalex.org/C168167062","wikidata":"https://www.wikidata.org/wiki/Q1117970","display_name":"Component (thermodynamics)","level":2,"score":0.34619998931884766},{"id":"https://openalex.org/C112987892","wikidata":"https://www.wikidata.org/wiki/Q5051574","display_name":"Catastrophic failure","level":2,"score":0.3395000100135803},{"id":"https://openalex.org/C144133560","wikidata":"https://www.wikidata.org/wiki/Q4830453","display_name":"Business","level":0,"score":0.3303000032901764},{"id":"https://openalex.org/C188198153","wikidata":"https://www.wikidata.org/wiki/Q1613840","display_name":"Limiting","level":2,"score":0.32499998807907104},{"id":"https://openalex.org/C2779662365","wikidata":"https://www.wikidata.org/wiki/Q5416694","display_name":"Event (particle physics)","level":2,"score":0.32170000672340393},{"id":"https://openalex.org/C177148314","wikidata":"https://www.wikidata.org/wiki/Q170084","display_name":"Generalization","level":2,"score":0.32100000977516174},{"id":"https://openalex.org/C95713431","wikidata":"https://www.wikidata.org/wiki/Q631425","display_name":"Vulnerability (computing)","level":2,"score":0.31299999356269836},{"id":"https://openalex.org/C163164238","wikidata":"https://www.wikidata.org/wiki/Q2737027","display_name":"Failure rate","level":2,"score":0.31060001254081726},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.3100999891757965},{"id":"https://openalex.org/C162118730","wikidata":"https://www.wikidata.org/wiki/Q1128453","display_name":"Actuarial science","level":1,"score":0.3021000027656555},{"id":"https://openalex.org/C12725497","wikidata":"https://www.wikidata.org/wiki/Q810247","display_name":"Baseline (sea)","level":2,"score":0.28450000286102295},{"id":"https://openalex.org/C48677424","wikidata":"https://www.wikidata.org/wiki/Q6888088","display_name":"Mode (computer interface)","level":2,"score":0.26269999146461487},{"id":"https://openalex.org/C2776544517","wikidata":"https://www.wikidata.org/wiki/Q189447","display_name":"Unexpected events","level":2,"score":0.25360000133514404},{"id":"https://openalex.org/C21547014","wikidata":"https://www.wikidata.org/wiki/Q1423657","display_name":"Operations management","level":1,"score":0.2506999969482422}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.20930","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.20930","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.20930","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.20930","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"display_name":"Peace, Justice and strong institutions","score":0.73958420753479,"id":"https://metadata.un.org/sdg/16"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Internal":[0],"Safety":[1],"Collapse":[2],"(ISC)":[3],"is":[4,170],"a":[5,38,56,81],"failure":[6,30,40,131],"mode":[7],"in":[8,105],"which":[9],"frontier":[10,97],"LLMs,":[11],"when":[12],"executing":[13],"legitimate":[14],"professional":[15],"tasks":[16],"whose":[17],"correct":[18],"completion":[19],"structurally":[20],"requires":[21],"harmful":[22,91],"content,":[23],"spontaneously":[24],"generate":[25],"that":[26,59,130],"content":[27],"with":[28,155,162],"safety":[29],"rates":[31,114],"exceeding":[32],"95%.":[33],"Existing":[34],"input-level":[35],"defenses":[36,48],"achieve":[37],"100%":[39],"rate":[41],"against":[42,153],"ISC,":[43],"and":[44,85,133],"standard":[45],"system":[46],"prompt":[47],"provide":[49],"only":[50],"partial":[51],"mitigation.":[52],"We":[53],"propose":[54],"SafeRedirect,":[55],"system-level":[57],"override":[58],"defeats":[60],"ISC":[61,102,154],"by":[62],"redirecting":[63],"the":[64,78,87,106,123,140,163],"model's":[65],"task-completion":[66],"drive":[67],"rather":[68],"than":[69],"suppressing":[70],"it.":[71],"SafeRedirect":[72,109],"grants":[73],"explicit":[74],"permission":[75,132],"to":[76,89,117,120],"fail":[77],"task,":[79],"prescribes":[80],"deterministic":[82],"hard-stop":[83],"output,":[84],"instructs":[86],"model":[88],"preserve":[90],"placeholders":[92],"unresolved.":[93],"Evaluated":[94],"on":[95,160,165],"seven":[96],"LLMs":[98],"across":[99,146],"three":[100],"AI/ML-related":[101],"task":[103],"types":[104],"single-turn":[107],"setting,":[108],"reduces":[110],"average":[111],"unsafe":[112],"generation":[113],"from":[115],"71.2%":[116],"8.0%,":[118],"compared":[119],"55.0%":[121],"for":[122],"strongest":[124],"viable":[125],"baseline.":[126],"Multi-model":[127],"ablation":[128],"reveals":[129],"condition":[134],"specificity":[135],"are":[136],"universally":[137],"critical,":[138],"while":[139],"importance":[141],"of":[142],"other":[143,166],"components":[144],"varies":[145],"models.":[147],"Cross-attack":[148],"evaluation":[149],"confirms":[150],"state-of-the-art":[151],"defense":[152],"generalization":[156],"performance":[157],"at":[158,172],"least":[159],"par":[161],"baseline":[164],"attack":[167],"families.":[168],"Code":[169],"available":[171],"https://github.com/fzjcdt/SafeRedirect.":[173]},"counts_by_year":[],"updated_date":"2026-04-25T06:06:54.107920","created_date":"2026-04-25T00:00:00"}
