{"id":"https://openalex.org/W7161823375","doi":"https://doi.org/10.48550/arxiv.2605.19321","title":"Exploring and Developing a Pre-Model Safeguard with Draft Models","display_name":"Exploring and Developing a Pre-Model Safeguard with Draft Models","publication_year":2026,"publication_date":"2026-05-19","ids":{"openalex":"https://openalex.org/W7161823375","doi":"https://doi.org/10.48550/arxiv.2605.19321"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.19321","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.19321","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.19321","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5136539129","display_name":"Hongyu Cai","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cai, Hongyu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5093013828","display_name":"Arjun Arunasalam","orcid":"https://orcid.org/0009-0001-1631-6064"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Arunasalam, Arjun","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136606978","display_name":"Yiming Liang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liang, Yiming","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5121542937","display_name":"Antonio Bianchi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bianchi, Antonio","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5135391333","display_name":"Z. Berkay Celik","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Celik, Z. Berkay","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":5,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.5612999796867371,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.5612999796867371,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12026","display_name":"Explainable Artificial Intelligence (XAI)","score":0.09549999982118607,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.09160000085830688,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/safeguard","display_name":"Safeguard","score":0.7961999773979187},{"id":"https://openalex.org/keywords/audit","display_name":"Audit","score":0.6929000020027161},{"id":"https://openalex.org/keywords/set","display_name":"Set (abstract data type)","score":0.5972999930381775},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.5626000165939331},{"id":"https://openalex.org/keywords/transferability","display_name":"Transferability","score":0.491100013256073},{"id":"https://openalex.org/keywords/security-token","display_name":"Security token","score":0.41690000891685486}],"concepts":[{"id":"https://openalex.org/C2780771206","wikidata":"https://www.wikidata.org/wiki/Q3271761","display_name":"Safeguard","level":2,"score":0.7961999773979187},{"id":"https://openalex.org/C199521495","wikidata":"https://www.wikidata.org/wiki/Q181487","display_name":"Audit","level":2,"score":0.6929000020027161},{"id":"https://openalex.org/C38652104","wikidata":"https://www.wikidata.org/wiki/Q3510521","display_name":"Computer security","level":1,"score":0.6513000130653381},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.611299991607666},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.5972999930381775},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.5626000165939331},{"id":"https://openalex.org/C61272859","wikidata":"https://www.wikidata.org/wiki/Q7834031","display_name":"Transferability","level":3,"score":0.491100013256073},{"id":"https://openalex.org/C112930515","wikidata":"https://www.wikidata.org/wiki/Q4389547","display_name":"Risk analysis (engineering)","level":1,"score":0.48429998755455017},{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.41690000891685486},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.3603000044822693},{"id":"https://openalex.org/C140547941","wikidata":"https://www.wikidata.org/wiki/Q7797194","display_name":"Threat model","level":2,"score":0.3499000072479248},{"id":"https://openalex.org/C108827166","wikidata":"https://www.wikidata.org/wiki/Q175975","display_name":"Internet privacy","level":1,"score":0.2736999988555908},{"id":"https://openalex.org/C2780791683","wikidata":"https://www.wikidata.org/wiki/Q846785","display_name":"Action (physics)","level":2,"score":0.26570001244544983},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.25850000977516174},{"id":"https://openalex.org/C2779777834","wikidata":"https://www.wikidata.org/wiki/Q4202277","display_name":"Enforcement","level":2,"score":0.2502000033855438}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.19321","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.19321","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.19321","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.19321","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/16","display_name":"Peace, Justice and strong institutions","score":0.8357987403869629}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Large":[0],"Language":[1],"Model":[2],"(LLM)":[3],"alignment":[4],"remains":[5],"vulnerable":[6],"to":[7,38,99,119,168,171,188,207,229],"jailbreak":[8,43,97,114,158],"attacks":[9,44,98],"that":[10,92,139,213],"elicit":[11],"unsafe":[12],"responses,":[13],"motivating":[14],"pre-model":[15,221],"and":[16,58,74,201,223],"post-model":[17,230],"guards.":[18,231],"Pre-model":[19],"guards":[20,48,206,222],"audit":[21],"the":[22,34,55,59,94,146,198,217],"safety":[23,102,147],"of":[24,96,113,149,192,220,238],"prompts":[25],"before":[26,103],"invoking":[27],"target":[28,60,81,104,153],"models.":[29],"However,":[30,63],"relying":[31],"solely":[32],"on":[33,134,177],"prompt":[35,57,101,159,200],"often":[36],"leads":[37],"high":[39,67],"false-negative":[40,218],"rates":[41],"(i.e.,":[42],"go":[45],"undetected).":[46],"Post-model":[47],"address":[49],"this":[50,85,178,214],"issue":[51],"by":[52],"auditing":[53],"both":[54],"user":[56],"model's":[61],"response.":[62,175],"they":[64,78],"incur":[65],"a":[66,89,110,157,190,225],"computational":[68],"cost,":[69],"including":[70],"increased":[71],"token":[72],"usage":[73],"processing":[75],"time,":[76],"because":[77],"operate":[79],"after":[80],"model":[82,105],"inference.":[83,106],"In":[84],"paper,":[86],"we":[87,127,137],"introduce":[88],"safeguard":[90,181],"design":[91,182,215],"leverages":[93,183],"transferability":[95],"enforce":[100],"We":[107,211],"first":[108],"conduct":[109],"systematic":[111],"study":[112],"transferability,":[115],"particularly":[116],"from":[117,141,151],"LLMs":[118],"small":[120],"language":[121],"models":[122,144],"(SLMs).":[123],"Through":[124],"these":[125,135,202],"experiments,":[126],"identify":[128],"key":[129],"factors":[130],"influencing":[131],"transferability.":[132],"Building":[133],"insights,":[136],"observe":[138],"responses":[140],"smaller":[142],"draft":[143,193],"reflect":[145],"implications":[148],"those":[150],"large":[152],"models;":[154],"\\ie":[155],"given":[156],"constructed":[160],"for":[161],"an":[162,164,173],"LLM,":[163],"SLM":[165],"is":[166],"likely":[167],"be":[169],"triggered":[170],"generate":[172,189],"unaligned":[174],"Based":[176],"observation,":[179],"our":[180],"speculative":[184],"inference":[185],"with":[186],"SLMs":[187],"set":[191],"responses.":[194],"It":[195],"then":[196],"feeds":[197],"original":[199],"drafts":[203],"into":[204],"existing":[205],"predict":[208],"their":[209],"safety.":[210],"demonstrate":[212],"reduces":[216],"rate":[219],"offers":[224],"low":[226],"\\Efficiency":[227],"alternative":[228],"\\textcolor{red}{\\bf":[232],"Notice:":[233],"This":[234],"paper":[235],"contains":[236],"examples":[237],"harmful":[239],"language.}":[240]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-05-21T00:00:00"}
