{"id":"https://openalex.org/W7162072729","doi":"https://doi.org/10.1145/3786335.3813133","title":"Exploring and Developing a Pre-Model Safeguard with Draft Models","display_name":"Exploring and Developing a Pre-Model Safeguard with Draft Models","publication_year":2026,"publication_date":"2026-05-22","ids":{"openalex":"https://openalex.org/W7162072729","doi":"https://doi.org/10.1145/3786335.3813133"},"language":null,"primary_location":{"id":"doi:10.1145/3786335.3813133","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3786335.3813133","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the ACM Conference on AI and Agentic Systems","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://doi.org/10.1145/3786335.3813133","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5136539129","display_name":"Hongyu Cai","orcid":null},"institutions":[{"id":"https://openalex.org/I219193219","display_name":"Purdue University West Lafayette","ror":"https://ror.org/02dqehb95","country_code":"US","type":"education","lineage":["https://openalex.org/I219193219"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Hongyu Cai","raw_affiliation_strings":["Purdue University, West Lafayette, USA"],"raw_orcid":"https://orcid.org/0000-0001-9280-8493","affiliations":[{"raw_affiliation_string":"Purdue University, West Lafayette, USA","institution_ids":["https://openalex.org/I219193219"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5093013828","display_name":"Arjun Arunasalam","orcid":"https://orcid.org/0009-0001-1631-6064"},"institutions":[{"id":"https://openalex.org/I19700959","display_name":"Florida International University","ror":"https://ror.org/02gz6gg07","country_code":"US","type":"education","lineage":["https://openalex.org/I19700959"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Arjun Arunasalam","raw_affiliation_strings":["Florida International University, Miami, USA"],"raw_orcid":"https://orcid.org/0009-0001-1631-6064","affiliations":[{"raw_affiliation_string":"Florida International University, Miami, USA","institution_ids":["https://openalex.org/I19700959"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5064871355","display_name":"Yiming Liang","orcid":"https://orcid.org/0000-0001-9193-4789"},"institutions":[{"id":"https://openalex.org/I219193219","display_name":"Purdue University West Lafayette","ror":"https://ror.org/02dqehb95","country_code":"US","type":"education","lineage":["https://openalex.org/I219193219"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Yiming Liang","raw_affiliation_strings":["Purdue University, West Lafayette, USA"],"raw_orcid":"https://orcid.org/0009-0002-6071-5211","affiliations":[{"raw_affiliation_string":"Purdue University, West Lafayette, USA","institution_ids":["https://openalex.org/I219193219"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5121542937","display_name":"Antonio Bianchi","orcid":null},"institutions":[{"id":"https://openalex.org/I219193219","display_name":"Purdue University West Lafayette","ror":"https://ror.org/02dqehb95","country_code":"US","type":"education","lineage":["https://openalex.org/I219193219"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Antonio Bianchi","raw_affiliation_strings":["Purdue University, West Lafayette, USA"],"raw_orcid":"https://orcid.org/0000-0002-2862-5286","affiliations":[{"raw_affiliation_string":"Purdue University, West Lafayette, USA","institution_ids":["https://openalex.org/I219193219"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5135391333","display_name":"Z. Berkay Celik","orcid":null},"institutions":[{"id":"https://openalex.org/I219193219","display_name":"Purdue University West Lafayette","ror":"https://ror.org/02dqehb95","country_code":"US","type":"education","lineage":["https://openalex.org/I219193219"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Z. Berkay Celik","raw_affiliation_strings":["Purdue University, West Lafayette, USA"],"raw_orcid":"https://orcid.org/0000-0001-7362-8905","affiliations":[{"raw_affiliation_string":"Purdue University, West Lafayette, USA","institution_ids":["https://openalex.org/I219193219"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.75782784,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"839","last_page":"854"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.2621000111103058,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.2621000111103058,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.2387000024318695,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12026","display_name":"Explainable Artificial Intelligence (XAI)","score":0.052299998700618744,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/safeguard","display_name":"Safeguard","score":0.8593999743461609},{"id":"https://openalex.org/keywords/audit","display_name":"Audit","score":0.6995000243186951},{"id":"https://openalex.org/keywords/set","display_name":"Set (abstract data type)","score":0.6126000285148621},{"id":"https://openalex.org/keywords/transferability","display_name":"Transferability","score":0.5281999707221985},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.5228999853134155},{"id":"https://openalex.org/keywords/security-token","display_name":"Security token","score":0.4253000020980835}],"concepts":[{"id":"https://openalex.org/C2780771206","wikidata":"https://www.wikidata.org/wiki/Q3271761","display_name":"Safeguard","level":2,"score":0.8593999743461609},{"id":"https://openalex.org/C199521495","wikidata":"https://www.wikidata.org/wiki/Q181487","display_name":"Audit","level":2,"score":0.6995000243186951},{"id":"https://openalex.org/C38652104","wikidata":"https://www.wikidata.org/wiki/Q3510521","display_name":"Computer security","level":1,"score":0.6238999962806702},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.6126000285148621},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6075999736785889},{"id":"https://openalex.org/C61272859","wikidata":"https://www.wikidata.org/wiki/Q7834031","display_name":"Transferability","level":3,"score":0.5281999707221985},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.5228999853134155},{"id":"https://openalex.org/C112930515","wikidata":"https://www.wikidata.org/wiki/Q4389547","display_name":"Risk analysis (engineering)","level":1,"score":0.48350000381469727},{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.4253000020980835},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.3675000071525574},{"id":"https://openalex.org/C140547941","wikidata":"https://www.wikidata.org/wiki/Q7797194","display_name":"Threat model","level":2,"score":0.32100000977516174},{"id":"https://openalex.org/C33276779","wikidata":"https://www.wikidata.org/wiki/Q1943363","display_name":"Design elements and principles","level":2,"score":0.28929999470710754},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.2542000114917755}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3786335.3813133","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3786335.3813133","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the ACM Conference on AI and Agentic Systems","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/3786335.3813133","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3786335.3813133","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the ACM Conference on AI and Agentic Systems","raw_type":"proceedings-article"},"sustainable_development_goals":[{"display_name":"Peace, Justice and strong institutions","id":"https://metadata.un.org/sdg/16","score":0.8285943269729614}],"awards":[{"id":"https://openalex.org/G6360409883","display_name":"AI Institute for Agent-based Cyber Threat Intelligence and Operation","funder_award_id":"2229876","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"}],"funders":[{"id":"https://openalex.org/F4320306076","display_name":"National Science Foundation","ror":"https://ror.org/021nxhr62"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":7,"referenced_works":["https://openalex.org/W2587741066","https://openalex.org/W2979826702","https://openalex.org/W3170572542","https://openalex.org/W4400484590","https://openalex.org/W4402667010","https://openalex.org/W4402671828","https://openalex.org/W4404782026"],"related_works":[],"abstract_inverted_index":{"Large":[0],"Language":[1],"Model":[2],"(LLM)":[3],"alignment":[4],"remains":[5],"vulnerable":[6],"to":[7,38,99,119,168,171,188,207,230,234,253],"jailbreak":[8,43,97,114,158,245],"attacks":[9,44,98],"that":[10,92,139,213],"elicit":[11],"unsafe":[12],"responses,":[13],"motivating":[14],"pre-model":[15,221,235],"and":[16,58,74,201,223,268,293],"post-model":[17,231,254],"guards.":[18,232],"Pre-model":[19],"guards":[20,48,206,222],"audit":[21],"the":[22,34,55,59,94,146,198,217,241,260,282],"safety":[23,102,147],"of":[24,96,113,149,192,220,244,250,266,288,300,307],"prompts":[25,246,287],"before":[26,103],"invoking":[27],"target":[28,60,81,104,153],"models.":[29],"However,":[30,63],"relying":[31],"solely":[32],"on":[33,134,177,285],"prompt":[35,57,101,159,200],"often":[36],"leads":[37],"high":[39,67],"false-negative":[40,218,242,261],"rates":[41],"(i.e.,":[42],"go":[45],"undetected).":[46],"Post-model":[47],"address":[49],"this":[50,85,178,214],"issue":[51],"by":[52,247,263,272],"auditing":[53],"both":[54,291],"user":[56],"model\u2019s":[61],"response.":[62,175],"they":[64,78],"incur":[65],"a":[66,89,110,157,190,225,296],"computational":[68],"cost,":[69],"including":[70],"increased":[71],"token":[72],"usage":[73],"processing":[75],"time,":[76],"because":[77],"operate":[79],"after":[80],"model":[82,105],"inference.":[83,106],"In":[84],"paper,":[86],"we":[87,127,137],"introduce":[88],"safeguard":[90,181,238,257,279],"design":[91,182,215,239,258,280],"leverages":[93,183],"transferability":[95],"enforce":[100],"We":[107,211],"first":[108],"conduct":[109],"systematic":[111],"study":[112],"transferability,":[115],"particularly":[116],"from":[117,141,151],"LLMs":[118],"small":[120],"language":[121],"models":[122,144],"(SLMs).":[123],"Through":[124],"these":[125,135,202],"experiments,":[126],"identify":[128],"key":[129],"factors":[130],"influencing":[131],"transferability.":[132],"Building":[133],"insights,":[136],"observe":[138],"responses":[140],"smaller":[142],"draft":[143,193],"reflect":[145],"implications":[148],"those":[150],"large":[152],"models;":[154],"i.e.,":[155],"given":[156],"constructed":[160],"for":[161],"an":[162,164,173,248,264],"LLM,":[163],"SLM":[165],"is":[166],"likely":[167],"be":[169],"triggered":[170],"generate":[172,189],"unaligned":[174],"Based":[176],"observation,":[179],"our":[180,237,256,278],"speculative":[184],"inference":[185],"with":[186,295],"SLMs":[187],"set":[191],"responses.":[194],"It":[195],"then":[196],"feeds":[197],"original":[199],"drafts":[203],"into":[204],"existing":[205],"predict":[208],"their":[209],"safety.":[210],"demonstrate":[212],"reduces":[216,240,259,269],"rate":[219,243,262],"offers":[224],"low":[226],"prompt-to-response":[227,270],"time":[228,271],"alternative":[229],"Compared":[233],"guards,":[236,255],"average":[249,265],"32.4%.":[251],"Relative":[252],"17.38%":[267],"97.07%":[273],"(Llama-3-70B-Instruct-AWQ).":[274],"For":[275],"benign":[276,286],"prompts,":[277],"achieves":[281],"same":[283],"accuracy":[284],"98%":[289],"as":[290],"pre-":[292],"post-guards,":[294],"minimal":[297],"latency":[298],"increase":[299],"0.59%.":[301],"Notice:":[302],"This":[303],"paper":[304],"contains":[305],"examples":[306],"harmful":[308],"language.":[309]},"counts_by_year":[],"updated_date":"2026-06-22T08:00:12.763002","created_date":"2026-05-23T00:00:00"}
