{"id":"https://openalex.org/W7159147779","doi":"https://doi.org/10.48550/arxiv.2604.26052","title":"From Prompt Risk to Response Risk: Paired Analysis of Safety Behavior of Large Language Model","display_name":"From Prompt Risk to Response Risk: Paired Analysis of Safety Behavior of Large Language Model","publication_year":2026,"publication_date":"2026-04-28","ids":{"openalex":"https://openalex.org/W7159147779","doi":"https://doi.org/10.48550/arxiv.2604.26052"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.26052","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.26052","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.26052","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5110751728","display_name":"Mengya Hu","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Hu, Mengya","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134905991","display_name":"Qiong Wei","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wei, Qiong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5001343538","display_name":"Sandeep Atluri","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Atluri, Sandeep","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5110751728"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.16699999570846558,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.16699999570846558,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.14669999480247498,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12262","display_name":"Hate Speech and Cyberbullying Detection","score":0.09510000050067902,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/harm","display_name":"Harm","score":0.8015999794006348},{"id":"https://openalex.org/keywords/relevance","display_name":"Relevance (law)","score":0.7056000232696533},{"id":"https://openalex.org/keywords/poison-control","display_name":"Poison control","score":0.39410001039505005},{"id":"https://openalex.org/keywords/human-factors-and-ergonomics","display_name":"Human factors and ergonomics","score":0.37630000710487366},{"id":"https://openalex.org/keywords/empirical-research","display_name":"Empirical research","score":0.33489999175071716},{"id":"https://openalex.org/keywords/signature","display_name":"Signature (topology)","score":0.31859999895095825},{"id":"https://openalex.org/keywords/content-analysis","display_name":"Content analysis","score":0.3025999963283539}],"concepts":[{"id":"https://openalex.org/C2777363581","wikidata":"https://www.wikidata.org/wiki/Q15098235","display_name":"Harm","level":2,"score":0.8015999794006348},{"id":"https://openalex.org/C158154518","wikidata":"https://www.wikidata.org/wiki/Q7310970","display_name":"Relevance (law)","level":2,"score":0.7056000232696533},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.6567000150680542},{"id":"https://openalex.org/C77805123","wikidata":"https://www.wikidata.org/wiki/Q161272","display_name":"Social psychology","level":1,"score":0.4977000057697296},{"id":"https://openalex.org/C3017944768","wikidata":"https://www.wikidata.org/wiki/Q1450463","display_name":"Poison control","level":2,"score":0.39410001039505005},{"id":"https://openalex.org/C166735990","wikidata":"https://www.wikidata.org/wiki/Q1750812","display_name":"Human factors and ergonomics","level":3,"score":0.37630000710487366},{"id":"https://openalex.org/C180747234","wikidata":"https://www.wikidata.org/wiki/Q23373","display_name":"Cognitive psychology","level":1,"score":0.3601999878883362},{"id":"https://openalex.org/C120936955","wikidata":"https://www.wikidata.org/wiki/Q2155640","display_name":"Empirical research","level":2,"score":0.33489999175071716},{"id":"https://openalex.org/C2779696439","wikidata":"https://www.wikidata.org/wiki/Q7512811","display_name":"Signature (topology)","level":2,"score":0.31859999895095825},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.3077000081539154},{"id":"https://openalex.org/C162446236","wikidata":"https://www.wikidata.org/wiki/Q653137","display_name":"Content analysis","level":2,"score":0.3025999963283539},{"id":"https://openalex.org/C190385971","wikidata":"https://www.wikidata.org/wiki/Q373494","display_name":"Injury prevention","level":3,"score":0.2985000014305115},{"id":"https://openalex.org/C138496976","wikidata":"https://www.wikidata.org/wiki/Q175002","display_name":"Developmental psychology","level":1,"score":0.2957000136375427},{"id":"https://openalex.org/C75630572","wikidata":"https://www.wikidata.org/wiki/Q538904","display_name":"Applied psychology","level":1,"score":0.289000004529953},{"id":"https://openalex.org/C526869908","wikidata":"https://www.wikidata.org/wiki/Q3298118","display_name":"Suicide prevention","level":3,"score":0.2825999855995178},{"id":"https://openalex.org/C70410870","wikidata":"https://www.wikidata.org/wiki/Q199906","display_name":"Clinical psychology","level":1,"score":0.2806999981403351},{"id":"https://openalex.org/C187155963","wikidata":"https://www.wikidata.org/wiki/Q629029","display_name":"Occupational safety and health","level":2,"score":0.2793000042438507},{"id":"https://openalex.org/C38652104","wikidata":"https://www.wikidata.org/wiki/Q3510521","display_name":"Computer security","level":1,"score":0.2768000066280365},{"id":"https://openalex.org/C2994210853","wikidata":"https://www.wikidata.org/wiki/Q673281","display_name":"Sexual assault","level":4,"score":0.2734000086784363},{"id":"https://openalex.org/C64357122","wikidata":"https://www.wikidata.org/wiki/Q1149766","display_name":"Causality (physics)","level":2,"score":0.2711000144481659},{"id":"https://openalex.org/C12174686","wikidata":"https://www.wikidata.org/wiki/Q1058438","display_name":"Risk assessment","level":2,"score":0.26660001277923584},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.26019999384880066},{"id":"https://openalex.org/C2781009140","wikidata":"https://www.wikidata.org/wiki/Q7170389","display_name":"Persistence (discontinuity)","level":2,"score":0.2508000135421753}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.26052","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.26052","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.26052","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.26052","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"display_name":"Gender equality","score":0.7999550104141235,"id":"https://metadata.un.org/sdg/5"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Safety":[0],"evaluations":[1],"of":[2,63],"large":[3],"language":[4],"models":[5],"(LLMs)":[6],"typically":[7],"report":[8],"binary":[9],"outcomes,":[10],"i.e.":[11],"attack":[12],"success":[13],"rate":[14],"(ASR),":[15],"refusal":[16],"rate,":[17],"or":[18],"harmful":[19,87],"versus":[20,134],"safe":[21,143],"classification,":[22],"which":[23],"hide":[24],"how":[25],"risk":[26],"changes":[27],"between":[28],"prompt":[29,40],"and":[30,41,52,54,74,89],"response.":[31],"We":[32],"present":[33],"a":[34,132,156],"paired":[35],"analysis":[36,130],"over":[37],"human":[38],"labeled":[39],"response":[42],"records":[43],"across":[44],"four":[45],"harm":[46,66,110],"categories":[47],"(Sexual,":[48],"Self":[49],"harm,":[50],"Hate":[51],"Violence)":[53],"ordinal":[55],"severity":[56,97,121],"levels":[57],"(Safe,":[58],"Low,":[59],"Medium,":[60],"High).":[61],"61%":[62],"responses":[64,144],"reduce":[65],"relative":[67],"to":[68],"the":[69,99,108,119],"prompt,":[70],"36%":[71],"preserve":[72],"severity,":[73],"3%":[75],"escalate.":[76],"The":[77],"escalation":[78],"splits":[79],"into":[80],"two":[81],"mechanisms:":[82],"benign":[83,126],"prompts":[84,167],"triggering":[85],"unrequested":[86],"detail,":[88],"answers":[90],"that":[91,104,160],"stay":[92],"on":[93],"task":[94],"at":[95,118,170],"higher":[96],"than":[98,123],"prompt.":[100],"Category":[101],"decomposition":[102],"shows":[103],"Sexual":[105],"content":[106],"exhibits":[107],"highest":[109],"persistence":[111],"in":[112],"this":[113],"sample,":[114],"driven":[115],"by":[116],"compliance":[117,137],"same":[120],"rather":[122],"drift":[124],"from":[125],"inputs.":[127],"Joint":[128],"relevance":[129],"exposes":[131],"helpfulness":[133],"harmlessness":[135],"tradeoff:":[136],"escalations":[138],"remain":[139],"highly":[140],"relevant,":[141],"whereas":[142],"include":[145],"generic":[146],"refusals":[147],"with":[148],"low":[149],"relevance.":[150],"Finally,":[151],"few-shot":[152],"LLM":[153],"graders":[154],"exhibit":[155],"prompt/response":[157],"detection":[158],"asymmetry":[159],"data":[161],"calibration":[162],"does":[163],"not":[164],"close.":[165],"Grader":[166],"are":[168],"shared":[169],"https://github.com/microsoft/PairedSafety.":[171]},"counts_by_year":[],"updated_date":"2026-05-22T06:13:13.366637","created_date":"2026-05-01T00:00:00"}
