{"id":"https://openalex.org/W7135057308","doi":"https://doi.org/10.48550/arxiv.2603.10938","title":"Safe RLHF Beyond Expectation: Stochastic Dominance for Universal Spectral Risk Control","display_name":"Safe RLHF Beyond Expectation: Stochastic Dominance for Universal Spectral Risk Control","publication_year":2026,"publication_date":"2026-03-11","ids":{"openalex":"https://openalex.org/W7135057308","doi":"https://doi.org/10.48550/arxiv.2603.10938"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.10938","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.10938","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.10938","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5128863444","display_name":"Yaswanth Chittepu","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Chittepu, Yaswanth","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5082186366","display_name":"Ativ Joshi","orcid":"https://orcid.org/0000-0002-2858-0604"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Joshi, Ativ","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128902987","display_name":"Rajarshi Bhattacharjee","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bhattacharjee, Rajarshi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5043572737","display_name":"Scott Niekum","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Niekum, Scott","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5128863444"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.7264999747276306,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.7264999747276306,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10462","display_name":"Reinforcement Learning in Robotics","score":0.13779999315738678,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10917","display_name":"Smart Grid Security and Resilience","score":0.01899999938905239,"subfield":{"id":"https://openalex.org/subfields/2207","display_name":"Control and Systems Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/stochastic-dominance","display_name":"Stochastic dominance","score":0.6927000284194946},{"id":"https://openalex.org/keywords/weighting","display_name":"Weighting","score":0.6442999839782715},{"id":"https://openalex.org/keywords/robustness","display_name":"Robustness (evolution)","score":0.5566999912261963},{"id":"https://openalex.org/keywords/quantile","display_name":"Quantile","score":0.5080999732017517},{"id":"https://openalex.org/keywords/probability-distribution","display_name":"Probability distribution","score":0.41519999504089355},{"id":"https://openalex.org/keywords/operationalization","display_name":"Operationalization","score":0.40130001306533813}],"concepts":[{"id":"https://openalex.org/C33252445","wikidata":"https://www.wikidata.org/wiki/Q3713315","display_name":"Stochastic dominance","level":2,"score":0.6927000284194946},{"id":"https://openalex.org/C183115368","wikidata":"https://www.wikidata.org/wiki/Q856577","display_name":"Weighting","level":2,"score":0.6442999839782715},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.5566999912261963},{"id":"https://openalex.org/C126255220","wikidata":"https://www.wikidata.org/wiki/Q141495","display_name":"Mathematical optimization","level":1,"score":0.5112000107765198},{"id":"https://openalex.org/C118671147","wikidata":"https://www.wikidata.org/wiki/Q578714","display_name":"Quantile","level":2,"score":0.5080999732017517},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.46299999952316284},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.4242999851703644},{"id":"https://openalex.org/C149782125","wikidata":"https://www.wikidata.org/wiki/Q160039","display_name":"Econometrics","level":1,"score":0.4196000099182129},{"id":"https://openalex.org/C149441793","wikidata":"https://www.wikidata.org/wiki/Q200726","display_name":"Probability distribution","level":2,"score":0.41519999504089355},{"id":"https://openalex.org/C9354725","wikidata":"https://www.wikidata.org/wiki/Q286017","display_name":"Operationalization","level":2,"score":0.40130001306533813},{"id":"https://openalex.org/C107673813","wikidata":"https://www.wikidata.org/wiki/Q812534","display_name":"Bayesian probability","level":2,"score":0.3580999970436096},{"id":"https://openalex.org/C125112378","wikidata":"https://www.wikidata.org/wiki/Q176640","display_name":"Randomness","level":2,"score":0.33709999918937683},{"id":"https://openalex.org/C22029948","wikidata":"https://www.wikidata.org/wiki/Q45089","display_name":"Dice","level":2,"score":0.2883000075817108},{"id":"https://openalex.org/C151913843","wikidata":"https://www.wikidata.org/wiki/Q3454555","display_name":"Dominance (genetics)","level":3,"score":0.27309998869895935},{"id":"https://openalex.org/C89128539","wikidata":"https://www.wikidata.org/wiki/Q1949963","display_name":"Statistic","level":2,"score":0.26019999384880066},{"id":"https://openalex.org/C177769412","wikidata":"https://www.wikidata.org/wiki/Q278090","display_name":"Prior probability","level":3,"score":0.2506999969482422}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.10938","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.10938","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.10938","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.10938","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Safe":[0],"Reinforcement":[1],"Learning":[2],"from":[3],"Human":[4],"Feedback":[5],"(RLHF)":[6],"typically":[7],"enforces":[8],"safety":[9],"through":[10],"expected":[11,100],"cost":[12,24,61,101,118],"constraints,":[13],"but":[14],"the":[15,23,115,181,197],"expectation":[16],"captures":[17],"only":[18],"a":[19,55,93,123,140,163,187,192],"single":[20],"statistic":[21],"of":[22,122,166],"distribution":[25,119],"and":[26,47,74,135,142,156,215],"fails":[27],"to":[28,120,138],"account":[29],"for":[30,146,190],"distributional":[31],"uncertainty,":[32],"particularly":[33],"under":[34,174],"heavy":[35],"tails":[36],"or":[37],"rare":[38],"catastrophic":[39],"events.":[40],"This":[41,185],"limitation":[42],"is":[43],"problematic":[44],"when":[45],"robustness":[46,218],"risk":[48,194],"sensitivity":[49],"are":[50],"critical.":[51],"Stochastic":[52,105],"dominance":[53,176],"offers":[54],"principled":[56,188],"alternative":[57],"by":[58,113],"comparing":[59,114],"entire":[60],"distributions":[62],"rather":[63],"than":[64],"just":[65],"their":[66],"averages,":[67],"enabling":[68],"direct":[69],"control":[70],"over":[71,208],"tail":[72],"risks":[73],"potential":[75],"out-of-distribution":[76,220],"failures":[77],"that":[78,97,121,158,172,204],"expectation-based":[79],"constraints":[80,102,155],"may":[81],"overlook.":[82],"In":[83],"this":[84,111],"work,":[85],"we":[86,151],"propose":[87],"Risk-sensitive":[88],"Alignment":[89],"via":[90,196],"Dominance":[91,106],"(RAD),":[92],"novel":[94],"alignment":[95],"framework":[96],"replaces":[98],"scalar":[99],"with":[103],"First-Order":[104],"(FSD)":[107],"constraints.":[108],"We":[109],"operationalize":[110],"constraint":[112],"target":[116],"policy's":[117],"reference":[124],"policy":[125],"within":[126],"an":[127],"Optimal":[128],"Transport":[129],"(OT)":[130],"framework,":[131],"using":[132],"entropic":[133],"regularization":[134],"Sinkhorn":[136],"iterations":[137],"obtain":[139],"differentiable":[141],"computationally":[143],"efficient":[144],"objective":[145],"stable":[147],"end-to-end":[148],"optimization.":[149],"Furthermore,":[150],"introduce":[152],"quantile-weighted":[153],"FSD":[154,160],"show":[157],"weighted":[159,175],"universally":[161],"controls":[162],"broad":[164],"class":[165],"Spectral":[167],"Risk":[168],"Measures":[169],"(SRMs),":[170],"so":[171],"improvements":[173,179],"imply":[177],"guaranteed":[178],"in":[180,213],"corresponding":[182],"spectral":[183],"risk.":[184],"provides":[186],"mechanism":[189],"tuning":[191],"model's":[193],"profile":[195],"quantile":[198],"weighting":[199],"function.":[200],"Empirical":[201],"results":[202],"demonstrate":[203],"RAD":[205],"improves":[206],"harmlessness":[207,221],"baselines":[209],"while":[210],"remaining":[211],"competitive":[212],"helpfulness,":[214],"exhibits":[216],"greater":[217],"on":[219],"evaluations.":[222]},"counts_by_year":[],"updated_date":"2026-03-13T14:25:03.468858","created_date":"2026-03-13T00:00:00"}
