{"id":"https://openalex.org/W7154564540","doi":"https://doi.org/10.48550/arxiv.2604.13833","title":"Robust Reward Modeling for Large Language Models via Causal Decomposition","display_name":"Robust Reward Modeling for Large Language Models via Causal Decomposition","publication_year":2026,"publication_date":"2026-04-15","ids":{"openalex":"https://openalex.org/W7154564540","doi":"https://doi.org/10.48550/arxiv.2604.13833"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.13833","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.13833","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.13833","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5125149730","display_name":"Yunsheng Lu","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Lu, Yunsheng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133815924","display_name":"Zijiang Yang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Zijiang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5124278084","display_name":"Licheng Pan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Pan, Licheng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5133811766","display_name":"Zhixuan Chu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chu, Zhixuan","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5125149730"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.5982000231742859,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.5982000231742859,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12026","display_name":"Explainable Artificial Intelligence (XAI)","score":0.0877000018954277,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.05249999836087227,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/overfitting","display_name":"Overfitting","score":0.8639000058174133},{"id":"https://openalex.org/keywords/spurious-relationship","display_name":"Spurious relationship","score":0.7418000102043152},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.699400007724762},{"id":"https://openalex.org/keywords/embedding","display_name":"Embedding","score":0.5292999744415283},{"id":"https://openalex.org/keywords/signal","display_name":"SIGNAL (programming language)","score":0.5097000002861023},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.30709999799728394},{"id":"https://openalex.org/keywords/event","display_name":"Event (particle physics)","score":0.3037000000476837},{"id":"https://openalex.org/keywords/causal-model","display_name":"Causal model","score":0.2980000078678131}],"concepts":[{"id":"https://openalex.org/C22019652","wikidata":"https://www.wikidata.org/wiki/Q331309","display_name":"Overfitting","level":3,"score":0.8639000058174133},{"id":"https://openalex.org/C97256817","wikidata":"https://www.wikidata.org/wiki/Q1462316","display_name":"Spurious relationship","level":2,"score":0.7418000102043152},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.699400007724762},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6495000123977661},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5619000196456909},{"id":"https://openalex.org/C41608201","wikidata":"https://www.wikidata.org/wiki/Q980509","display_name":"Embedding","level":2,"score":0.5292999744415283},{"id":"https://openalex.org/C2779843651","wikidata":"https://www.wikidata.org/wiki/Q7390335","display_name":"SIGNAL (programming language)","level":2,"score":0.5097000002861023},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.43479999899864197},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.335099995136261},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.3319000005722046},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.30709999799728394},{"id":"https://openalex.org/C2779662365","wikidata":"https://www.wikidata.org/wiki/Q5416694","display_name":"Event (particle physics)","level":2,"score":0.3037000000476837},{"id":"https://openalex.org/C11671645","wikidata":"https://www.wikidata.org/wiki/Q5054567","display_name":"Causal model","level":2,"score":0.2980000078678131},{"id":"https://openalex.org/C18762648","wikidata":"https://www.wikidata.org/wiki/Q42213","display_name":"Work (physics)","level":2,"score":0.2980000078678131},{"id":"https://openalex.org/C64357122","wikidata":"https://www.wikidata.org/wiki/Q1149766","display_name":"Causality (physics)","level":2,"score":0.27799999713897705},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.27480000257492065},{"id":"https://openalex.org/C2779803651","wikidata":"https://www.wikidata.org/wiki/Q5282088","display_name":"Discriminator","level":3,"score":0.27230000495910645},{"id":"https://openalex.org/C2777601683","wikidata":"https://www.wikidata.org/wiki/Q6499736","display_name":"Vocabulary","level":2,"score":0.26420000195503235},{"id":"https://openalex.org/C207609745","wikidata":"https://www.wikidata.org/wiki/Q4944086","display_name":"Bootstrapping (finance)","level":2,"score":0.2630000114440918},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.258899986743927},{"id":"https://openalex.org/C51167844","wikidata":"https://www.wikidata.org/wiki/Q4422623","display_name":"Latent variable","level":2,"score":0.25189998745918274}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.13833","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.13833","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.13833","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.13833","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Reward":[0],"models":[1],"are":[2],"central":[3],"to":[4,13,45,61,77,129,147],"aligning":[5],"large":[6],"language":[7],"models,":[8],"yet":[9],"they":[10],"often":[11],"overfit":[12],"spurious":[14],"cues":[15,29],"such":[16],"as":[17,74],"response":[18],"length":[19],"and":[20,100,107,122,144,149],"overly":[21],"agreeable":[22],"tone.":[23],"Most":[24],"prior":[25],"work":[26],"weakens":[27],"these":[28],"directly":[30],"by":[31],"penalizing":[32],"or":[33],"controlling":[34],"specific":[35],"artifacts,":[36],"but":[37],"it":[38],"does":[39],"not":[40],"explicitly":[41],"encourage":[42],"the":[43,49,62,67,79,103],"model":[44,81],"ground":[46],"preferences":[47],"in":[48,120,153],"prompt's":[50],"intent.":[51],"We":[52,83],"learn":[53],"a":[54,58,75],"decoder":[55,104],"that":[56,87],"maps":[57],"candidate":[59],"answer":[60],"latent":[63],"intent":[64],"embedding":[65],"of":[66],"input.":[68],"The":[69],"reconstruction":[70],"error":[71],"is":[72],"used":[73],"signal":[76,89,116],"regularize":[78],"reward":[80],"training.":[82],"provide":[84],"theoretical":[85],"evidence":[86],"this":[88,115],"emphasizes":[90],"prompt-dependent":[91],"information":[92],"while":[93,140],"suppressing":[94],"prompt-independent":[95],"shortcuts.":[96],"Across":[97],"math,":[98],"helpfulness,":[99],"safety":[101],"benchmarks,":[102],"selects":[105],"shorter":[106,142],"less":[108],"sycophantic":[109],"candidates":[110],"with":[111],"0.877":[112],"accuracy.":[113],"Incorporating":[114],"into":[117],"RM":[118],"training":[119],"Gemma-2-2B-it":[121],"Gemma-2-9B-it":[123],"increases":[124,136],"RewardBench":[125],"accuracy":[126],"from":[127],"0.832":[128],"0.868.":[130],"For":[131],"Best-of-N":[132],"selection,":[133],"our":[134],"framework":[135],"length-controlled":[137],"win":[138],"rates":[139],"producing":[141],"outputs,":[143],"remains":[145],"robust":[146],"lengthening":[148],"mild":[150],"off-topic":[151],"drift":[152],"controlled":[154],"rewrite":[155],"tests.":[156]},"counts_by_year":[],"updated_date":"2026-04-17T06:04:52.305304","created_date":"2026-04-17T00:00:00"}
