{"id":"https://openalex.org/W7138139325","doi":"https://doi.org/10.1609/aaai.v40i39.40584","title":"Causal Reward Adjustment: Mitigating Reward Hacking in External Reasoning via Backdoor Correction","display_name":"Causal Reward Adjustment: Mitigating Reward Hacking in External Reasoning via Backdoor Correction","publication_year":2026,"publication_date":"2026-03-14","ids":{"openalex":"https://openalex.org/W7138139325","doi":"https://doi.org/10.1609/aaai.v40i39.40584"},"language":"en","primary_location":{"id":"doi:10.1609/aaai.v40i39.40584","is_oa":true,"landing_page_url":"https://doi.org/10.1609/aaai.v40i39.40584","pdf_url":null,"source":{"id":"https://openalex.org/S4210191458","display_name":"Proceedings of the AAAI Conference on Artificial Intelligence","issn_l":"2159-5399","issn":["2159-5399","2374-3468"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/P4310320058","host_organization_name":"Association for the Advancement of Artificial Intelligence","host_organization_lineage":["https://openalex.org/P4310320058"],"host_organization_lineage_names":["Association for the Advancement of Artificial Intelligence"],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the AAAI Conference on Artificial Intelligence","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"diamond","oa_url":"https://doi.org/10.1609/aaai.v40i39.40584","any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5111333083","display_name":"Ruike Song","orcid":null},"institutions":[{"id":"https://openalex.org/I4210128818","display_name":"Institute of Software","ror":"https://ror.org/033dfsn42","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210128818"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Ruike Song","raw_affiliation_strings":["Institute of Software Chinese Academy of Sciences\nNankai University"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Institute of Software Chinese Academy of Sciences\nNankai University","institution_ids":["https://openalex.org/I4210128818"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129677282","display_name":"Zeen Song","orcid":null},"institutions":[{"id":"https://openalex.org/I4210128818","display_name":"Institute of Software","ror":"https://ror.org/033dfsn42","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210128818"]},{"id":"https://openalex.org/I4210165038","display_name":"University of Chinese Academy of Sciences","ror":"https://ror.org/05qbk4x57","country_code":"CN","type":"education","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210165038"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zeen Song","raw_affiliation_strings":["Institute of Software Chinese Academy of Sciences\nUniversity of Chinese Academy of Sciences"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Institute of Software Chinese Academy of Sciences\nUniversity of Chinese Academy of Sciences","institution_ids":["https://openalex.org/I4210128818","https://openalex.org/I4210165038"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129747263","display_name":"Huijie Guo","orcid":null},"institutions":[{"id":"https://openalex.org/I4210128818","display_name":"Institute of Software","ror":"https://ror.org/033dfsn42","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210128818"]},{"id":"https://openalex.org/I4210165038","display_name":"University of Chinese Academy of Sciences","ror":"https://ror.org/05qbk4x57","country_code":"CN","type":"education","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210165038"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Huijie Guo","raw_affiliation_strings":["Institute of Software Chinese Academy of Sciences\nUniversity of Chinese Academy of Sciences"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Institute of Software Chinese Academy of Sciences\nUniversity of Chinese Academy of Sciences","institution_ids":["https://openalex.org/I4210128818","https://openalex.org/I4210165038"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5129739126","display_name":"Wenwen Qiang","orcid":null},"institutions":[{"id":"https://openalex.org/I4210128818","display_name":"Institute of Software","ror":"https://ror.org/033dfsn42","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210128818"]},{"id":"https://openalex.org/I4210165038","display_name":"University of Chinese Academy of Sciences","ror":"https://ror.org/05qbk4x57","country_code":"CN","type":"education","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210165038"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Wenwen Qiang","raw_affiliation_strings":["Institute of Software Chinese Academy of Sciences\nUniversity of Chinese Academy of Sciences"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Institute of Software Chinese Academy of Sciences\nUniversity of Chinese Academy of Sciences","institution_ids":["https://openalex.org/I4210128818","https://openalex.org/I4210165038"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.31386515,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"40","issue":"39","first_page":"33019","last_page":"33027"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12026","display_name":"Explainable Artificial Intelligence (XAI)","score":0.30379998683929443,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12026","display_name":"Explainable Artificial Intelligence (XAI)","score":0.30379998683929443,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11902","display_name":"Intelligent Tutoring Systems and Adaptive Learning","score":0.15929999947547913,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.11630000174045563,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/backdoor","display_name":"Backdoor","score":0.794700026512146},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.6486999988555908},{"id":"https://openalex.org/keywords/process","display_name":"Process (computing)","score":0.5756999850273132},{"id":"https://openalex.org/keywords/causal-inference","display_name":"Causal inference","score":0.5148000121116638},{"id":"https://openalex.org/keywords/causal-model","display_name":"Causal model","score":0.49300000071525574},{"id":"https://openalex.org/keywords/causal-reasoning","display_name":"Causal reasoning","score":0.4530999958515167},{"id":"https://openalex.org/keywords/hacker","display_name":"Hacker","score":0.42309999465942383}],"concepts":[{"id":"https://openalex.org/C2781045450","wikidata":"https://www.wikidata.org/wiki/Q254569","display_name":"Backdoor","level":2,"score":0.794700026512146},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.6486999988555908},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6427000164985657},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6022999882698059},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.5756999850273132},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.520799994468689},{"id":"https://openalex.org/C158600405","wikidata":"https://www.wikidata.org/wiki/Q5054566","display_name":"Causal inference","level":2,"score":0.5148000121116638},{"id":"https://openalex.org/C11671645","wikidata":"https://www.wikidata.org/wiki/Q5054567","display_name":"Causal model","level":2,"score":0.49300000071525574},{"id":"https://openalex.org/C115086926","wikidata":"https://www.wikidata.org/wiki/Q17004651","display_name":"Causal reasoning","level":3,"score":0.4530999958515167},{"id":"https://openalex.org/C86844869","wikidata":"https://www.wikidata.org/wiki/Q2798820","display_name":"Hacker","level":2,"score":0.42309999465942383},{"id":"https://openalex.org/C2778924833","wikidata":"https://www.wikidata.org/wiki/Q7064603","display_name":"Novelty detection","level":3,"score":0.41260001063346863},{"id":"https://openalex.org/C190839683","wikidata":"https://www.wikidata.org/wiki/Q2448197","display_name":"Train","level":2,"score":0.3977999985218048},{"id":"https://openalex.org/C86827895","wikidata":"https://www.wikidata.org/wiki/Q7098582","display_name":"Opportunistic reasoning","level":4,"score":0.3815999925136566},{"id":"https://openalex.org/C180747234","wikidata":"https://www.wikidata.org/wiki/Q23373","display_name":"Cognitive psychology","level":1,"score":0.35280001163482666},{"id":"https://openalex.org/C2778712577","wikidata":"https://www.wikidata.org/wiki/Q3505966","display_name":"Retraining","level":2,"score":0.33070001006126404},{"id":"https://openalex.org/C163504300","wikidata":"https://www.wikidata.org/wiki/Q2364925","display_name":"Causal structure","level":2,"score":0.2930000126361847},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.29249998927116394},{"id":"https://openalex.org/C143661069","wikidata":"https://www.wikidata.org/wiki/Q670713","display_name":"Reward system","level":2,"score":0.29030001163482666},{"id":"https://openalex.org/C21563000","wikidata":"https://www.wikidata.org/wiki/Q484511","display_name":"Inductive reasoning","level":2,"score":0.2632000148296356}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1609/aaai.v40i39.40584","is_oa":true,"landing_page_url":"https://doi.org/10.1609/aaai.v40i39.40584","pdf_url":null,"source":{"id":"https://openalex.org/S4210191458","display_name":"Proceedings of the AAAI Conference on Artificial Intelligence","issn_l":"2159-5399","issn":["2159-5399","2374-3468"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/P4310320058","host_organization_name":"Association for the Advancement of Artificial Intelligence","host_organization_lineage":["https://openalex.org/P4310320058"],"host_organization_lineage_names":["Association for the Advancement of Artificial Intelligence"],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the AAAI Conference on Artificial Intelligence","raw_type":"journal-article"},{"id":"pmh:oai:ojs.aaai.org:article/40584","is_oa":false,"landing_page_url":"https://ojs.aaai.org/index.php/AAAI/article/view/40584","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"2159-5399","raw_type":"info:eu-repo/semantics/publishedVersion"}],"best_oa_location":{"id":"doi:10.1609/aaai.v40i39.40584","is_oa":true,"landing_page_url":"https://doi.org/10.1609/aaai.v40i39.40584","pdf_url":null,"source":{"id":"https://openalex.org/S4210191458","display_name":"Proceedings of the AAAI Conference on Artificial Intelligence","issn_l":"2159-5399","issn":["2159-5399","2374-3468"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/P4310320058","host_organization_name":"Association for the Advancement of Artificial Intelligence","host_organization_lineage":["https://openalex.org/P4310320058"],"host_organization_lineage_names":["Association for the Advancement of Artificial Intelligence"],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the AAAI Conference on Artificial Intelligence","raw_type":"journal-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"External":[0],"reasoning":[1,14,88],"systems":[2,26],"combine":[3],"language":[4],"models":[5,9],"with":[6],"process":[7],"reward":[8,30,79,85,119],"(PRMs)":[10],"to":[11,29,46,59,99],"select":[12],"high-quality":[13],"paths":[15,37],"for":[16],"complex":[17],"tasks":[18],"such":[19],"as":[20],"mathematical":[21],"problem":[22],"solving.":[23],"However,":[24],"these":[25],"are":[27,38],"prone":[28],"hacking,":[31],"where":[32],"high-scoring":[33],"but":[34],"logically":[35],"incorrect":[36,47],"assigned":[39],"high":[40],"scores":[41],"by":[42,81,106],"the":[43,60,83,95,127],"PRMs,":[44],"leading":[45],"answers.":[48],"From":[49],"a":[50,75,87],"causal":[51],"inference":[52],"perspective,":[53],"we":[54,69],"attribute":[55],"this":[56],"phenomenon":[57],"primarily":[58],"presence":[61],"of":[62,86],"confounding":[63,105],"semantic":[64],"features.":[65],"To":[66],"address":[67],"it,":[68],"propose":[70],"Causal":[71],"Reward":[72],"Adjustment":[73],"(CRA),":[74],"method":[76],"that":[77,116],"mitigates":[78,118],"hacking":[80,120],"estimating":[82],"true":[84],"path.":[89],"CRA":[90,117],"trains":[91],"sparse":[92],"autoencoders":[93],"on":[94,111],"PRM\u2019s":[96],"internal":[97],"activations":[98],"recover":[100],"interpretable":[101],"features,":[102],"then":[103],"corrects":[104],"using":[107],"backdoor":[108],"adjustment.":[109],"Experiments":[110],"math":[112],"solving":[113],"datasets":[114],"demonstrate":[115],"and":[121],"improves":[122],"final":[123],"accuracy,":[124],"without":[125],"modifying":[126],"policy":[128],"model":[129],"or":[130],"retraining":[131],"PRM.":[132]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-03-18T00:00:00"}
