{"id":"https://openalex.org/W7133845266","doi":"https://doi.org/10.48550/arxiv.2603.04069","title":"Monitoring Emergent Reward Hacking During Generation via Internal Activations","display_name":"Monitoring Emergent Reward Hacking During Generation via Internal Activations","publication_year":2026,"publication_date":"2026-03-04","ids":{"openalex":"https://openalex.org/W7133845266","doi":"https://doi.org/10.48550/arxiv.2603.04069"},"language":null,"primary_location":{"id":"pmh:doi:10.48550/arxiv.2603.04069","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":null,"any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5064105629","display_name":"Pascal Wilhelm","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wilhelm, Patrick","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5086364217","display_name":"Thorsten Wittkopp","orcid":"https://orcid.org/0000-0001-5154-7813"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wittkopp, Thorsten","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5119909913","display_name":"Odej Kao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kao, Odej","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":3,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.18930000066757202,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.18930000066757202,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12026","display_name":"Explainable Artificial Intelligence (XAI)","score":0.08799999952316284,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.0851999968290329,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/hacker","display_name":"Hacker","score":0.487199991941452},{"id":"https://openalex.org/keywords/residual","display_name":"Residual","score":0.4117000102996826},{"id":"https://openalex.org/keywords/internal-model","display_name":"Internal model","score":0.40790000557899475},{"id":"https://openalex.org/keywords/signal","display_name":"SIGNAL (programming language)","score":0.4011000096797943},{"id":"https://openalex.org/keywords/train","display_name":"Train","score":0.3447999954223633},{"id":"https://openalex.org/keywords/work","display_name":"Work (physics)","score":0.3156999945640564}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7141000032424927},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5497000217437744},{"id":"https://openalex.org/C86844869","wikidata":"https://www.wikidata.org/wiki/Q2798820","display_name":"Hacker","level":2,"score":0.487199991941452},{"id":"https://openalex.org/C155512373","wikidata":"https://www.wikidata.org/wiki/Q287450","display_name":"Residual","level":2,"score":0.4117000102996826},{"id":"https://openalex.org/C28427503","wikidata":"https://www.wikidata.org/wiki/Q13580300","display_name":"Internal model","level":3,"score":0.40790000557899475},{"id":"https://openalex.org/C2779843651","wikidata":"https://www.wikidata.org/wiki/Q7390335","display_name":"SIGNAL (programming language)","level":2,"score":0.4011000096797943},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.37130001187324524},{"id":"https://openalex.org/C190839683","wikidata":"https://www.wikidata.org/wiki/Q2448197","display_name":"Train","level":2,"score":0.3447999954223633},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.3384999930858612},{"id":"https://openalex.org/C18762648","wikidata":"https://www.wikidata.org/wiki/Q42213","display_name":"Work (physics)","level":2,"score":0.3156999945640564},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.28850001096725464},{"id":"https://openalex.org/C168167062","wikidata":"https://www.wikidata.org/wiki/Q1117970","display_name":"Component (thermodynamics)","level":2,"score":0.2863999903202057},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.2782000005245209}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:doi:10.48550/arxiv.2603.04069","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},{"id":"doi:10.48550/arxiv.2603.04069","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.04069","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:doi:10.48550/arxiv.2603.04069","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"sustainable_development_goals":[{"display_name":"Peace, Justice and strong institutions","id":"https://metadata.un.org/sdg/16","score":0.4515845775604248}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Fine-tuned":[0],"large":[1],"language":[2,172],"models":[3],"can":[4,40,127],"exhibit":[5,110],"reward-hacking":[6,53,83,100,118],"behavior":[7,39],"arising":[8],"from":[9,17,55,101],"emergent":[10,159],"misalignment,":[11],"which":[12],"is":[13],"difficult":[14],"to":[15,78,105],"detect":[16],"final":[18],"outputs":[19],"alone.":[20],"While":[21],"prior":[22],"work":[23],"has":[24],"studied":[25],"reward":[26,143],"hacking":[27],"at":[28],"the":[29,135],"level":[30],"of":[31,82,137,158],"completed":[32],"responses,":[33],"it":[34],"remains":[35],"unclear":[36],"whether":[37],"such":[38],"be":[41,128],"identified":[42],"during":[43,114],"generation.":[44],"We":[45],"propose":[46],"an":[47],"activation-based":[48],"monitoring":[49,151,169],"approach":[50],"that":[51,94,148],"detects":[52],"signals":[54,119],"internal":[56,95,149],"representations":[57],"as":[58],"a":[59,153],"model":[60,87],"generates":[61],"its":[62],"response.":[63],"Our":[64],"method":[65],"trains":[66],"sparse":[67],"autoencoders":[68],"on":[69],"residual":[70],"stream":[71],"activations":[72],"and":[73,89,109,126,155],"applies":[74],"lightweight":[75],"linear":[76],"classifiers":[77],"produce":[79],"token-level":[80],"estimates":[81],"activity.":[84],"Across":[85],"multiple":[86],"families":[88],"fine-tuning":[90],"mixtures,":[91],"we":[92],"find":[93],"activation":[96,150],"patterns":[97],"reliably":[98],"distinguish":[99],"benign":[102],"behavior,":[103],"generalize":[104],"unseen":[106],"mixed-policy":[107],"adapters,":[108],"model-dependent":[111],"temporal":[112],"structure":[113],"chain-of-thought":[115,138],"reasoning.":[116],"Notably,":[117],"often":[120],"emerge":[121],"early,":[122],"persist":[123],"throughout":[124],"reasoning,":[125],"amplified":[129],"by":[130],"increased":[131],"test-time":[132],"compute":[133],"in":[134],"form":[136],"prompting":[139],"under":[140],"weakly":[141],"specified":[142],"objectives.":[144],"These":[145],"results":[146],"suggest":[147],"provides":[152],"complementary":[154],"earlier":[156],"signal":[157],"misalignment":[160],"than":[161],"output-based":[162],"evaluation,":[163],"supporting":[164],"more":[165],"robust":[166],"post-deployment":[167],"safety":[168],"for":[170],"fine-tuned":[171],"models.":[173]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-03-06T00:00:00"}
