{"id":"https://openalex.org/W7134068034","doi":"https://doi.org/10.48550/arxiv.2603.04582","title":"Self-Attribution Bias: When AI Monitors Go Easy on Themselves","display_name":"Self-Attribution Bias: When AI Monitors Go Easy on Themselves","publication_year":2026,"publication_date":"2026-03-04","ids":{"openalex":"https://openalex.org/W7134068034","doi":"https://doi.org/10.48550/arxiv.2603.04582"},"language":null,"primary_location":{"id":"pmh:doi:10.48550/arxiv.2603.04582","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":null,"any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5128234288","display_name":"Dipika Khullar","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Khullar, Dipika","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103705890","display_name":"Jack W. Hopk\u00edns","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hopkins, Jack","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5127096224","display_name":"Rowan Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Rowan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5128248068","display_name":"Fabien Roger","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Roger, Fabien","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":4,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10883","display_name":"Ethics and Social Impacts of AI","score":0.3328000009059906,"subfield":{"id":"https://openalex.org/subfields/3311","display_name":"Safety Research"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T10883","display_name":"Ethics and Social Impacts of AI","score":0.3328000009059906,"subfield":{"id":"https://openalex.org/subfields/3311","display_name":"Safety Research"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T10456","display_name":"Multi-Agent Systems and Negotiation","score":0.11420000344514847,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12026","display_name":"Explainable Artificial Intelligence (XAI)","score":0.054999999701976776,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/action","display_name":"Action (physics)","score":0.6769000291824341},{"id":"https://openalex.org/keywords/coding","display_name":"Coding (social sciences)","score":0.5149999856948853},{"id":"https://openalex.org/keywords/context","display_name":"Context (archaeology)","score":0.4932999908924103},{"id":"https://openalex.org/keywords/code","display_name":"Code (set theory)","score":0.3864000141620636},{"id":"https://openalex.org/keywords/source-code","display_name":"Source code","score":0.20640000700950623}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7114999890327454},{"id":"https://openalex.org/C2780791683","wikidata":"https://www.wikidata.org/wiki/Q846785","display_name":"Action (physics)","level":2,"score":0.6769000291824341},{"id":"https://openalex.org/C179518139","wikidata":"https://www.wikidata.org/wiki/Q5140297","display_name":"Coding (social sciences)","level":2,"score":0.5149999856948853},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.4932999908924103},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.41280001401901245},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.3864000141620636},{"id":"https://openalex.org/C38652104","wikidata":"https://www.wikidata.org/wiki/Q3510521","display_name":"Computer security","level":1,"score":0.3140000104904175},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.29899999499320984},{"id":"https://openalex.org/C43126263","wikidata":"https://www.wikidata.org/wiki/Q128751","display_name":"Source code","level":2,"score":0.20640000700950623},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.20340000092983246}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:doi:10.48550/arxiv.2603.04582","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},{"id":"doi:10.48550/arxiv.2603.04582","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.04582","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:doi:10.48550/arxiv.2603.04582","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/16","display_name":"Peace, Justice and strong institutions","score":0.6929814219474792}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Agentic":[0],"systems":[1],"increasingly":[2],"rely":[3],"on":[4,177,182],"language":[5],"models":[6],"to":[7,75,95,116,138,203],"monitor":[8,164],"their":[9,183],"own":[10,184],"behavior.":[11],"For":[12],"example,":[13],"coding":[14,107],"agents":[15],"may":[16],"self":[17],"critique":[18],"generated":[19,185],"code":[20],"for":[21],"pull":[22],"request":[23],"approval":[24],"or":[25,48,82,119],"assess":[26],"the":[27,41,50,59,70,86,97,133,140,159,163],"safety":[28],"of":[29,55,72],"tool-use":[30,109],"actions.":[31],"We":[32,65],"show":[33],"that":[34,113,158],"this":[35],"design":[36],"pattern":[37],"can":[38,189],"fail":[39,115],"when":[40,85,96,124,139],"action":[42,78,87,99,134,142,160],"is":[43,88,100,143],"presented":[44,57,149],"in":[45,49,61,131,145,150,199,208],"a":[46,62,73,127,146,151],"previous":[47,128],"same":[51,98,141],"assistant":[52,129],"turn":[53,130],"instead":[54],"being":[56],"by":[58,167],"user":[60,63,152],"turn.":[64,153],"define":[66],"self-attribution":[67,170],"bias":[68],"as":[69,79,91],"tendency":[71],"model":[74],"evaluate":[76],"an":[77],"more":[80,122,193],"correct":[81],"less":[83],"risky":[84],"implicitly":[89],"framed":[90],"its":[92],"own,":[93],"compared":[94,137],"evaluated":[101,144,176],"under":[102],"off-policy":[103],"attribution.":[104],"Across":[105],"four":[106],"and":[108],"datasets,":[110],"we":[111],"find":[112],"monitors":[114,173,191,207],"report":[117],"high-risk":[118],"low-correctness":[120],"actions":[121],"often":[123,175],"evaluation":[125],"follows":[126],"which":[132],"was":[135],"generated,":[136],"new":[147],"context":[148],"In":[154],"contrast,":[155],"explicitly":[156],"stating":[157],"comes":[161],"from":[162],"does":[165],"not":[166],"itself":[168],"induce":[169],"bias.":[171],"Because":[172],"are":[174,198],"fixed":[178],"examples":[179],"rather":[180],"than":[181,195],"actions,":[186],"these":[187],"evaluations":[188],"make":[190],"appear":[192],"reliable":[194],"they":[196],"actually":[197],"deployment,":[200],"leading":[201],"developers":[202],"unknowingly":[204],"deploy":[205],"inadequate":[206],"agentic":[209],"systems.":[210]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-03-07T00:00:00"}
