{"id":"https://openalex.org/W7131071232","doi":"https://doi.org/10.48550/arxiv.2602.18297","title":"Analyzing and Improving Chain-of-Thought Monitorability Through Information Theory","display_name":"Analyzing and Improving Chain-of-Thought Monitorability Through Information Theory","publication_year":2026,"publication_date":"2026-02-20","ids":{"openalex":"https://openalex.org/W7131071232","doi":"https://doi.org/10.48550/arxiv.2602.18297"},"language":null,"primary_location":{"id":"pmh:doi:10.48550/arxiv.2602.18297","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":null,"any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5126646931","display_name":"Usman Anwar","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Anwar, Usman","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126607129","display_name":"Tim Bakker","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bakker, Tim","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5065551082","display_name":"Dana Kianfar","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kianfar, Dana","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5052890236","display_name":"Cristina Pinneri","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Pinneri, Cristina","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5005095861","display_name":"Christos Louizos","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Louizos, Christos","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":5,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10743","display_name":"Software Testing and Debugging Techniques","score":0.2020999938249588,"subfield":{"id":"https://openalex.org/subfields/1712","display_name":"Software"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10743","display_name":"Software Testing and Debugging Techniques","score":0.2020999938249588,"subfield":{"id":"https://openalex.org/subfields/1712","display_name":"Software"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10260","display_name":"Software Engineering Research","score":0.16500000655651093,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10639","display_name":"Advanced Software Engineering Methodologies","score":0.0868000015616417,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/mutual-information","display_name":"Mutual information","score":0.6880999803543091},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.6114000082015991},{"id":"https://openalex.org/keywords/conditional-mutual-information","display_name":"Conditional mutual information","score":0.48249998688697815},{"id":"https://openalex.org/keywords/information-theory","display_name":"Information theory","score":0.45669999718666077},{"id":"https://openalex.org/keywords/code","display_name":"Code (set theory)","score":0.4043000042438507},{"id":"https://openalex.org/keywords/interaction-information","display_name":"Interaction information","score":0.3971000015735626},{"id":"https://openalex.org/keywords/source-code","display_name":"Source code","score":0.36329999566078186},{"id":"https://openalex.org/keywords/measure","display_name":"Measure (data warehouse)","score":0.36239999532699585}],"concepts":[{"id":"https://openalex.org/C152139883","wikidata":"https://www.wikidata.org/wiki/Q252973","display_name":"Mutual information","level":2,"score":0.6880999803543091},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6754999756813049},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.6114000082015991},{"id":"https://openalex.org/C124805900","wikidata":"https://www.wikidata.org/wiki/Q5159269","display_name":"Conditional mutual information","level":3,"score":0.48249998688697815},{"id":"https://openalex.org/C52622258","wikidata":"https://www.wikidata.org/wiki/Q131222","display_name":"Information theory","level":2,"score":0.45669999718666077},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.4246000051498413},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.41909998655319214},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.41359999775886536},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.4043000042438507},{"id":"https://openalex.org/C38764148","wikidata":"https://www.wikidata.org/wiki/Q17098245","display_name":"Interaction information","level":2,"score":0.3971000015735626},{"id":"https://openalex.org/C43126263","wikidata":"https://www.wikidata.org/wiki/Q128751","display_name":"Source code","level":2,"score":0.36329999566078186},{"id":"https://openalex.org/C2780009758","wikidata":"https://www.wikidata.org/wiki/Q6804172","display_name":"Measure (data warehouse)","level":2,"score":0.36239999532699585},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.3513000011444092},{"id":"https://openalex.org/C2983203078","wikidata":"https://www.wikidata.org/wiki/Q255166","display_name":"Information gain","level":2,"score":0.3325999975204468},{"id":"https://openalex.org/C152565575","wikidata":"https://www.wikidata.org/wiki/Q1124538","display_name":"Conditional random field","level":2,"score":0.328000009059906},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.319599986076355},{"id":"https://openalex.org/C81917197","wikidata":"https://www.wikidata.org/wiki/Q628760","display_name":"Selection (genetic algorithm)","level":2,"score":0.2944999933242798},{"id":"https://openalex.org/C137822555","wikidata":"https://www.wikidata.org/wiki/Q2587068","display_name":"Information sensitivity","level":2,"score":0.2842999994754791},{"id":"https://openalex.org/C87868495","wikidata":"https://www.wikidata.org/wiki/Q750843","display_name":"Information processing","level":2,"score":0.2720000147819519},{"id":"https://openalex.org/C3020402766","wikidata":"https://www.wikidata.org/wiki/Q104376712","display_name":"Prior information","level":2,"score":0.25769999623298645},{"id":"https://openalex.org/C180198813","wikidata":"https://www.wikidata.org/wiki/Q121182","display_name":"Information system","level":2,"score":0.2567000091075897},{"id":"https://openalex.org/C44492722","wikidata":"https://www.wikidata.org/wiki/Q327069","display_name":"Conditional probability","level":2,"score":0.2547999918460846}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:doi:10.48550/arxiv.2602.18297","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},{"id":"doi:10.48550/arxiv.2602.18297","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2602.18297","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:doi:10.48550/arxiv.2602.18297","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Chain-of-thought":[0],"(CoT)":[1],"monitors":[2,67],"are":[3],"LLM-based":[4],"systems":[5],"that":[6,35,60,106,129,138,149],"analyze":[7],"reasoning":[8],"traces":[9],"to":[10,33,76,94],"detect":[11],"when":[12,175,184],"outputs":[13,155],"may":[14,61],"exhibit":[15],"attributes":[16],"of":[17,57,65],"interest,":[18],"such":[19],"as":[20],"test-hacking":[21],"behavior":[22],"during":[23],"code":[24],"generation.":[25],"In":[26],"this":[27,118],"paper,":[28],"we":[29,120,162],"use":[30],"information-theoretic":[31],"analysis":[32],"show":[34,163],"non-zero":[36],"mutual":[37,152],"information":[38,70,83,153],"between":[39,154],"CoT":[40,51,66,107,172],"and":[41,87,142,156],"output":[42],"is":[43,188],"a":[44,144,178],"necessary":[45],"but":[46],"not":[47],"sufficient":[48],"condition":[49],"for":[50,135],"monitorability.":[52],"We":[53,103],"identify":[54],"two":[55,122],"sources":[56],"approximation":[58],"error":[59],"undermine":[62],"the":[63,74,78,82,92,96,99,132,185],"performance":[64],"in":[68,85],"practice:":[69],"gap,":[71],"which":[72,77,90,95],"measures":[73,91],"extent":[75,93],"monitor":[79,97,140,168],"can":[80,109],"extract":[81],"available":[84],"CoT,":[86],"elicitation":[88],"error,":[89],"approximates":[98],"optimal":[100],"monitoring":[101],"function.":[102],"further":[104],"demonstrate":[105],"monitorability":[108],"be":[110],"systematically":[111],"improved":[112],"through":[113],"targeted":[114],"training":[115,176],"objectives.":[116],"To":[117],"end,":[119],"propose":[121],"complementary":[123],"approaches:":[124],"(a)":[125],"an":[126],"oracle-based":[127],"method":[128],"directly":[130],"rewards":[131],"monitored":[133],"model":[134],"producing":[136],"CoTs":[137],"maximize":[139],"accuracy,":[141],"(b)":[143],"more":[145],"practical,":[146],"label-free":[147],"approach":[148],"maximizes":[150],"conditional":[151],"CoTs.":[157],"Across":[158],"multiple":[159],"different":[160],"environments,":[161],"both":[164],"methods":[165],"significantly":[166],"improve":[167],"accuracy":[169],"while":[170],"preventing":[171],"degeneration":[173],"even":[174],"against":[177],"monitor,":[179],"thereby":[180],"mitigating":[181],"reward":[182,187],"hacking":[183],"task":[186],"imperfectly":[189],"specified.":[190]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-02-24T00:00:00"}
