{"id":"https://openalex.org/W7150755458","doi":"https://doi.org/10.48550/arxiv.2604.02485","title":"Failing to Falsify: Evaluating and Mitigating Confirmation Bias in Language Models","display_name":"Failing to Falsify: Evaluating and Mitigating Confirmation Bias in Language Models","publication_year":2026,"publication_date":"2026-04-02","ids":{"openalex":"https://openalex.org/W7150755458","doi":"https://doi.org/10.48550/arxiv.2604.02485"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.02485","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.02485","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.02485","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5034717373","display_name":"AD Jhaveri","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Jhaveri, Ayush Rajesh","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5035393546","display_name":"Anthony GX-Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"GX-Chen, Anthony","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133018111","display_name":"Ilia Sucholutsky","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sucholutsky, Ilia","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5133063490","display_name":"Eunsol Choi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Choi, Eunsol","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5034717373"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.23989999294281006,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.23989999294281006,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12026","display_name":"Explainable Artificial Intelligence (XAI)","score":0.14810000360012054,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11636","display_name":"Artificial Intelligence in Healthcare and Education","score":0.1469999998807907,"subfield":{"id":"https://openalex.org/subfields/2718","display_name":"Health Informatics"},"field":{"id":"https://openalex.org/fields/27","display_name":"Medicine"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/confirmation-bias","display_name":"Confirmation bias","score":0.7466999888420105},{"id":"https://openalex.org/keywords/generalization","display_name":"Generalization","score":0.5878999829292297},{"id":"https://openalex.org/keywords/intervention","display_name":"Intervention (counseling)","score":0.5095000267028809},{"id":"https://openalex.org/keywords/psychological-intervention","display_name":"Psychological intervention","score":0.42829999327659607},{"id":"https://openalex.org/keywords/dynamic-inconsistency","display_name":"Dynamic inconsistency","score":0.40610000491142273},{"id":"https://openalex.org/keywords/motivated-reasoning","display_name":"Motivated reasoning","score":0.3321000039577484},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.28769999742507935}],"concepts":[{"id":"https://openalex.org/C79585631","wikidata":"https://www.wikidata.org/wiki/Q431498","display_name":"Confirmation bias","level":2,"score":0.7466999888420105},{"id":"https://openalex.org/C177148314","wikidata":"https://www.wikidata.org/wiki/Q170084","display_name":"Generalization","level":2,"score":0.5878999829292297},{"id":"https://openalex.org/C2780665704","wikidata":"https://www.wikidata.org/wiki/Q959298","display_name":"Intervention (counseling)","level":2,"score":0.5095000267028809},{"id":"https://openalex.org/C180747234","wikidata":"https://www.wikidata.org/wiki/Q23373","display_name":"Cognitive psychology","level":1,"score":0.504800021648407},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.5008000135421753},{"id":"https://openalex.org/C27415008","wikidata":"https://www.wikidata.org/wiki/Q7256382","display_name":"Psychological intervention","level":2,"score":0.42829999327659607},{"id":"https://openalex.org/C127729010","wikidata":"https://www.wikidata.org/wiki/Q60165","display_name":"Dynamic inconsistency","level":2,"score":0.40610000491142273},{"id":"https://openalex.org/C77805123","wikidata":"https://www.wikidata.org/wiki/Q161272","display_name":"Social psychology","level":1,"score":0.39469999074935913},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.36410000920295715},{"id":"https://openalex.org/C2776325391","wikidata":"https://www.wikidata.org/wiki/Q6917865","display_name":"Motivated reasoning","level":3,"score":0.3321000039577484},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.28769999742507935},{"id":"https://openalex.org/C18762648","wikidata":"https://www.wikidata.org/wiki/Q42213","display_name":"Work (physics)","level":2,"score":0.28459998965263367},{"id":"https://openalex.org/C2778112365","wikidata":"https://www.wikidata.org/wiki/Q3511065","display_name":"Sequence (biology)","level":2,"score":0.28060001134872437},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.2705000042915344},{"id":"https://openalex.org/C2775924081","wikidata":"https://www.wikidata.org/wiki/Q55608371","display_name":"Control (management)","level":2,"score":0.2678999900817871},{"id":"https://openalex.org/C2992700788","wikidata":"https://www.wikidata.org/wiki/Q8461","display_name":"Racial bias","level":3,"score":0.266400009393692},{"id":"https://openalex.org/C2780791683","wikidata":"https://www.wikidata.org/wiki/Q846785","display_name":"Action (physics)","level":2,"score":0.26570001244544983},{"id":"https://openalex.org/C2777212361","wikidata":"https://www.wikidata.org/wiki/Q5127848","display_name":"Class (philosophy)","level":2,"score":0.2651999890804291},{"id":"https://openalex.org/C2991991027","wikidata":"https://www.wikidata.org/wiki/Q6007314","display_name":"Implicit bias","level":2,"score":0.2508000135421753}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.02485","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.02485","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.02485","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.02485","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education","score":0.4205327033996582}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Confirmation":[0],"bias,":[1,88],"the":[2,30,66,72,111,121,171],"tendency":[3],"to":[4,92,99,104,123,149,167],"seek":[5],"evidence":[6],"that":[7,84,177,189],"supports":[8],"rather":[9,96],"than":[10,97],"challenges":[11],"one's":[12,15],"belief,":[13],"hinders":[14],"reasoning":[16],"ability.":[17],"We":[18,114,130],"examine":[19],"whether":[20,63],"large":[21],"language":[22],"models":[23],"(LLMs)":[24],"exhibit":[25,86],"confirmation":[26,87,139,156,178],"bias":[27,140,157,179],"by":[28,158],"adapting":[29],"rule-discovery":[31],"study":[32],"from":[33,147],"human":[34],"psychology:":[35],"given":[36],"a":[37,56,168,181],"sequence":[38],"of":[39,77,110,183],"three":[40],"numbers":[41],"(a":[42],"\"triple\"),":[43],"an":[44,48],"agent":[45,122],"engages":[46],"in":[47,141,185],"interactive":[49],"feedback":[50,61],"loop":[51],"where":[52],"it":[53,64,190],"(1)":[54],"proposes":[55],"new":[57,169],"triple,":[58],"(2)":[59],"receives":[60],"on":[62,151],"satisfies":[65],"hidden":[67,112],"rule,":[68],"and":[69,80,106,188],"(3)":[70],"guesses":[71],"rule.":[73,113],"Across":[74],"eleven":[75],"LLMs":[76,85,133,184],"multiple":[78],"families":[79],"scales,":[81],"we":[82,154],"find":[83,131],"often":[89],"proposing":[90],"triples":[91],"confirm":[93],"their":[94],"hypothesis":[95,186],"trying":[98],"falsify":[100],"it.":[101],"This":[102],"leads":[103],"slower":[105],"less":[107],"frequent":[108],"discovery":[109,145],"further":[115],"explore":[116],"intervention":[117],"strategies":[118],"(e.g.,":[119],"encouraging":[120],"consider":[124],"counter":[125],"examples)":[126],"developed":[127],"for":[128,198],"humans.":[129,199],"prompting":[132],"with":[134],"such":[135],"instruction":[136],"consistently":[137],"decreases":[138],"LLMs,":[142,163],"improving":[143],"rule":[144],"rates":[146],"42%":[148],"56%":[150],"average.":[152],"Lastly,":[153],"mitigate":[155],"distilling":[159],"intervention-induced":[160],"behavior":[161],"into":[162],"showing":[164],"promising":[165],"generalization":[166],"task,":[170],"Blicket":[172],"test.":[173],"Our":[174],"work":[175],"shows":[176],"is":[180],"limitation":[182],"exploration,":[187],"can":[191],"be":[192],"mitigated":[193],"via":[194],"injecting":[195],"interventions":[196],"designed":[197]},"counts_by_year":[],"updated_date":"2026-04-07T06:06:30.997549","created_date":"2026-04-07T00:00:00"}
