{"id":"https://openalex.org/W7138873576","doi":"https://doi.org/10.1609/aaai.v40i44.41123","title":"FindTheFlaws: Annotated Errors for Detecting Flawed Reasoning and Scalable Oversight Research","display_name":"FindTheFlaws: Annotated Errors for Detecting Flawed Reasoning and Scalable Oversight Research","publication_year":2026,"publication_date":"2026-03-14","ids":{"openalex":"https://openalex.org/W7138873576","doi":"https://doi.org/10.1609/aaai.v40i44.41123"},"language":null,"primary_location":{"id":"doi:10.1609/aaai.v40i44.41123","is_oa":true,"landing_page_url":"https://doi.org/10.1609/aaai.v40i44.41123","pdf_url":"https://ojs.aaai.org/index.php/AAAI/article/download/41123/45084","source":{"id":"https://openalex.org/S4210191458","display_name":"Proceedings of the AAAI Conference on Artificial Intelligence","issn_l":"2159-5399","issn":["2159-5399","2374-3468"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/P4310320058","host_organization_name":"Association for the Advancement of Artificial Intelligence","host_organization_lineage":["https://openalex.org/P4310320058"],"host_organization_lineage_names":["Association for the Advancement of Artificial Intelligence"],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the AAAI Conference on Artificial Intelligence","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"diamond","oa_url":"https://ojs.aaai.org/index.php/AAAI/article/download/41123/45084","any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5000771881","display_name":"Gabriel Recchia","orcid":"https://orcid.org/0000-0002-0210-8635"},"institutions":[{"id":"https://openalex.org/I150545927","display_name":"MODUL University Vienna","ror":"https://ror.org/04v2brz27","country_code":"AT","type":"education","lineage":["https://openalex.org/I150545927"]},{"id":"https://openalex.org/I3130844761","display_name":"MODUL University Dubai","ror":"https://ror.org/04rrhkc58","country_code":"AE","type":"education","lineage":["https://openalex.org/I150545927","https://openalex.org/I3130844761"]}],"countries":["AE","AT"],"is_corresponding":true,"raw_author_name":"Gabriel Recchia","raw_affiliation_strings":["Modulo Research"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Modulo Research","institution_ids":["https://openalex.org/I3130844761","https://openalex.org/I150545927"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130126453","display_name":"Chatrik Singh Mangat","orcid":null},"institutions":[{"id":"https://openalex.org/I4210159747","display_name":"Vector Oncology (United States)","ror":"https://ror.org/05mkxvs25","country_code":"US","type":"company","lineage":["https://openalex.org/I4210159747"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Chatrik Singh Mangat","raw_affiliation_strings":["Vector Research"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Vector Research","institution_ids":["https://openalex.org/I4210159747"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5013682899","display_name":"Issac Yuk-fai Li","orcid":null},"institutions":[{"id":"https://openalex.org/I20089843","display_name":"Princeton University","ror":"https://ror.org/00hx57361","country_code":"US","type":"education","lineage":["https://openalex.org/I20089843"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Issac Li","raw_affiliation_strings":["Princeton University"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Princeton University","institution_ids":["https://openalex.org/I20089843"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5113971453","display_name":"G Krishnakumar","orcid":null},"institutions":[{"id":"https://openalex.org/I4210161256","display_name":"Impact","ror":"https://ror.org/05d9dsr70","country_code":"CA","type":"nonprofit","lineage":["https://openalex.org/I4210161256"]}],"countries":["CA"],"is_corresponding":false,"raw_author_name":"Gayatri Krishnakumar","raw_affiliation_strings":["Impact Academy"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Impact Academy","institution_ids":["https://openalex.org/I4210161256"]}]}],"institutions":[],"countries_distinct_count":4,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5000771881"],"corresponding_institution_ids":["https://openalex.org/I150545927","https://openalex.org/I3130844761"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":{"value":0.68999507,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"40","issue":"44","first_page":"37867","last_page":"37876"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12026","display_name":"Explainable Artificial Intelligence (XAI)","score":0.4255000054836273,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12026","display_name":"Explainable Artificial Intelligence (XAI)","score":0.4255000054836273,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11636","display_name":"Artificial Intelligence in Healthcare and Education","score":0.2614000141620636,"subfield":{"id":"https://openalex.org/subfields/2718","display_name":"Health Informatics"},"field":{"id":"https://openalex.org/fields/27","display_name":"Medicine"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}},{"id":"https://openalex.org/T10883","display_name":"Ethics and Social Impacts of AI","score":0.07010000199079514,"subfield":{"id":"https://openalex.org/subfields/3311","display_name":"Safety Research"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.8057000041007996},{"id":"https://openalex.org/keywords/correctness","display_name":"Correctness","score":0.7907000184059143},{"id":"https://openalex.org/keywords/verifiable-secret-sharing","display_name":"Verifiable secret sharing","score":0.7634000182151794},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.3955000042915344},{"id":"https://openalex.org/keywords/range","display_name":"Range (aeronautics)","score":0.38749998807907104}],"concepts":[{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.8057000041007996},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8048999905586243},{"id":"https://openalex.org/C55439883","wikidata":"https://www.wikidata.org/wiki/Q360812","display_name":"Correctness","level":2,"score":0.7907000184059143},{"id":"https://openalex.org/C85847156","wikidata":"https://www.wikidata.org/wiki/Q59015987","display_name":"Verifiable secret sharing","level":3,"score":0.7634000182151794},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.4636000096797943},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.39899998903274536},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.3955000042915344},{"id":"https://openalex.org/C204323151","wikidata":"https://www.wikidata.org/wiki/Q905424","display_name":"Range (aeronautics)","level":2,"score":0.38749998807907104},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.38100001215934753},{"id":"https://openalex.org/C2781067378","wikidata":"https://www.wikidata.org/wiki/Q17027399","display_name":"Interpretability","level":2,"score":0.3449999988079071},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.3361999988555908},{"id":"https://openalex.org/C2778755073","wikidata":"https://www.wikidata.org/wiki/Q10858537","display_name":"Scale (ratio)","level":2,"score":0.2793999910354614},{"id":"https://openalex.org/C115903868","wikidata":"https://www.wikidata.org/wiki/Q80993","display_name":"Software engineering","level":1,"score":0.25440001487731934}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1609/aaai.v40i44.41123","is_oa":true,"landing_page_url":"https://doi.org/10.1609/aaai.v40i44.41123","pdf_url":"https://ojs.aaai.org/index.php/AAAI/article/download/41123/45084","source":{"id":"https://openalex.org/S4210191458","display_name":"Proceedings of the AAAI Conference on Artificial Intelligence","issn_l":"2159-5399","issn":["2159-5399","2374-3468"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/P4310320058","host_organization_name":"Association for the Advancement of Artificial Intelligence","host_organization_lineage":["https://openalex.org/P4310320058"],"host_organization_lineage_names":["Association for the Advancement of Artificial Intelligence"],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the AAAI Conference on Artificial Intelligence","raw_type":"journal-article"}],"best_oa_location":{"id":"doi:10.1609/aaai.v40i44.41123","is_oa":true,"landing_page_url":"https://doi.org/10.1609/aaai.v40i44.41123","pdf_url":"https://ojs.aaai.org/index.php/AAAI/article/download/41123/45084","source":{"id":"https://openalex.org/S4210191458","display_name":"Proceedings of the AAAI Conference on Artificial Intelligence","issn_l":"2159-5399","issn":["2159-5399","2374-3468"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/P4310320058","host_organization_name":"Association for the Advancement of Artificial Intelligence","host_organization_lineage":["https://openalex.org/P4310320058"],"host_organization_lineage_names":["Association for the Advancement of Artificial Intelligence"],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the AAAI Conference on Artificial Intelligence","raw_type":"journal-article"},"sustainable_development_goals":[{"score":0.6875044703483582,"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education"}],"awards":[],"funders":[],"has_content":{"pdf":true,"grobid_xml":true},"content_urls":{"pdf":"https://content.openalex.org/works/W7138873576.pdf","grobid_xml":"https://content.openalex.org/works/W7138873576.grobid-xml"},"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"As":[0],"AI":[1,24],"models":[2,45,170],"tackle":[3],"increasingly":[4],"complex":[5],"problems,":[6],"ensuring":[7],"reliable":[8],"human":[9],"oversight":[10,168],"becomes":[11],"more":[12,172,182],"challenging":[13],"due":[14],"to":[15,22,36,81],"the":[16,74,127,148],"difficulty":[17],"of":[18,73,76,117,160],"verifying":[19],"solutions.":[20],"Approaches":[21],"scaling":[23],"supervision":[25],"include":[26,88],"debate,":[27],"in":[28,33,43,49,55,147],"which":[29,44,56],"two":[30],"agents":[31],"engage":[32],"structured":[34],"dialogue":[35],"help":[37],"a":[38,57,68,115,158],"judge":[39],"evaluate":[40,151],"claims;":[41],"critique,":[42],"identify":[46],"potential":[47],"flaws":[48],"proposed":[50],"solutions;":[51],"and":[52,78,94,126,134,156],"prover-verifier":[53],"games,":[54],"capable":[58,70,183],"'prover'":[59],"model":[60],"generates":[61],"solutions":[62,93,98,136],"that":[63,87,162],"must":[64],"be":[65,164],"verifiable":[66],"by":[67],"less":[69],"'verifier'.":[71],"Evaluations":[72],"scalability":[75],"these":[77],"similar":[79],"approaches":[80],"difficult":[82],"problems":[83],"benefit":[84],"from":[85],"datasets":[86,120,176],"(1)":[89],"long-form":[90,96,135],"expert-verified":[91],"correct":[92],"(2)":[95],"flawed":[97],"with":[99,137],"annotations":[100,139],"highlighting":[101],"specific":[102,145],"errors,":[103],"but":[104],"few":[105],"are":[106],"available.":[107],"To":[108],"address":[109],"this":[110],"gap,":[111],"we":[112],"present":[113],"FindTheFlaws,":[114],"group":[116],"five":[118],"diverse":[119],"spanning":[121],"medicine,":[122],"mathematics,":[123],"science,":[124],"coding,":[125],"Lojban":[128],"language.":[129],"Each":[130],"dataset":[131],"contains":[132],"questions":[133],"expert":[138],"validating":[140],"their":[141],"correctness":[142],"or":[143],"identifying":[144],"error(s)":[146],"reasoning.":[149],"We":[150],"frontier":[152],"models'":[153],"critiquing":[154],"capabilities":[155],"observe":[157],"range":[159],"performance":[161],"can":[163,177],"leveraged":[165],"for":[166,181],"scalable":[167],"experiments:":[169],"performing":[171],"poorly":[173],"on":[174],"particular":[175],"serve":[178],"as":[179],"judges/verifiers":[180],"models.":[184]},"counts_by_year":[],"updated_date":"2026-05-05T08:41:31.759640","created_date":"2026-03-20T00:00:00"}
