{"id":"https://openalex.org/W7118414216","doi":"https://doi.org/10.1109/access.2026.3651579","title":"On the Limits of LLM Reasoning: Evidence From Contamination, Translation, and Answer Modification in Multiple-Choice Benchmarks","display_name":"On the Limits of LLM Reasoning: Evidence From Contamination, Translation, and Answer Modification in Multiple-Choice Benchmarks","publication_year":2026,"publication_date":"2026-01-01","ids":{"openalex":"https://openalex.org/W7118414216","doi":"https://doi.org/10.1109/access.2026.3651579"},"language":null,"primary_location":{"id":"doi:10.1109/access.2026.3651579","is_oa":true,"landing_page_url":"https://doi.org/10.1109/access.2026.3651579","pdf_url":null,"source":{"id":"https://openalex.org/S2485537415","display_name":"IEEE Access","issn_l":"2169-3536","issn":["2169-3536"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Access","raw_type":"journal-article"},"type":"article","indexed_in":["crossref","doaj"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://doi.org/10.1109/access.2026.3651579","any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5077697330","display_name":"Eva S\u00e1nchez Salido","orcid":null},"institutions":[{"id":"https://openalex.org/I178450904","display_name":"Universidad Nacional de Educaci\u00f3n a Distancia","ror":"https://ror.org/02msb5n36","country_code":"ES","type":"education","lineage":["https://openalex.org/I178450904"]}],"countries":["ES"],"is_corresponding":true,"raw_author_name":"Eva S\u00e1nchez Salido","raw_affiliation_strings":["Research Group in NLP &#x0026; IR, National Distance Education University, Madrid, Spain"],"affiliations":[{"raw_affiliation_string":"Research Group in NLP &#x0026; IR, National Distance Education University, Madrid, Spain","institution_ids":["https://openalex.org/I178450904"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5111768991","display_name":"Julio A. Gonzalo","orcid":null},"institutions":[{"id":"https://openalex.org/I178450904","display_name":"Universidad Nacional de Educaci\u00f3n a Distancia","ror":"https://ror.org/02msb5n36","country_code":"ES","type":"education","lineage":["https://openalex.org/I178450904"]}],"countries":["ES"],"is_corresponding":false,"raw_author_name":"Julio Gonzalo","raw_affiliation_strings":["Research Group in NLP &#x0026; IR, National Distance Education University, Madrid, Spain"],"affiliations":[{"raw_affiliation_string":"Research Group in NLP &#x0026; IR, National Distance Education University, Madrid, Spain","institution_ids":["https://openalex.org/I178450904"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5018641291","display_name":"Guillermo Marco","orcid":null},"institutions":[{"id":"https://openalex.org/I178450904","display_name":"Universidad Nacional de Educaci\u00f3n a Distancia","ror":"https://ror.org/02msb5n36","country_code":"ES","type":"education","lineage":["https://openalex.org/I178450904"]}],"countries":["ES"],"is_corresponding":false,"raw_author_name":"Guillermo Marco","raw_affiliation_strings":["Research Group in NLP &#x0026; IR, National Distance Education University, Madrid, Spain"],"affiliations":[{"raw_affiliation_string":"Research Group in NLP &#x0026; IR, National Distance Education University, Madrid, Spain","institution_ids":["https://openalex.org/I178450904"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5077697330"],"corresponding_institution_ids":["https://openalex.org/I178450904"],"apc_list":{"value":1850,"currency":"USD","value_usd":1850},"apc_paid":{"value":1850,"currency":"USD","value_usd":1850},"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.06745955,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"14","issue":null,"first_page":"9384","last_page":"9393"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.8345999717712402,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.8345999717712402,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.04129999876022339,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11902","display_name":"Intelligent Tutoring Systems and Adaptive Learning","score":0.013799999840557575,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.8456000089645386},{"id":"https://openalex.org/keywords/conflation","display_name":"Conflation","score":0.8162999749183655},{"id":"https://openalex.org/keywords/baseline","display_name":"Baseline (sea)","score":0.5649999976158142},{"id":"https://openalex.org/keywords/stability","display_name":"Stability (learning theory)","score":0.4738999903202057},{"id":"https://openalex.org/keywords/test","display_name":"Test (biology)","score":0.41600000858306885},{"id":"https://openalex.org/keywords/training-set","display_name":"Training set","score":0.3212999999523163}],"concepts":[{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.8456000089645386},{"id":"https://openalex.org/C130440534","wikidata":"https://www.wikidata.org/wiki/Q14946528","display_name":"Conflation","level":2,"score":0.8162999749183655},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6682000160217285},{"id":"https://openalex.org/C12725497","wikidata":"https://www.wikidata.org/wiki/Q810247","display_name":"Baseline (sea)","level":2,"score":0.5649999976158142},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.511900007724762},{"id":"https://openalex.org/C112972136","wikidata":"https://www.wikidata.org/wiki/Q7595718","display_name":"Stability (learning theory)","level":2,"score":0.4738999903202057},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4650000035762787},{"id":"https://openalex.org/C2777267654","wikidata":"https://www.wikidata.org/wiki/Q3519023","display_name":"Test (biology)","level":2,"score":0.41600000858306885},{"id":"https://openalex.org/C149782125","wikidata":"https://www.wikidata.org/wiki/Q160039","display_name":"Econometrics","level":1,"score":0.32829999923706055},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.3212999999523163},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.2924000024795532},{"id":"https://openalex.org/C138268822","wikidata":"https://www.wikidata.org/wiki/Q1051925","display_name":"Resolution (logic)","level":2,"score":0.28110000491142273},{"id":"https://openalex.org/C158600405","wikidata":"https://www.wikidata.org/wiki/Q5054566","display_name":"Causal inference","level":2,"score":0.2775999903678894},{"id":"https://openalex.org/C149364088","wikidata":"https://www.wikidata.org/wiki/Q185917","display_name":"Translation (biology)","level":4,"score":0.2741999924182892},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.2678000032901764},{"id":"https://openalex.org/C120936955","wikidata":"https://www.wikidata.org/wiki/Q2155640","display_name":"Empirical research","level":2,"score":0.2567000091075897}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/access.2026.3651579","is_oa":true,"landing_page_url":"https://doi.org/10.1109/access.2026.3651579","pdf_url":null,"source":{"id":"https://openalex.org/S2485537415","display_name":"IEEE Access","issn_l":"2169-3536","issn":["2169-3536"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Access","raw_type":"journal-article"}],"best_oa_location":{"id":"doi:10.1109/access.2026.3651579","is_oa":true,"landing_page_url":"https://doi.org/10.1109/access.2026.3651579","pdf_url":null,"source":{"id":"https://openalex.org/S2485537415","display_name":"IEEE Access","issn_l":"2169-3536","issn":["2169-3536"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Access","raw_type":"journal-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":25,"referenced_works":["https://openalex.org/W2996556191","https://openalex.org/W3016970897","https://openalex.org/W3171654528","https://openalex.org/W4226040778","https://openalex.org/W4294214983","https://openalex.org/W4385571689","https://openalex.org/W4385572001","https://openalex.org/W4385573636","https://openalex.org/W4387782510","https://openalex.org/W4389518953","https://openalex.org/W4389523725","https://openalex.org/W4396762144","https://openalex.org/W4400681340","https://openalex.org/W4402670101","https://openalex.org/W4402671287","https://openalex.org/W4403122979","https://openalex.org/W4404781632","https://openalex.org/W4406892414","https://openalex.org/W4409131149","https://openalex.org/W4412581454","https://openalex.org/W4412886673","https://openalex.org/W4412886797","https://openalex.org/W4412887729","https://openalex.org/W4412889529","https://openalex.org/W4412944731"],"related_works":[],"abstract_inverted_index":{"Multiple-choice":[0],"benchmarks":[1],"are":[2,31,114],"widely":[3],"used":[4],"to":[5,35,64,144,149,182],"assess":[6],"LLMs,":[7],"yet":[8],"their":[9],"accuracy":[10,109,179,188],"scores":[11],"often":[12],"conflate":[13],"memorization\u2014understood":[14],"as":[15,204],"pattern-based":[16],"recall\u2014with":[17],"genuine":[18],"reasoning,":[19],"that":[20,75,158],"is,":[21],"inference":[22,101,172],"beyond":[23],"surface":[24],"pattern":[25],"transfer,":[26],"especially":[27],"when":[28,171],"test":[29],"sets":[30],"public":[32,49,122],"and":[33,70,90,106,131,136,141,165,177],"prone":[34],"contamination.":[36],"To":[37],"disentangle":[38],"these":[39],"effects,":[40],"we":[41],"evaluate":[42],"models":[43,212],"under":[44,111,217],"three":[45],"experimental":[46],"conditions:":[47],"(i)":[48],"(MMLU)":[50],"vs.":[51,57,127],"private":[52],"(UNED-Access)":[53],"data;":[54],"(ii)":[55],"original":[56],"professionally":[58],"translated":[59],"questions":[60],"(English/Spanish;":[61],"less":[62],"likely":[63],"appear":[65,143],"verbatim":[66],"in":[67,185],"training":[68,198],"data);":[69],"(iii)":[71],"an":[72],"answer":[73,112],"modification":[74,113],"replaces":[76],"the":[77,83,87,121,150,153,205,218],"correct":[78],"option":[79],"with":[80,117,210],"\u2018\u2018None":[81,151],"of":[82,152,194,208],"other":[84,154],"answers\u2019\u2019\u2014which":[85],"becomes":[86,189],"right":[88],"choice":[89],"dissociates":[91],"success":[92],"from":[93],"previously":[94],"seen":[95],"tokens":[96],"or":[97],"concepts,":[98],"requiring":[99],"implicit":[100],"steps.":[102],"Across":[103],"16":[104],"proprietary":[105],"open-weights":[107],"models,":[108],"drops":[110],"substantial":[115],"(10%\u201393%),":[116],"larger":[118],"declines":[119],"on":[120,125,129],"dataset":[123],"(56%":[124],"MMLU":[126],"51%":[128],"UNED-Access)":[130],"minimal":[132],"differences":[133],"between":[134],"originals":[135],"translations.":[137],"Taken":[138],"together,":[139],"contamination":[140],"translation":[142],"be":[145],"second-order":[146],"factors":[147],"compared":[148],"answers\u2019\u2019":[155],"condition,":[156],"suggesting":[157],"current":[159],"LLMs":[160],"generalize":[161],"well":[162],"across":[163],"datasets":[164],"languages":[166],"but":[167],"show":[168],"marked":[169],"limitations":[170],"is":[173],"required.":[174],"Model":[175],"size":[176],"baseline":[178],"prove":[180],"insufficient":[181],"predict":[183],"robustness\u2014although":[184],"low-contamination":[186],"settings,":[187],"a":[190],"more":[191],"reliable":[192],"indicator":[193],"inference-based":[195],"behavior.":[196],"Instead,":[197],"strategies":[199],"explicitly":[200],"targeting":[201],"reasoning":[202],"emerge":[203],"primary":[206],"drivers":[207],"robustness,":[209],"reasoning-oriented":[211],"consistently":[213],"showing":[214],"greater":[215],"stability":[216],"NOTO":[219],"substitution.":[220]},"counts_by_year":[],"updated_date":"2026-02-23T20:09:44.859080","created_date":"2026-01-08T00:00:00"}
