{"id":"https://openalex.org/W4417094175","doi":"https://doi.org/10.48550/arxiv.2505.11189","title":"Can Global XAI Methods Reveal Injected Bias in LLMs? SHAP vs Rule Extraction vs RuleSHAP","display_name":"Can Global XAI Methods Reveal Injected Bias in LLMs? SHAP vs Rule Extraction vs RuleSHAP","publication_year":2025,"publication_date":"2025-05-16","ids":{"openalex":"https://openalex.org/W4417094175","doi":"https://doi.org/10.48550/arxiv.2505.11189"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2505.11189","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2505.11189","pdf_url":"https://arxiv.org/pdf/2505.11189","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2505.11189","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5056310041","display_name":"Francesco Sovrano","orcid":"https://orcid.org/0000-0002-6285-1041"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Sovrano, Francesco","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":1,"corresponding_author_ids":["https://openalex.org/A5056310041"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12026","display_name":"Explainable Artificial Intelligence (XAI)","score":0.5070000290870667,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12026","display_name":"Explainable Artificial Intelligence (XAI)","score":0.5070000290870667,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11147","display_name":"Misinformation and Its Impacts","score":0.1670999974012375,"subfield":{"id":"https://openalex.org/subfields/3312","display_name":"Sociology and Political Science"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T11636","display_name":"Artificial Intelligence in Healthcare and Education","score":0.04820000007748604,"subfield":{"id":"https://openalex.org/subfields/2718","display_name":"Health Informatics"},"field":{"id":"https://openalex.org/fields/27","display_name":"Medicine"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/heuristics","display_name":"Heuristics","score":0.8644999861717224},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.5343000292778015},{"id":"https://openalex.org/keywords/obstacle","display_name":"Obstacle","score":0.5216000080108643},{"id":"https://openalex.org/keywords/default","display_name":"Default","score":0.4767000079154968},{"id":"https://openalex.org/keywords/yield","display_name":"Yield (engineering)","score":0.33980000019073486},{"id":"https://openalex.org/keywords/class","display_name":"Class (philosophy)","score":0.3239000141620636}],"concepts":[{"id":"https://openalex.org/C127705205","wikidata":"https://www.wikidata.org/wiki/Q5748245","display_name":"Heuristics","level":2,"score":0.8644999861717224},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.5343000292778015},{"id":"https://openalex.org/C2776650193","wikidata":"https://www.wikidata.org/wiki/Q264661","display_name":"Obstacle","level":2,"score":0.5216000080108643},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.515999972820282},{"id":"https://openalex.org/C69637215","wikidata":"https://www.wikidata.org/wiki/Q702362","display_name":"Default","level":2,"score":0.4767000079154968},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.37950000166893005},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.35749998688697815},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.3422999978065491},{"id":"https://openalex.org/C134121241","wikidata":"https://www.wikidata.org/wiki/Q899301","display_name":"Yield (engineering)","level":2,"score":0.33980000019073486},{"id":"https://openalex.org/C2777212361","wikidata":"https://www.wikidata.org/wiki/Q5127848","display_name":"Class (philosophy)","level":2,"score":0.3239000141620636},{"id":"https://openalex.org/C2776990098","wikidata":"https://www.wikidata.org/wiki/Q13579947","display_name":"Misinformation","level":2,"score":0.3190999925136566},{"id":"https://openalex.org/C164752517","wikidata":"https://www.wikidata.org/wiki/Q5570875","display_name":"Global optimization","level":2,"score":0.30630001425743103},{"id":"https://openalex.org/C189430467","wikidata":"https://www.wikidata.org/wiki/Q7293293","display_name":"Ranking (information retrieval)","level":2,"score":0.3061000108718872},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.2985999882221222},{"id":"https://openalex.org/C163258240","wikidata":"https://www.wikidata.org/wiki/Q25342","display_name":"Power (physics)","level":2,"score":0.2827000021934509},{"id":"https://openalex.org/C149782125","wikidata":"https://www.wikidata.org/wiki/Q160039","display_name":"Econometrics","level":1,"score":0.28209999203681946},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.2768000066280365},{"id":"https://openalex.org/C164226766","wikidata":"https://www.wikidata.org/wiki/Q7293202","display_name":"Rank (graph theory)","level":2,"score":0.27570000290870667},{"id":"https://openalex.org/C66746571","wikidata":"https://www.wikidata.org/wiki/Q1134833","display_name":"ENCODE","level":3,"score":0.2563999891281128}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2505.11189","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2505.11189","pdf_url":"https://arxiv.org/pdf/2505.11189","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"doi:10.48550/arxiv.2505.11189","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2505.11189","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2505.11189","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2505.11189","pdf_url":"https://arxiv.org/pdf/2505.11189","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Large":[0],"language":[1],"models":[2],"(LLMs)":[3],"can":[4,51,59],"amplify":[5],"misinformation,":[6],"undermining":[7],"societal":[8],"goals":[9],"like":[10],"the":[11],"UN":[12],"SDGs.":[13],"We":[14,92],"study":[15],"three":[16],"documented":[17],"drivers":[18],"of":[19,55,130],"misinformation":[20],"(valence":[21],"framing,":[22],"information":[23],"overload,":[24],"and":[25,50,100,140],"oversimplification)":[26],"which":[27],"are":[28,85],"often":[29],"shaped":[30],"by":[31,95,195],"one's":[32],"default":[33],"beliefs.":[34],"Building":[35],"on":[36,198],"evidence":[37],"that":[38,77,149,177],"LLMs":[39,69,138],"encode":[40],"such":[41],"defaults":[42],"(e.g.,":[43],"\"joy":[44],"is":[45,48,76],"positive,\"":[46],"\"math":[47],"complex\")":[49],"act":[52],"as":[53,70],"\"bags":[54],"heuristics,\"":[56],"we":[57,125,147,171],"ask:":[58],"general":[60],"belief-driven":[61,208],"heuristics":[62,118,129,191],"behind":[63],"misinformative":[64],"behaviour":[65],"be":[66],"recovered":[67],"from":[68],"clear":[71],"rules?":[72],"A":[73],"key":[74],"obstacle":[75],"global":[78,97,113,155,179],"rule-extraction":[79,175],"methods":[80],"in":[81,119,210],"explainable":[82],"AI":[83],"(XAI)":[84],"built":[86],"for":[87,206],"numerical":[88,104],"inputs/outputs,":[89],"not":[90,163],"text.":[91],"address":[93],"this":[94,169],"eliciting":[96],"LLM":[98],"beliefs":[99],"mapping":[101],"them":[102],"to":[103,115,185],"scores":[105],"via":[106,142],"statistically":[107],"reliable":[108],"abstractions,":[109],"thereby":[110],"enabling":[111],"off-the-shelf":[112],"XAI":[114],"detect":[116],"belief-related":[117],"LLMs.":[120,211],"To":[121,167],"obtain":[122],"ground":[123],"truth,":[124],"hard-code":[126],"bias-inducing":[127],"nonlinear":[128],"increasing":[131],"complexity":[132],"(univariate,":[133],"conjunctive,":[134],"nonconvex)":[135],"into":[136],"popular":[137],"(ChatGPT":[139],"Llama)":[141],"system":[143],"instructions.":[144],"This":[145],"way,":[146],"find":[148],"RuleFit":[150,194],"under-detects":[151],"non-univariate":[152,188],"biases,":[153],"while":[154],"SHAP":[156],"better":[157,186],"approximates":[158],"conjunctive":[159],"ones":[160],"but":[161],"does":[162],"yield":[164],"actionable":[165],"rules.":[166],"bridge":[168],"gap,":[170],"propose":[172],"RuleSHAP,":[173],"a":[174,203],"algorithm":[176],"couples":[178],"SHAP-value":[180],"aggregations":[181],"with":[182],"rule":[183],"induction":[184],"capture":[187],"bias,":[189],"improving":[190],"detection":[192],"over":[193],"+94%":[196],"(MRR@1)":[197],"average.":[199],"Our":[200],"results":[201],"provide":[202],"practical":[204],"pathway":[205],"revealing":[207],"biases":[209]},"counts_by_year":[],"updated_date":"2026-03-07T16:01:11.037858","created_date":"2025-10-10T00:00:00"}
