{"id":"https://openalex.org/W7151562980","doi":"https://doi.org/10.48550/arxiv.2604.03858","title":"A Bayesian Information-Theoretic Approach to Data Attribution","display_name":"A Bayesian Information-Theoretic Approach to Data Attribution","publication_year":2026,"publication_date":"2026-04-04","ids":{"openalex":"https://openalex.org/W7151562980","doi":"https://doi.org/10.48550/arxiv.2604.03858"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.03858","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.03858","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Preprint"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.03858","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5133110301","display_name":"Dharmesh Tailor","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tailor, Dharmesh","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5047511363","display_name":"Nicol\u00f2 Felicioni","orcid":"https://orcid.org/0000-0002-3555-7760"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Felicioni, Nicol\u00f2","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5043239539","display_name":"Kamil Ciosek","orcid":"https://orcid.org/0000-0002-0238-9393"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ciosek, Kamil","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":0,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12814","display_name":"Gaussian Processes and Bayesian Inference","score":0.6190000176429749,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12814","display_name":"Gaussian Processes and Bayesian Inference","score":0.6190000176429749,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12026","display_name":"Explainable Artificial Intelligence (XAI)","score":0.05829999968409538,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11303","display_name":"Bayesian Modeling and Causal Inference","score":0.05130000039935112,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/interpretability","display_name":"Interpretability","score":0.8054999709129333},{"id":"https://openalex.org/keywords/bayesian-probability","display_name":"Bayesian probability","score":0.5493999719619751},{"id":"https://openalex.org/keywords/attribution","display_name":"Attribution","score":0.5412999987602234},{"id":"https://openalex.org/keywords/counterfactual-thinking","display_name":"Counterfactual thinking","score":0.4957999885082245},{"id":"https://openalex.org/keywords/gaussian-process","display_name":"Gaussian process","score":0.48080000281333923},{"id":"https://openalex.org/keywords/representativeness-heuristic","display_name":"Representativeness heuristic","score":0.384799987077713},{"id":"https://openalex.org/keywords/entropy","display_name":"Entropy (arrow of time)","score":0.37929999828338623},{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.3709000051021576}],"concepts":[{"id":"https://openalex.org/C2781067378","wikidata":"https://www.wikidata.org/wiki/Q17027399","display_name":"Interpretability","level":2,"score":0.8054999709129333},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5911999940872192},{"id":"https://openalex.org/C107673813","wikidata":"https://www.wikidata.org/wiki/Q812534","display_name":"Bayesian probability","level":2,"score":0.5493999719619751},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.5453000068664551},{"id":"https://openalex.org/C143299363","wikidata":"https://www.wikidata.org/wiki/Q900584","display_name":"Attribution","level":2,"score":0.5412999987602234},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5163000226020813},{"id":"https://openalex.org/C108650721","wikidata":"https://www.wikidata.org/wiki/Q1783253","display_name":"Counterfactual thinking","level":2,"score":0.4957999885082245},{"id":"https://openalex.org/C61326573","wikidata":"https://www.wikidata.org/wiki/Q1496376","display_name":"Gaussian process","level":3,"score":0.48080000281333923},{"id":"https://openalex.org/C37381756","wikidata":"https://www.wikidata.org/wiki/Q20203288","display_name":"Representativeness heuristic","level":2,"score":0.384799987077713},{"id":"https://openalex.org/C106301342","wikidata":"https://www.wikidata.org/wiki/Q4117933","display_name":"Entropy (arrow of time)","level":2,"score":0.37929999828338623},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.3709000051021576},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.36640000343322754},{"id":"https://openalex.org/C174348530","wikidata":"https://www.wikidata.org/wiki/Q188635","display_name":"Bridging (networking)","level":2,"score":0.35339999198913574},{"id":"https://openalex.org/C149782125","wikidata":"https://www.wikidata.org/wiki/Q160039","display_name":"Econometrics","level":1,"score":0.35109999775886536},{"id":"https://openalex.org/C49937458","wikidata":"https://www.wikidata.org/wiki/Q2599292","display_name":"Probabilistic logic","level":2,"score":0.3427000045776367},{"id":"https://openalex.org/C2778755073","wikidata":"https://www.wikidata.org/wiki/Q10858537","display_name":"Scale (ratio)","level":2,"score":0.3319000005722046},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.32600000500679016},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.3167000114917755},{"id":"https://openalex.org/C97931131","wikidata":"https://www.wikidata.org/wiki/Q5282087","display_name":"Discriminative model","level":2,"score":0.30970001220703125},{"id":"https://openalex.org/C196083921","wikidata":"https://www.wikidata.org/wiki/Q7915758","display_name":"Variance (accounting)","level":2,"score":0.287200003862381},{"id":"https://openalex.org/C12267149","wikidata":"https://www.wikidata.org/wiki/Q282453","display_name":"Support vector machine","level":2,"score":0.28679999709129333},{"id":"https://openalex.org/C160234255","wikidata":"https://www.wikidata.org/wiki/Q812535","display_name":"Bayesian inference","level":3,"score":0.2750999927520752},{"id":"https://openalex.org/C170858558","wikidata":"https://www.wikidata.org/wiki/Q1394144","display_name":"Automatic summarization","level":2,"score":0.26089999079704285},{"id":"https://openalex.org/C163716315","wikidata":"https://www.wikidata.org/wiki/Q901177","display_name":"Gaussian","level":2,"score":0.25760000944137573}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.03858","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.03858","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"Preprint"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.03858","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.03858","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Preprint"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Training":[0],"Data":[1],"Attribution":[2],"(TDA)":[3],"seeks":[4],"to":[5,10,58,96,128],"trace":[6],"model":[7],"predictions":[8],"back":[9],"influential":[11],"training":[12],"examples,":[13],"enhancing":[14],"interpretability":[15],"and":[16,100,120],"safety.":[17],"We":[18,74],"formulate":[19],"TDA":[20],"as":[21],"a":[22,40,66,102],"Bayesian":[23],"information-theoretic":[24],"problem:":[25],"subsets":[26],"are":[27],"scored":[28],"by":[29],"the":[30,36],"information":[31,63],"loss":[32,64],"they":[33],"induce":[34],"-":[35],"entropy":[37],"increase":[38],"at":[39],"query":[41],"when":[42],"removed.":[43],"This":[44],"criterion":[45],"credits":[46],"examples":[47],"for":[48,82,88,105],"resolving":[49],"predictive":[50],"uncertainty":[51],"rather":[52],"than":[53],"label":[54],"noise.":[55],"To":[56],"scale":[57],"modern":[59,129],"networks,":[60],"we":[61,94],"approximate":[62],"using":[65],"Gaussian":[67],"Process":[68],"surrogate":[69],"built":[70],"from":[71],"tangent":[72],"features.":[73],"show":[75,112],"this":[76],"aligns":[77],"with":[78,135],"classical":[79],"influence":[80],"scores":[81],"single-example":[83],"attribution":[84,107],"while":[85,131],"promoting":[86],"diversity":[87],"subsets.":[89],"For":[90],"even":[91],"larger-scale":[92],"retrieval,":[93],"relax":[95],"an":[97],"information-gain":[98],"objective":[99],"add":[101],"variance":[103],"correction":[104],"scalable":[106],"in":[108],"vector":[109],"databases.":[110],"Experiments":[111],"competitive":[113],"performance":[114],"on":[115],"counterfactual":[116],"sensitivity,":[117],"ground-truth":[118],"retrieval":[119],"coreset":[121],"selection,":[122],"showing":[123],"that":[124],"our":[125],"method":[126],"scales":[127],"architectures":[130],"bridging":[132],"principled":[133],"measures":[134],"practice.":[136]},"counts_by_year":[],"updated_date":"2026-07-01T06:00:48.157686","created_date":"2026-04-08T00:00:00"}
