{"id":"https://openalex.org/W7153217318","doi":"https://doi.org/10.48550/arxiv.2604.07615","title":"ADAG: Automatically Describing Attribution Graphs","display_name":"ADAG: Automatically Describing Attribution Graphs","publication_year":2026,"publication_date":"2026-04-08","ids":{"openalex":"https://openalex.org/W7153217318","doi":"https://doi.org/10.48550/arxiv.2604.07615"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.07615","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.07615","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.07615","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5082261951","display_name":"Aryaman Arora","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Arora, Aryaman","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102822963","display_name":"Zhengxuan Wu","orcid":"https://orcid.org/0000-0001-5581-8908"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wu, Zhengxuan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133377659","display_name":"Jacob Steinhardt","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Steinhardt, Jacob","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5074026640","display_name":"Sarah Schwettmann","orcid":"https://orcid.org/0000-0001-6385-1396"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Schwettmann, Sarah","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":4,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12026","display_name":"Explainable Artificial Intelligence (XAI)","score":0.8802000284194946,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12026","display_name":"Explainable Artificial Intelligence (XAI)","score":0.8802000284194946,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.01360000018030405,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.011599999852478504,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/interpretability","display_name":"Interpretability","score":0.8913000226020813},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.7006999850273132},{"id":"https://openalex.org/keywords/tracing","display_name":"Tracing","score":0.641700029373169},{"id":"https://openalex.org/keywords/component","display_name":"Component (thermodynamics)","score":0.5479000210762024},{"id":"https://openalex.org/keywords/cluster-analysis","display_name":"Cluster analysis","score":0.5336999893188477},{"id":"https://openalex.org/keywords/interpretation","display_name":"Interpretation (philosophy)","score":0.5054000020027161},{"id":"https://openalex.org/keywords/computation","display_name":"Computation","score":0.4487000107765198},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.4325000047683716}],"concepts":[{"id":"https://openalex.org/C2781067378","wikidata":"https://www.wikidata.org/wiki/Q17027399","display_name":"Interpretability","level":2,"score":0.8913000226020813},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7426999807357788},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.7006999850273132},{"id":"https://openalex.org/C138673069","wikidata":"https://www.wikidata.org/wiki/Q322229","display_name":"Tracing","level":2,"score":0.641700029373169},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6053000092506409},{"id":"https://openalex.org/C168167062","wikidata":"https://www.wikidata.org/wiki/Q1117970","display_name":"Component (thermodynamics)","level":2,"score":0.5479000210762024},{"id":"https://openalex.org/C73555534","wikidata":"https://www.wikidata.org/wiki/Q622825","display_name":"Cluster analysis","level":2,"score":0.5336999893188477},{"id":"https://openalex.org/C527412718","wikidata":"https://www.wikidata.org/wiki/Q855395","display_name":"Interpretation (philosophy)","level":2,"score":0.5054000020027161},{"id":"https://openalex.org/C45374587","wikidata":"https://www.wikidata.org/wiki/Q12525525","display_name":"Computation","level":2,"score":0.4487000107765198},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.4325000047683716},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.4244999885559082},{"id":"https://openalex.org/C101814296","wikidata":"https://www.wikidata.org/wiki/Q5439685","display_name":"Feature model","level":3,"score":0.4237000048160553},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3912999927997589},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.38119998574256897},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.3765000104904175},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.3287999927997589},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.32420000433921814},{"id":"https://openalex.org/C189430467","wikidata":"https://www.wikidata.org/wiki/Q7293293","display_name":"Ranking (information retrieval)","level":2,"score":0.30000001192092896},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.27810001373291016},{"id":"https://openalex.org/C62611344","wikidata":"https://www.wikidata.org/wiki/Q1062658","display_name":"Node (physics)","level":2,"score":0.2736999988555908},{"id":"https://openalex.org/C143299363","wikidata":"https://www.wikidata.org/wiki/Q900584","display_name":"Attribution","level":2,"score":0.27140000462532043},{"id":"https://openalex.org/C124304363","wikidata":"https://www.wikidata.org/wiki/Q673661","display_name":"Abstraction","level":2,"score":0.2702000141143799},{"id":"https://openalex.org/C64357122","wikidata":"https://www.wikidata.org/wiki/Q1149766","display_name":"Causality (physics)","level":2,"score":0.2671999931335449},{"id":"https://openalex.org/C146380142","wikidata":"https://www.wikidata.org/wiki/Q1137726","display_name":"Directed graph","level":2,"score":0.25519999861717224},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.25369998812675476}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.07615","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.07615","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.07615","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.07615","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"score":0.7300552725791931,"id":"https://metadata.un.org/sdg/16","display_name":"Peace, Justice and strong institutions"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"In":[0],"language":[1],"model":[2],"interpretability":[3],"research,":[4],"\\textbf{circuit":[5],"tracing}":[6],"aims":[7],"to":[8,15],"identify":[9],"which":[10,83,94,124,159],"internal":[11],"features":[12],"causally":[13],"contributed":[14],"a":[16,100,112,163],"particular":[17],"output":[18,106],"and":[19,105,119,126,147,151],"how":[20],"they":[21],"affected":[22],"each":[23,51],"other,":[24],"with":[25],"the":[26,30,48,54,65,68,96,131],"goal":[27],"of":[28,47,60,99,130,134],"explaining":[29],"computations":[31],"underlying":[32],"some":[33],"behaviour.":[34],"However,":[35],"all":[36],"prior":[37],"circuit":[38,55],"tracing":[39],"work":[40],"has":[41],"relied":[42],"on":[43,142],"ad-hoc":[44],"human":[45],"interpretation":[46],"role":[49,98,133],"that":[50],"feature":[52,101,136],"in":[53,167],"plays,":[56],"via":[57,102],"manual":[58],"inspection":[59],"data":[61],"artifacts":[62],"such":[63],"as":[64],"dataset":[66],"examples":[67],"component":[69],"activates":[70],"on.":[71],"We":[72,109,138],"introduce":[73,91,111],"\\textbf{ADAG},":[74],"an":[75,120],"end-to-end":[76],"pipeline":[77],"for":[78,116,162],"describing":[79],"these":[80,135],"attribution":[81],"graphs":[82],"is":[84],"fully":[85],"automated.":[86],"To":[87],"achieve":[88],"this,":[89],"we":[90],"\\textit{attribution":[92],"profiles}":[93],"quantify":[95],"functional":[97,132],"its":[103],"input":[104],"gradient":[107],"effects.":[108],"then":[110],"novel":[113],"clustering":[114],"algorithm":[115],"grouping":[117],"features,":[118],"LLM":[121],"explainer--simulator":[122],"setup":[123],"generates":[125],"scores":[127],"natural-language":[128],"explanations":[129],"groups.":[137],"run":[139],"our":[140],"system":[141],"known":[143],"human-analysed":[144],"circuit-tracing":[145],"tasks":[146],"recover":[148],"interpretable":[149],"circuits,":[150],"further":[152],"show":[153],"ADAG":[154],"can":[155],"find":[156],"steerable":[157],"clusters":[158],"are":[160],"responsible":[161],"harmful":[164],"advice":[165],"jailbreak":[166],"Llama":[168],"3.1":[169],"8B":[170],"Instruct.":[171]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-04-11T00:00:00"}
