{"id":"https://openalex.org/W7156294103","doi":"https://doi.org/10.48550/arxiv.2604.22708","title":"Seeing the Whole Elephant: A Benchmark for Failure Attribution in LLM-based Multi-Agent Systems","display_name":"Seeing the Whole Elephant: A Benchmark for Failure Attribution in LLM-based Multi-Agent Systems","publication_year":2026,"publication_date":"2026-04-24","ids":{"openalex":"https://openalex.org/W7156294103","doi":"https://doi.org/10.48550/arxiv.2604.22708"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.22708","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.22708","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.22708","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5006029573","display_name":"M. Chen","orcid":"https://orcid.org/0009-0006-4397-750X"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Chen, Mengzhuo","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134669273","display_name":"Junjie Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Junjie","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134739874","display_name":"Fangwen Mu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Mu, Fangwen","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134744333","display_name":"Yawen Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Yawen","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134674302","display_name":"Zhe Liu","orcid":"https://orcid.org/0009-0004-3461-5166"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Zhe","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134684027","display_name":"Huanxiang Feng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Feng, Huanxiang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5100434905","display_name":"Qing Wang","orcid":"https://orcid.org/0000-0003-4318-7867"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Qing","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5006029573"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10456","display_name":"Multi-Agent Systems and Negotiation","score":0.15729999542236328,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10456","display_name":"Multi-Agent Systems and Negotiation","score":0.15729999542236328,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12127","display_name":"Software System Performance and Reliability","score":0.11909999698400497,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.07479999959468842,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/attribution","display_name":"Attribution","score":0.8805999755859375},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.776199996471405},{"id":"https://openalex.org/keywords/debugging","display_name":"Debugging","score":0.7610999941825867},{"id":"https://openalex.org/keywords/context","display_name":"Context (archaeology)","score":0.6583999991416931},{"id":"https://openalex.org/keywords/nondeterministic-algorithm","display_name":"Nondeterministic algorithm","score":0.5856000185012817},{"id":"https://openalex.org/keywords/exploit","display_name":"Exploit","score":0.4262999892234802}],"concepts":[{"id":"https://openalex.org/C143299363","wikidata":"https://www.wikidata.org/wiki/Q900584","display_name":"Attribution","level":2,"score":0.8805999755859375},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.776199996471405},{"id":"https://openalex.org/C168065819","wikidata":"https://www.wikidata.org/wiki/Q845566","display_name":"Debugging","level":2,"score":0.7610999941825867},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6783999800682068},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.6583999991416931},{"id":"https://openalex.org/C176181172","wikidata":"https://www.wikidata.org/wiki/Q3490301","display_name":"Nondeterministic algorithm","level":2,"score":0.5856000185012817},{"id":"https://openalex.org/C165696696","wikidata":"https://www.wikidata.org/wiki/Q11287","display_name":"Exploit","level":2,"score":0.4262999892234802},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.366100013256073},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.36010000109672546},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.3393999934196472},{"id":"https://openalex.org/C4554734","wikidata":"https://www.wikidata.org/wiki/Q593744","display_name":"Knowledge base","level":2,"score":0.32330000400543213},{"id":"https://openalex.org/C2780966255","wikidata":"https://www.wikidata.org/wiki/Q5474306","display_name":"Foundation (evidence)","level":2,"score":0.30880001187324524},{"id":"https://openalex.org/C108170787","wikidata":"https://www.wikidata.org/wiki/Q3951828","display_name":"Agency (philosophy)","level":2,"score":0.27300000190734863},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.2653999924659729},{"id":"https://openalex.org/C62611344","wikidata":"https://www.wikidata.org/wiki/Q1062658","display_name":"Node (physics)","level":2,"score":0.25540000200271606}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.22708","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.22708","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.22708","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.22708","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/16","score":0.45340728759765625,"display_name":"Peace, Justice and strong institutions"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Failure":[0],"attribution,":[1],"i.e.,":[2],"identifying":[3],"the":[4,58,165],"responsible":[5],"agent":[6,55],"and":[7,28,40,60,112,163],"decisive":[8],"step":[9],"of":[10,167],"a":[11,102,136,149],"failure,":[12],"is":[13,35],"particularly":[14],"challenging":[15],"in":[16],"LLM-based":[17],"multi-agent":[18],"systems":[19],"(MAS)":[20],"due":[21],"to":[22,38,133],"their":[23],"natural-language":[24],"reasoning,":[25],"nondeterministic":[26],"outputs,":[27,56,91],"intricate":[29],"interaction":[30],"dynamics.":[31],"A":[32],"reliable":[33],"benchmark":[34,103],"therefore":[36],"essential":[37],"guide":[39],"evaluate":[41,118],"attribution":[42,72,107,120,129,154],"techniques.":[43],"Yet":[44],"existing":[45],"benchmarks":[46],"rely":[47],"on":[48],"partially":[49],"observable":[50],"traces":[51,111,127],"that":[52,62,70,140,159],"capture":[53],"only":[54,90],"omitting":[57],"inputs":[59,142],"context":[61],"developers":[63],"actually":[64],"use":[65],"when":[66],"debugging.":[67],"We":[68,115],"argue":[69],"failure":[71,106,119,145,153],"should":[73],"be":[74],"studied":[75],"under":[76],"full":[77,109,126],"execution":[78,110],"observability,":[79],"aligning":[80],"with":[81,108],"real-world":[82,161],"developer-facing":[83],"scenarios":[84],"where":[85],"complete":[86],"traces,":[87],"rather":[88],"than":[89],"are":[92],"accessible":[93],"for":[94,105,151],"diagnosis.":[95],"To":[96],"this":[97],"end,":[98],"we":[99],"introduce":[100],"TraceElephant,":[101],"designed":[104],"reproducible":[113],"environments.":[114],"then":[116],"systematically":[117],"techniques":[121],"across":[122],"various":[123],"configurations.":[124],"Specifically,":[125],"improve":[128],"accuracy":[130],"by":[131],"up":[132],"76\\%":[134],"over":[135],"partial-observation":[137],"counterpart,":[138],"confirming":[139],"missing":[141],"obscure":[143],"many":[144],"causes.":[146],"TraceElephant":[147],"provides":[148],"foundation":[150],"follow-up":[152],"research,":[155],"promoting":[156],"evaluation":[157],"practices":[158],"reflect":[160],"debugging":[162],"supporting":[164],"development":[166],"more":[168],"transparent":[169],"MASs.":[170]},"counts_by_year":[],"updated_date":"2026-04-28T06:12:00.211691","created_date":"2026-04-28T00:00:00"}
