{"id":"https://openalex.org/W7161674885","doi":"https://doi.org/10.48550/arxiv.2605.18032","title":"PROTEA: Offline Evaluation and Iterative Refinement for Multi-Agent LLM Workflows","display_name":"PROTEA: Offline Evaluation and Iterative Refinement for Multi-Agent LLM Workflows","publication_year":2026,"publication_date":"2026-05-18","ids":{"openalex":"https://openalex.org/W7161674885","doi":"https://doi.org/10.48550/arxiv.2605.18032"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.18032","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.18032","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.18032","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5087364594","display_name":"Kazuki Kawamura","orcid":"https://orcid.org/0000-0002-5181-320X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kawamura, Kazuki","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136500525","display_name":"Satoshi Waki","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Waki, Satoshi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5099784235","display_name":"Kei Tateno","orcid":"https://orcid.org/0009-0000-8249-2659"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tateno, Kei","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":3,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.19509999454021454,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.19509999454021454,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10215","display_name":"Semantic Web and Ontologies","score":0.13740000128746033,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11986","display_name":"Scientific Computing and Data Management","score":0.1290999948978424,"subfield":{"id":"https://openalex.org/subfields/1802","display_name":"Information Systems and Management"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/workflow","display_name":"Workflow","score":0.8156999945640564},{"id":"https://openalex.org/keywords/debugging","display_name":"Debugging","score":0.796500027179718},{"id":"https://openalex.org/keywords/graph","display_name":"Graph","score":0.6082000136375427},{"id":"https://openalex.org/keywords/node","display_name":"Node (physics)","score":0.5626999735832214},{"id":"https://openalex.org/keywords/formative-assessment","display_name":"Formative assessment","score":0.49900001287460327},{"id":"https://openalex.org/keywords/overlay","display_name":"Overlay","score":0.3644999861717224},{"id":"https://openalex.org/keywords/interface","display_name":"Interface (matter)","score":0.34880000352859497}],"concepts":[{"id":"https://openalex.org/C177212765","wikidata":"https://www.wikidata.org/wiki/Q627335","display_name":"Workflow","level":2,"score":0.8156999945640564},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8118000030517578},{"id":"https://openalex.org/C168065819","wikidata":"https://www.wikidata.org/wiki/Q845566","display_name":"Debugging","level":2,"score":0.796500027179718},{"id":"https://openalex.org/C132525143","wikidata":"https://www.wikidata.org/wiki/Q141488","display_name":"Graph","level":2,"score":0.6082000136375427},{"id":"https://openalex.org/C62611344","wikidata":"https://www.wikidata.org/wiki/Q1062658","display_name":"Node (physics)","level":2,"score":0.5626999735832214},{"id":"https://openalex.org/C42525527","wikidata":"https://www.wikidata.org/wiki/Q1209955","display_name":"Formative assessment","level":2,"score":0.49900001287460327},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.3691999912261963},{"id":"https://openalex.org/C136085584","wikidata":"https://www.wikidata.org/wiki/Q910289","display_name":"Overlay","level":2,"score":0.3644999861717224},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.3546000123023987},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.35199999809265137},{"id":"https://openalex.org/C113843644","wikidata":"https://www.wikidata.org/wiki/Q901882","display_name":"Interface (matter)","level":4,"score":0.34880000352859497},{"id":"https://openalex.org/C102379954","wikidata":"https://www.wikidata.org/wiki/Q2589940","display_name":"Call graph","level":2,"score":0.3179999887943268},{"id":"https://openalex.org/C51929080","wikidata":"https://www.wikidata.org/wiki/Q2425187","display_name":"Codebase","level":3,"score":0.31520000100135803},{"id":"https://openalex.org/C115903868","wikidata":"https://www.wikidata.org/wiki/Q80993","display_name":"Software engineering","level":1,"score":0.29600000381469727},{"id":"https://openalex.org/C48947383","wikidata":"https://www.wikidata.org/wiki/Q830719","display_name":"Visitor pattern","level":2,"score":0.29269999265670776},{"id":"https://openalex.org/C37789001","wikidata":"https://www.wikidata.org/wiki/Q782543","display_name":"Graphical user interface","level":2,"score":0.28700000047683716},{"id":"https://openalex.org/C79403827","wikidata":"https://www.wikidata.org/wiki/Q3988","display_name":"Real-time computing","level":1,"score":0.28290000557899475},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.27219998836517334},{"id":"https://openalex.org/C143587482","wikidata":"https://www.wikidata.org/wiki/Q1543216","display_name":"Iterative and incremental development","level":2,"score":0.259799987077713},{"id":"https://openalex.org/C200749887","wikidata":"https://www.wikidata.org/wiki/Q1165574","display_name":"System monitoring","level":2,"score":0.2565999925136566}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.18032","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.18032","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.18032","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.18032","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Multi-agent":[0],"LLM":[1,9,178],"workflows":[2],"--":[3,11],"systems":[4,91],"composed":[5],"of":[6,60],"multiple":[7],"role-specific":[8],"calls":[10],"often":[12],"outperform":[13],"single-prompt":[14],"baselines,":[15],"but":[16],"they":[17],"remain":[18],"difficult":[19],"to":[20,35,40,48,84,141,162,169],"debug":[21],"and":[22,44,74,78,112,137,145,164,186],"refine.":[23],"Failures":[24],"can":[25],"originate":[26],"from":[27,109,160,167],"subtle":[28],"errors":[29],"in":[30],"intermediate":[31,68],"outputs":[32,70],"that":[33],"propagate":[34],"downstream":[36],"nodes,":[37,124],"requiring":[38],"developers":[39],"inspect":[41],"long":[42],"traces":[43],"infer":[45],"which":[46],"agent":[47],"modify.":[49],"We":[50],"present":[51],"PROTEA,":[52],"a":[53,65,172],"unified":[54],"interface":[55],"for":[56],"offline,":[57],"test-driven":[58],"improvement":[59],"multi-agent":[61],"workflows.":[62],"PROTEA":[63,99,125,156],"executes":[64],"workflow,":[66],"scores":[67],"node":[69,102,120],"with":[71,118,175],"configurable":[72],"rubrics,":[73],"overlays":[75],"per-node":[76,184],"states":[77],"rationales":[79],"on":[80],"the":[81,96,139,149],"workflow":[82,140],"graph":[83,113],"localize":[85],"likely":[86],"bottlenecks.":[87],"To":[88],"support":[89],"complex":[90],"where":[92],"final-answer":[93,110],"references":[94,111],"are":[95],"primary":[97],"supervision,":[98],"performs":[100],"backward":[101],"evaluation:":[103],"it":[104],"generates":[105],"candidate":[106],"node-level":[107],"expectations":[108],"context,":[114],"then":[115,134],"compares":[116],"them":[117],"observed":[119],"outputs.":[121],"For":[122],"selected":[123],"presents":[126],"targeted":[127],"prompt":[128,189],"revisions":[129],"as":[130],"editable":[131,187],"before/after":[132,188],"comparisons,":[133],"automatically":[135],"reruns":[136],"re-evaluates":[138],"show":[142],"output":[143],"changes":[144],"score":[146],"trajectories":[147],"within":[148],"same":[150],"interface.":[151],"In":[152,171],"two":[153],"production-adjacent":[154],"workflows,":[155],"improved":[157],"document-inspection":[158],"accuracy":[159],"64.3%":[161],"83.9%":[163],"recommendation":[165],"Hit@5":[166],"0.30":[168],"0.38.":[170],"formative":[173],"study":[174],"six":[176],"experienced":[177],"developers,":[179],"participants":[180],"valued":[181],"graph-level":[182],"localization,":[183],"rationales,":[185],"revisions.":[190]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-05-20T00:00:00"}
