{"id":"https://openalex.org/W7133308325","doi":"https://doi.org/10.48550/arxiv.2603.01357","title":"ASTRA-bench: Evaluating Tool-Use Agent Reasoning and Action Planning with Personal User Context","display_name":"ASTRA-bench: Evaluating Tool-Use Agent Reasoning and Action Planning with Personal User Context","publication_year":2026,"publication_date":"2026-03-02","ids":{"openalex":"https://openalex.org/W7133308325","doi":"https://doi.org/10.48550/arxiv.2603.01357"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.01357","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.01357","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.01357","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5029371083","display_name":"Zidi Xiu","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Xiu, Zidi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5066826683","display_name":"David Sun","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sun, David Q.","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5127939808","display_name":"Kevin Cheng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cheng, Kevin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5051686585","display_name":"Maitrik Patel","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Patel, Maitrik","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5127883691","display_name":"Josh Date","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Date, Josh","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5127923333","display_name":"Yizhe Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Yizhe","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5127950051","display_name":"Jiarui Lu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lu, Jiarui","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5067522233","display_name":"Omar Attia","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Attia, Omar","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5071825172","display_name":"Raviteja Vemulapalli","orcid":"https://orcid.org/0000-0003-0425-7797"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Vemulapalli, Raviteja","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5123592187","display_name":"Oncel Tuzel","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tuzel, Oncel","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5127950026","display_name":"Meng Cao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cao, Meng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5017529415","display_name":"Samy Bengio","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bengio, Samy","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":12,"corresponding_author_ids":["https://openalex.org/A5029371083"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12026","display_name":"Explainable Artificial Intelligence (XAI)","score":0.27239999175071716,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12026","display_name":"Explainable Artificial Intelligence (XAI)","score":0.27239999175071716,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10883","display_name":"Ethics and Social Impacts of AI","score":0.10989999771118164,"subfield":{"id":"https://openalex.org/subfields/3311","display_name":"Safety Research"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.08720000088214874,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/toolbox","display_name":"Toolbox","score":0.6510999798774719},{"id":"https://openalex.org/keywords/automated-planning-and-scheduling","display_name":"Automated planning and scheduling","score":0.589900016784668},{"id":"https://openalex.org/keywords/context","display_name":"Context (archaeology)","score":0.5745999813079834},{"id":"https://openalex.org/keywords/action","display_name":"Action (physics)","score":0.5217000246047974},{"id":"https://openalex.org/keywords/scripting-language","display_name":"Scripting language","score":0.41179999709129333},{"id":"https://openalex.org/keywords/pipeline","display_name":"Pipeline (software)","score":0.3889999985694885},{"id":"https://openalex.org/keywords/testbed","display_name":"Testbed","score":0.3774000108242035},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.37310001254081726}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7032999992370605},{"id":"https://openalex.org/C2777655017","wikidata":"https://www.wikidata.org/wiki/Q1501161","display_name":"Toolbox","level":2,"score":0.6510999798774719},{"id":"https://openalex.org/C114073186","wikidata":"https://www.wikidata.org/wiki/Q2631895","display_name":"Automated planning and scheduling","level":2,"score":0.589900016784668},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.5745999813079834},{"id":"https://openalex.org/C2780791683","wikidata":"https://www.wikidata.org/wiki/Q846785","display_name":"Action (physics)","level":2,"score":0.5217000246047974},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.46299999952316284},{"id":"https://openalex.org/C61423126","wikidata":"https://www.wikidata.org/wiki/Q187432","display_name":"Scripting language","level":2,"score":0.41179999709129333},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.3889999985694885},{"id":"https://openalex.org/C31395832","wikidata":"https://www.wikidata.org/wiki/Q1318674","display_name":"Testbed","level":2,"score":0.3774000108242035},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.37310001254081726},{"id":"https://openalex.org/C83725634","wikidata":"https://www.wikidata.org/wiki/Q7268699","display_name":"Qualitative reasoning","level":2,"score":0.3492000102996826},{"id":"https://openalex.org/C37335422","wikidata":"https://www.wikidata.org/wiki/Q6888134","display_name":"Model-based reasoning","level":3,"score":0.3398999869823456},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.33009999990463257},{"id":"https://openalex.org/C193221554","wikidata":"https://www.wikidata.org/wiki/Q5153664","display_name":"Commonsense reasoning","level":2,"score":0.3140000104904175},{"id":"https://openalex.org/C183322885","wikidata":"https://www.wikidata.org/wiki/Q17007702","display_name":"Context model","level":3,"score":0.31299999356269836},{"id":"https://openalex.org/C98184364","wikidata":"https://www.wikidata.org/wiki/Q1780131","display_name":"Argument (complex analysis)","level":2,"score":0.3059999942779541},{"id":"https://openalex.org/C58328972","wikidata":"https://www.wikidata.org/wiki/Q184609","display_name":"Expert system","level":2,"score":0.29190000891685486},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.289900004863739},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.2888999879360199},{"id":"https://openalex.org/C89288958","wikidata":"https://www.wikidata.org/wiki/Q7301504","display_name":"Reasoning system","level":2,"score":0.2777000069618225},{"id":"https://openalex.org/C56739046","wikidata":"https://www.wikidata.org/wiki/Q192060","display_name":"Knowledge management","level":1,"score":0.2694000005722046},{"id":"https://openalex.org/C89505385","wikidata":"https://www.wikidata.org/wiki/Q47146","display_name":"User interface","level":2,"score":0.2678999900817871},{"id":"https://openalex.org/C20162079","wikidata":"https://www.wikidata.org/wiki/Q1151406","display_name":"Case-based reasoning","level":2,"score":0.2540000081062317}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.01357","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.01357","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.01357","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.01357","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Next-generation":[0],"AI":[1,130],"must":[2],"manage":[3],"vast":[4],"personal":[5,35,103],"data,":[6],"diverse":[7],"tools,":[8],"and":[9,17,41,59,64,105,118],"multi-step":[10,108],"reasoning,":[11],"yet":[12],"most":[13],"benchmarks":[14],"remain":[15],"context-free":[16],"single-turn.":[18],"We":[19,110],"present":[20],"ASTRA-bench":[21,112],"(Assistant":[22],"Skills":[23],"in":[24,55,94],"Tool-use,":[25],"Reasoning":[26],"\\&amp;":[27],"Action-planning),":[28],"a":[29,114,123],"benchmark":[30],"that":[31],"uniquely":[32],"unifies":[33],"time-evolving":[34],"context":[36,104],"with":[37,81,113],"an":[38],"interactive":[39],"toolbox":[40],"complex":[42],"user":[43],"intents.":[44],"Our":[45],"event-driven":[46],"pipeline":[47],"generates":[48],"2,413":[49],"scenarios":[50],"across":[51],"four":[52],"protagonists,":[53],"grounded":[54],"longitudinal":[56],"life":[57],"events":[58],"annotated":[60],"by":[61],"referential,":[62],"functional,":[63],"informational":[65],"complexity.":[66],"Evaluation":[67],"of":[68],"state-of-the-art":[69],"models":[70],"(e.g.,":[71],"Claude-4.5-Opus,":[72],"DeepSeek-V3.2)":[73],"reveals":[74],"significant":[75],"performance":[76],"degradation":[77],"under":[78],"high-complexity":[79],"conditions,":[80],"argument":[82],"generation":[83],"emerging":[84],"as":[85],"the":[86],"primary":[87],"bottleneck.":[88],"These":[89],"findings":[90],"expose":[91],"critical":[92],"limitations":[93],"current":[95],"agents'":[96],"ability":[97],"to":[98,121],"ground":[99],"reasoning":[100],"within":[101],"messy":[102],"orchestrate":[106],"reliable":[107],"plans.":[109],"release":[111],"full":[115],"execution":[116],"environment":[117],"evaluation":[119],"scripts":[120],"provide":[122],"diagnostic":[124],"testbed":[125],"for":[126],"developing":[127],"truly":[128],"context-aware":[129],"assistants.":[131]},"counts_by_year":[],"updated_date":"2026-05-05T08:41:31.759640","created_date":"2026-03-04T00:00:00"}
