{"id":"https://openalex.org/W7161698945","doi":"https://doi.org/10.48550/arxiv.2605.18548","title":"STT-Arena: A More Realistic Environment for Tool-Using with Spatio-Temporal Dynamics","display_name":"STT-Arena: A More Realistic Environment for Tool-Using with Spatio-Temporal Dynamics","publication_year":2026,"publication_date":"2026-05-18","ids":{"openalex":"https://openalex.org/W7161698945","doi":"https://doi.org/10.48550/arxiv.2605.18548"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.18548","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.18548","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.18548","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5065626573","display_name":"Tingfeng Hui","orcid":"https://orcid.org/0009-0006-3524-4904"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hui, Tingfeng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136499923","display_name":"Hao Xu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xu, Hao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136499955","display_name":"Pengyu Zhu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhu, Pengyu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5039197077","display_name":"Hongsheng Xin","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xin, Hongsheng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136494825","display_name":"Kun Zhan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhan, Kun","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136460838","display_name":"Sen Su","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Su, Sen","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136460357","display_name":"Chunxiao Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Chunxiao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5136468357","display_name":"Ning Miao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Miao, Ning","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":8,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.1972000002861023,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.1972000002861023,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.1379999965429306,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.08139999955892563,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/forcing","display_name":"Forcing (mathematics)","score":0.7641000151634216},{"id":"https://openalex.org/keywords/executable","display_name":"Executable","score":0.7529000043869019},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.7452999949455261},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.7074999809265137},{"id":"https://openalex.org/keywords/construct","display_name":"Construct (python library)","score":0.6873999834060669},{"id":"https://openalex.org/keywords/trajectory","display_name":"Trajectory","score":0.5766000151634216},{"id":"https://openalex.org/keywords/dynamics","display_name":"Dynamics (music)","score":0.5041000247001648}],"concepts":[{"id":"https://openalex.org/C197115733","wikidata":"https://www.wikidata.org/wiki/Q1003136","display_name":"Forcing (mathematics)","level":2,"score":0.7641000151634216},{"id":"https://openalex.org/C160145156","wikidata":"https://www.wikidata.org/wiki/Q778586","display_name":"Executable","level":2,"score":0.7529000043869019},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.7452999949455261},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7171000242233276},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.7074999809265137},{"id":"https://openalex.org/C2780801425","wikidata":"https://www.wikidata.org/wiki/Q5164392","display_name":"Construct (python library)","level":2,"score":0.6873999834060669},{"id":"https://openalex.org/C13662910","wikidata":"https://www.wikidata.org/wiki/Q193139","display_name":"Trajectory","level":2,"score":0.5766000151634216},{"id":"https://openalex.org/C145912823","wikidata":"https://www.wikidata.org/wiki/Q113558","display_name":"Dynamics (music)","level":2,"score":0.5041000247001648},{"id":"https://openalex.org/C2778571376","wikidata":"https://www.wikidata.org/wiki/Q1355821","display_name":"Frontier","level":2,"score":0.4797999858856201},{"id":"https://openalex.org/C2780009758","wikidata":"https://www.wikidata.org/wiki/Q6804172","display_name":"Measure (data warehouse)","level":2,"score":0.4196999967098236},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3427000045776367},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3409999907016754},{"id":"https://openalex.org/C48103436","wikidata":"https://www.wikidata.org/wiki/Q599031","display_name":"State (computer science)","level":2,"score":0.323199987411499},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.289900004863739},{"id":"https://openalex.org/C48677424","wikidata":"https://www.wikidata.org/wiki/Q6888088","display_name":"Mode (computer interface)","level":2,"score":0.2736000120639801},{"id":"https://openalex.org/C2775936607","wikidata":"https://www.wikidata.org/wiki/Q466845","display_name":"Tracking (education)","level":2,"score":0.2556000053882599}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.18548","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.18548","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.18548","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.18548","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"score":0.503390908241272,"display_name":"Peace, Justice and strong institutions","id":"https://metadata.un.org/sdg/16"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Large":[0],"language":[1],"models":[2],"(LLMs)":[3],"deployed":[4],"in":[5,34,76],"real-world":[6],"agentic":[7],"applications":[8],"must":[9],"be":[10],"capable":[11],"of":[12,42,58,109,131,137,145,151],"replanning":[13,44],"and":[14,68,101,154,177],"adapting":[15],"when":[16],"mid-task":[17],"disruptions":[18],"invalidate":[19,89],"their":[20],"prior":[21],"decisions.":[22],"Existing":[23],"dynamic":[24,133],"benchmarks":[25],"primarily":[26],"measure":[27],"whether":[28],"LLMs":[29,111,189],"can":[30,87],"detect":[31,97],"temporal":[32],"changes":[33],"a":[35,56,77,103],"timely":[36],"manner,":[37],"leaving":[38],"the":[39,94,98,115,128],"complementary":[40],"challenge":[41],"adaptive":[43],"under":[45],"spatio-temporal":[46,65,84,132],"dynamics":[47],"largely":[48],"unexplored.":[49],"We":[50],"introduce":[51],"STT-Arena":[52],"(Spatio-Temporal":[53],"Tool-Use":[54],"Arena),":[55],"benchmark":[57],"227":[59],"high-quality":[60],"interactive":[61],"tasks":[62],"spanning":[63],"nine":[64],"conflict":[66],"types":[67],"four":[69],"solvability":[70],"levels.":[71],"Each":[72],"task":[73],"is":[74],"grounded":[75],"realistic,":[78],"executable":[79],"environment":[80],"equipped":[81],"with":[82,180],"injected":[83],"triggers":[85],"that":[86,113,169],"abruptly":[88],"an":[90,164],"ongoing":[91],"plan,":[92],"forcing":[93],"model":[95],"to":[96,183],"state":[99],"shift":[100],"construct":[102],"revised":[104],"execution":[105],"strategy.":[106],"Extensive":[107],"evaluation":[108],"frontier":[110,188],"reveals":[112],"even":[114],"SOTA":[116],"proprietary":[117],"models,":[118],"including":[119],"Claude-4.6-Opus,":[120],"achieves":[121],"less":[122],"than":[123],"40\\%":[124],"overall":[125],"accuracies,":[126],"highlighting":[127],"fundamental":[129],"difficulty":[130],"reasoning.":[134],"Systematic":[135],"analysis":[136],"failure":[138,172],"trajectories":[139],"uncovers":[140],"three":[141],"recurring":[142],"error":[143],"modes":[144],"existing":[146],"models:":[147],"Stale-State":[148],"Execution,":[149],"Misdiagnosis":[150],"Dynamic":[152],"Triggers,":[153],"Missing":[155],"Post-Adaptation":[156],"Verification.":[157],"Guided":[158],"by":[159],"these":[160,171],"findings,":[161],"we":[162],"propose":[163],"iterative":[165],"trajectory":[166],"refinement":[167],"technique":[168],"eliminates":[170],"patterns":[173],"from":[174],"training":[175],"data,":[176],"combine":[178],"it":[179],"online":[181],"RL":[182],"produce":[184],"STT-Agent-4B":[185],"which":[186],"outperforms":[187],"on":[190],"STT-Arena.":[191]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-05-20T00:00:00"}
