{"id":"https://openalex.org/W7133546027","doi":"https://doi.org/10.48550/arxiv.2603.02586","title":"LiveAgentBench: Comprehensive Benchmarking of Agentic Systems Across 104 Real-World Challenges","display_name":"LiveAgentBench: Comprehensive Benchmarking of Agentic Systems Across 104 Real-World Challenges","publication_year":2026,"publication_date":"2026-03-03","ids":{"openalex":"https://openalex.org/W7133546027","doi":"https://doi.org/10.48550/arxiv.2603.02586"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.02586","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.02586","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.02586","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5128111860","display_name":"Hao Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Hao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128115892","display_name":"Huan Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Huan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5059515406","display_name":"Jinjie Gu","orcid":"https://orcid.org/0000-0002-0577-9039"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gu, Jinjie","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128092621","display_name":"Wenjie Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Wenjie","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128042523","display_name":"Chenyi Zhuang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhuang, Chenyi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5128086314","display_name":"Sikang Bian","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bian, Sikang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":6,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.5139999985694885,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.5139999985694885,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11704","display_name":"Mobile Crowdsensing and Crowdsourcing","score":0.2526000142097473,"subfield":{"id":"https://openalex.org/subfields/1706","display_name":"Computer Science Applications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.04230000078678131,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/benchmarking","display_name":"Benchmarking","score":0.8422999978065491},{"id":"https://openalex.org/keywords/process","display_name":"Process (computing)","score":0.6520000100135803},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.64410001039505},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.6413000226020813},{"id":"https://openalex.org/keywords/social-media","display_name":"Social media","score":0.48829999566078186},{"id":"https://openalex.org/keywords/face","display_name":"Face (sociological concept)","score":0.4417000114917755}],"concepts":[{"id":"https://openalex.org/C86251818","wikidata":"https://www.wikidata.org/wiki/Q816754","display_name":"Benchmarking","level":2,"score":0.8422999978065491},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6881999969482422},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.6520000100135803},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.64410001039505},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.6413000226020813},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.5509999990463257},{"id":"https://openalex.org/C518677369","wikidata":"https://www.wikidata.org/wiki/Q202833","display_name":"Social media","level":2,"score":0.48829999566078186},{"id":"https://openalex.org/C2779304628","wikidata":"https://www.wikidata.org/wiki/Q3503480","display_name":"Face (sociological concept)","level":2,"score":0.4417000114917755},{"id":"https://openalex.org/C56739046","wikidata":"https://www.wikidata.org/wiki/Q192060","display_name":"Knowledge management","level":1,"score":0.42419999837875366},{"id":"https://openalex.org/C2776505523","wikidata":"https://www.wikidata.org/wiki/Q4785468","display_name":"Plan (archaeology)","level":2,"score":0.4034999907016754},{"id":"https://openalex.org/C195094911","wikidata":"https://www.wikidata.org/wiki/Q14167904","display_name":"Process management","level":1,"score":0.38199999928474426},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.31690001487731934},{"id":"https://openalex.org/C112930515","wikidata":"https://www.wikidata.org/wiki/Q4389547","display_name":"Risk analysis (engineering)","level":1,"score":0.31119999289512634},{"id":"https://openalex.org/C539667460","wikidata":"https://www.wikidata.org/wiki/Q2414942","display_name":"Management science","level":1,"score":0.27630001306533813},{"id":"https://openalex.org/C133462117","wikidata":"https://www.wikidata.org/wiki/Q4929239","display_name":"Data collection","level":2,"score":0.27399998903274536},{"id":"https://openalex.org/C67712803","wikidata":"https://www.wikidata.org/wiki/Q7901853","display_name":"User modeling","level":3,"score":0.25699999928474426},{"id":"https://openalex.org/C12725497","wikidata":"https://www.wikidata.org/wiki/Q810247","display_name":"Baseline (sea)","level":2,"score":0.25220000743865967}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.02586","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.02586","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.02586","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.02586","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"As":[0],"large":[1],"language":[2],"models":[3],"grow":[4],"more":[5],"capable,":[6],"general":[7],"AI":[8],"agents":[9],"have":[10],"become":[11],"increasingly":[12],"prevalent":[13],"in":[14],"practical":[15,101],"applications.":[16],"However,":[17],"existing":[18],"benchmarks":[19],"face":[20],"significant":[21],"limitations,":[22],"failing":[23],"to":[24,62,78],"represent":[25],"real-world":[26,59,82,131],"user":[27,46],"tasks":[28],"accurately.":[29],"To":[30],"address":[31],"this":[32],"gap,":[33],"we":[34,76],"present":[35],"LiveAgentBench,":[36,98],"a":[37,73],"comprehensive":[38],"benchmark":[39],"with":[40,113,127],"104":[41],"scenarios":[42],"that":[43],"reflect":[44],"real":[45],"requirements.":[47],"It":[48],"is":[49,65],"constructed":[50],"from":[51,130],"publicly":[52],"sourced":[53],"questions":[54],"on":[55],"social":[56],"media":[57],"and":[58,86,94,103,117],"products.":[60],"Central":[61],"our":[63],"approach":[64],"the":[66],"Social":[67],"Perception-Driven":[68],"Data":[69],"Generation":[70],"(SPDG)":[71],"method,":[72],"novel":[74],"process":[75,123],"developed":[77],"ensure":[79],"each":[80],"question's":[81],"relevance,":[83],"task":[84],"complexity,":[85],"result":[87],"verifiability.":[88],"We":[89],"evaluate":[90],"various":[91],"models,":[92],"frameworks,":[93],"commercial":[95],"products":[96],"using":[97],"revealing":[99],"their":[100],"performance":[102],"identifying":[104],"areas":[105],"for":[106,115,119],"improvement.":[107],"This":[108],"release":[109],"includes":[110],"374":[111],"tasks,":[112],"125":[114],"validation":[116],"249":[118],"testing.":[120],"The":[121],"SPDG":[122],"enables":[124],"continuous":[125],"updates":[126],"fresh":[128],"queries":[129],"interactions.":[132]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-03-05T00:00:00"}
