{"id":"https://openalex.org/W7151497062","doi":"https://doi.org/10.48550/arxiv.2604.04323","title":"How Well Do Agentic Skills Work in the Wild: Benchmarking LLM Skill Usage in Realistic Settings","display_name":"How Well Do Agentic Skills Work in the Wild: Benchmarking LLM Skill Usage in Realistic Settings","publication_year":2026,"publication_date":"2026-04-06","ids":{"openalex":"https://openalex.org/W7151497062","doi":"https://doi.org/10.48550/arxiv.2604.04323"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.04323","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.04323","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.04323","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5101534884","display_name":"Yujian Liu","orcid":"https://orcid.org/0000-0002-9512-1272"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Yujian","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133112542","display_name":"Jiabao Ji","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ji, Jiabao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5114107485","display_name":"Li An","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"An, Li","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133115186","display_name":"Tommi Jaakkola","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jaakkola, Tommi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133080959","display_name":"Yang Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Yang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5133110142","display_name":"Shiyu Chang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chang, Shiyu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":6,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10215","display_name":"Semantic Web and Ontologies","score":0.3368000090122223,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10215","display_name":"Semantic Web and Ontologies","score":0.3368000090122223,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.11230000108480453,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.11219999939203262,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/benchmarking","display_name":"Benchmarking","score":0.8797000050544739},{"id":"https://openalex.org/keywords/generality","display_name":"Generality","score":0.7261000275611877},{"id":"https://openalex.org/keywords/relevance","display_name":"Relevance (law)","score":0.5777999758720398},{"id":"https://openalex.org/keywords/matching","display_name":"Matching (statistics)","score":0.5486999750137329},{"id":"https://openalex.org/keywords/focus","display_name":"Focus (optics)","score":0.46540001034736633},{"id":"https://openalex.org/keywords/work","display_name":"Work (physics)","score":0.4593000113964081},{"id":"https://openalex.org/keywords/skills-management","display_name":"Skills management","score":0.36730000376701355}],"concepts":[{"id":"https://openalex.org/C86251818","wikidata":"https://www.wikidata.org/wiki/Q816754","display_name":"Benchmarking","level":2,"score":0.8797000050544739},{"id":"https://openalex.org/C2780767217","wikidata":"https://www.wikidata.org/wiki/Q5532421","display_name":"Generality","level":2,"score":0.7261000275611877},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5823000073432922},{"id":"https://openalex.org/C158154518","wikidata":"https://www.wikidata.org/wiki/Q7310970","display_name":"Relevance (law)","level":2,"score":0.5777999758720398},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.5486999750137329},{"id":"https://openalex.org/C192209626","wikidata":"https://www.wikidata.org/wiki/Q190909","display_name":"Focus (optics)","level":2,"score":0.46540001034736633},{"id":"https://openalex.org/C18762648","wikidata":"https://www.wikidata.org/wiki/Q42213","display_name":"Work (physics)","level":2,"score":0.4593000113964081},{"id":"https://openalex.org/C97082442","wikidata":"https://www.wikidata.org/wiki/Q1934361","display_name":"Skills management","level":2,"score":0.36730000376701355},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.3522999882698059},{"id":"https://openalex.org/C56739046","wikidata":"https://www.wikidata.org/wiki/Q192060","display_name":"Knowledge management","level":1,"score":0.3156000077724457},{"id":"https://openalex.org/C501001295","wikidata":"https://www.wikidata.org/wiki/Q2480597","display_name":"Life skills","level":2,"score":0.29919999837875366},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.29809999465942383},{"id":"https://openalex.org/C133462117","wikidata":"https://www.wikidata.org/wiki/Q4929239","display_name":"Data collection","level":2,"score":0.28450000286102295},{"id":"https://openalex.org/C132758656","wikidata":"https://www.wikidata.org/wiki/Q5307365","display_name":"Dreyfus model of skill acquisition","level":2,"score":0.28189998865127563},{"id":"https://openalex.org/C511192102","wikidata":"https://www.wikidata.org/wiki/Q5156948","display_name":"Comprehension","level":2,"score":0.2786000072956085},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.27730000019073486},{"id":"https://openalex.org/C197947376","wikidata":"https://www.wikidata.org/wiki/Q5155608","display_name":"Comparability","level":2,"score":0.2718000113964081},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.2515000104904175}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.04323","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.04323","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.04323","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.04323","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"display_name":"Quality Education","score":0.8757489323616028,"id":"https://metadata.un.org/sdg/4"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Agent":[0],"skills,":[1],"which":[2],"are":[3,36,127,177],"reusable,":[4],"domain-specific":[5],"knowledge":[6],"artifacts,":[7],"have":[8,56,113],"become":[9,135],"a":[10,103],"popular":[11],"mechanism":[12],"for":[13,44,59,77,225],"extending":[14],"LLM-based":[15,226],"agents,":[16],"yet":[17],"formally":[18],"benchmarking":[19,27],"skill":[20,26,90,155],"usage":[21],"performance":[22,129,172],"remains":[23],"scarce.":[24],"Existing":[25],"efforts":[28],"focus":[29],"on":[30,64,192],"overly":[31],"idealized":[32],"conditions,":[33],"where":[34,97,195],"LLMs":[35],"directly":[37],"provided":[38],"with":[39,138],"hand-crafted,":[40],"narrowly-tailored":[41],"task-specific":[42],"skills":[43,63,72,101,109,126,176,224],"each":[45],"task,":[46],"whereas":[47],"in":[48,144],"many":[49],"realistic":[50,95],"settings,":[51,96],"the":[52,69,78,85,123,145,174,186,198,217,220],"LLM":[53],"agent":[54],"may":[55,73,111],"to":[57,115,207],"search":[58],"and":[60,67,110,160,163,181,190,219],"select":[61],"relevant":[62],"its":[65],"own,":[66],"even":[68],"closest":[70],"matching":[71],"not":[74,112],"be":[75],"well-tailored":[76],"task.":[79],"In":[80],"this":[81,151],"paper,":[82],"we":[83,153,164],"conduct":[84],"first":[86],"comprehensive":[87],"study":[88,154],"of":[89,106,125,178,188,201,223],"utility":[91],"under":[92],"progressively":[93],"challenging":[94,147],"agents":[98],"must":[99],"retrieve":[100],"from":[102,205],"large":[104],"collection":[105],"34k":[107],"real-world":[108],"access":[114],"any":[116],"hand-curated":[117],"skills.":[118],"Our":[119,209,228],"findings":[120],"reveal":[121],"that":[122,166],"benefits":[124],"fragile:":[128],"gains":[130],"degrade":[131],"consistently":[132],"as":[133],"settings":[134],"more":[136],"realistic,":[137],"pass":[139,199],"rates":[140],"approaching":[141],"no-skill":[142],"baselines":[143],"most":[146],"scenarios.":[148],"To":[149],"narrow":[150],"gap,":[152],"refinement":[156,168,191],"strategies,":[157],"including":[158],"query-specific":[159,167],"query-agnostic":[161],"approaches,":[162],"show":[165],"substantially":[169],"recovers":[170],"lost":[171],"when":[173],"initial":[175],"reasonable":[179],"relevance":[180],"quality.":[182],"We":[183],"further":[184],"demonstrate":[185],"generality":[187],"retrieval":[189],"Terminal-Bench":[193],"2.0,":[194],"they":[196],"improve":[197],"rate":[200],"Claude":[202],"Opus":[203],"4.6":[204],"57.7%":[206],"65.5%.":[208],"results,":[210],"consistent":[211],"across":[212],"multiple":[213],"models,":[214],"highlight":[215],"both":[216],"promise":[218],"current":[221],"limitations":[222],"agents.":[227],"code":[229],"is":[230],"available":[231],"at":[232],"https://github.com/UCSB-NLP-Chang/Skill-Usage.":[233]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-04-08T00:00:00"}
