{"id":"https://openalex.org/W7155546655","doi":"https://doi.org/10.48550/arxiv.2604.21480","title":"Efficient Agent Evaluation via Diversity-Guided User Simulation","display_name":"Efficient Agent Evaluation via Diversity-Guided User Simulation","publication_year":2026,"publication_date":"2026-04-23","ids":{"openalex":"https://openalex.org/W7155546655","doi":"https://doi.org/10.48550/arxiv.2604.21480"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.21480","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.21480","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.21480","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5114602051","display_name":"Itay Nakash","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Nakash, Itay","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5057593631","display_name":"George Kour","orcid":"https://orcid.org/0000-0002-8908-3134"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kour, George","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5078882226","display_name":"Ateret Anaby-Tavor","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Anaby-Tavor, Ateret","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":3,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.2184000015258789,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.2184000015258789,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.11829999834299088,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11704","display_name":"Mobile Crowdsensing and Crowdsourcing","score":0.051100000739097595,"subfield":{"id":"https://openalex.org/subfields/1706","display_name":"Computer Science Applications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/reuse","display_name":"Reuse","score":0.6003999710083008},{"id":"https://openalex.org/keywords/security-token","display_name":"Security token","score":0.5394999980926514},{"id":"https://openalex.org/keywords/set","display_name":"Set (abstract data type)","score":0.4690000116825104},{"id":"https://openalex.org/keywords/reliability","display_name":"Reliability (semiconductor)","score":0.4442000091075897},{"id":"https://openalex.org/keywords/conversation","display_name":"Conversation","score":0.4300999939441681},{"id":"https://openalex.org/keywords/prefix","display_name":"Prefix","score":0.40689998865127563},{"id":"https://openalex.org/keywords/monte-carlo-method","display_name":"Monte Carlo method","score":0.36820000410079956}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8240000009536743},{"id":"https://openalex.org/C206588197","wikidata":"https://www.wikidata.org/wiki/Q846574","display_name":"Reuse","level":2,"score":0.6003999710083008},{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.5394999980926514},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.4690000116825104},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.461899995803833},{"id":"https://openalex.org/C43214815","wikidata":"https://www.wikidata.org/wiki/Q7310987","display_name":"Reliability (semiconductor)","level":3,"score":0.4442000091075897},{"id":"https://openalex.org/C2777200299","wikidata":"https://www.wikidata.org/wiki/Q52943","display_name":"Conversation","level":2,"score":0.4300999939441681},{"id":"https://openalex.org/C141603448","wikidata":"https://www.wikidata.org/wiki/Q134830","display_name":"Prefix","level":2,"score":0.40689998865127563},{"id":"https://openalex.org/C19499675","wikidata":"https://www.wikidata.org/wiki/Q232207","display_name":"Monte Carlo method","level":2,"score":0.36820000410079956},{"id":"https://openalex.org/C48103436","wikidata":"https://www.wikidata.org/wiki/Q599031","display_name":"State (computer science)","level":2,"score":0.31349998712539673},{"id":"https://openalex.org/C177212765","wikidata":"https://www.wikidata.org/wiki/Q627335","display_name":"Workflow","level":2,"score":0.3061000108718872},{"id":"https://openalex.org/C41045048","wikidata":"https://www.wikidata.org/wiki/Q202843","display_name":"Linear programming","level":2,"score":0.2711000144481659},{"id":"https://openalex.org/C120936955","wikidata":"https://www.wikidata.org/wiki/Q2155640","display_name":"Empirical research","level":2,"score":0.26840001344680786},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.2678999900817871},{"id":"https://openalex.org/C200601418","wikidata":"https://www.wikidata.org/wiki/Q2193887","display_name":"Reliability engineering","level":1,"score":0.2549000084400177},{"id":"https://openalex.org/C201025465","wikidata":"https://www.wikidata.org/wiki/Q11248500","display_name":"User experience design","level":2,"score":0.2526000142097473}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.21480","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.21480","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.21480","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.21480","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Large":[0],"language":[1],"models":[2],"(LLMs)":[3],"are":[4,168],"increasingly":[5],"deployed":[6],"as":[7],"customer-facing":[8],"agents,":[9],"yet":[10],"evaluating":[11],"their":[12],"reliability":[13],"remains":[14],"challenging":[15],"due":[16],"to":[17,34,51,154],"stochastic,":[18],"multi-turn":[19],"interactions.":[20,83],"Current":[21],"evaluation":[22,130],"protocols":[23],"rely":[24],"on":[25,131,165],"linear":[26,156],"Monte":[27],"Carlo":[28],"rollouts":[29],"of":[30,69,81,102,124,163],"complete":[31],"agent-user":[32,82],"conversations":[33],"estimate":[35],"success.":[36],"However,":[37],"this":[38],"approach":[39],"is":[40],"computationally":[41],"inefficient,":[42],"repeatedly":[43],"regenerating":[44],"identical":[45],"early":[46],"prefixes,":[47],"and":[48,94,106,134,141],"often":[49],"fails":[50],"uncover":[52],"deep":[53],"failure":[54],"modes":[55],"that":[56,146],"arise":[57],"from":[58,97],"rare":[59],"user":[60,75,119],"behaviors.":[61],"We":[62],"introduce":[63],"DIVERT":[64,84,137],"(Diversity-Induced":[65],"Evaluation":[66],"via":[67],"Branching":[68],"Trajectories),":[70],"an":[71],"efficient,":[72],"snapshot-based,":[73],"coverage-guided":[74],"simulation":[76],"framework":[77,114],"for":[78],"systematic":[79],"exploration":[80,123],"captures":[85],"the":[86,113,161],"full":[87],"agent-environment":[88],"state":[89],"at":[90],"critical":[91],"decision":[92],"points":[93],"resumes":[95],"execution":[96],"these":[98],"snapshots,":[99],"enabling":[100],"reuse":[101],"shared":[103],"conversation":[104],"prefixes":[105],"reducing":[107],"redundant":[108],"computation.":[109],"From":[110],"each":[111],"junction,":[112],"branches":[115],"using":[116],"targeted,":[117],"diversity-inducing":[118],"responses,":[120],"allowing":[121],"directed":[122],"alternative":[125],"interaction":[126],"paths.":[127],"By":[128],"focusing":[129],"semantically":[132],"diverse":[133],"underexplored":[135],"trajectories,":[136],"improves":[138],"both":[139],"efficiency":[140],"coverage.":[142],"Empirical":[143],"results":[144],"show":[145],"it":[147],"discovers":[148],"more":[149],"failures":[150,167],"per":[151],"token":[152],"compared":[153],"standard":[155],"rollout":[157],"protocols,":[158],"while":[159],"expanding":[160],"set":[162],"tasks":[164],"which":[166],"identified.":[169]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-04-25T00:00:00"}
