{"id":"https://openalex.org/W4412944686","doi":"https://doi.org/10.18653/v1/2025.findings-acl.1205","title":"The Behavior Gap: Evaluating Zero-shot LLM Agents in Complex Task-Oriented Dialogs","display_name":"The Behavior Gap: Evaluating Zero-shot LLM Agents in Complex Task-Oriented Dialogs","publication_year":2025,"publication_date":"2025-01-01","ids":{"openalex":"https://openalex.org/W4412944686","doi":"https://doi.org/10.18653/v1/2025.findings-acl.1205"},"language":"en","primary_location":{"id":"doi:10.18653/v1/2025.findings-acl.1205","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2025.findings-acl.1205","pdf_url":"https://aclanthology.org/2025.findings-acl.1205.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Findings of the Association for Computational Linguistics: ACL 2025","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://aclanthology.org/2025.findings-acl.1205.pdf","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5119204324","display_name":"Avinash Baidya","orcid":null},"institutions":[{"id":"https://openalex.org/I88773910","display_name":"Intuit (United States)","ror":"https://ror.org/049mrbr98","country_code":"US","type":"company","lineage":["https://openalex.org/I88773910"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Avinash Baidya","raw_affiliation_strings":["Intuit AI Research 2700 Coast Avenue , Mountain View , CA 94043"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Intuit AI Research 2700 Coast Avenue , Mountain View , CA 94043","institution_ids":["https://openalex.org/I88773910"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Kamalika Das","orcid":null},"institutions":[{"id":"https://openalex.org/I88773910","display_name":"Intuit (United States)","ror":"https://ror.org/049mrbr98","country_code":"US","type":"company","lineage":["https://openalex.org/I88773910"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Kamalika Das","raw_affiliation_strings":["Intuit AI Research 2700 Coast Avenue , Mountain View , CA 94043"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Intuit AI Research 2700 Coast Avenue , Mountain View , CA 94043","institution_ids":["https://openalex.org/I88773910"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5105405993","display_name":"Xiang Gao","orcid":"https://orcid.org/0000-0001-9912-2101"},"institutions":[{"id":"https://openalex.org/I88773910","display_name":"Intuit (United States)","ror":"https://ror.org/049mrbr98","country_code":"US","type":"company","lineage":["https://openalex.org/I88773910"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Xiang Gao","raw_affiliation_strings":["Intuit AI Research 2700 Coast Avenue , Mountain View , CA 94043"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Intuit AI Research 2700 Coast Avenue , Mountain View , CA 94043","institution_ids":["https://openalex.org/I88773910"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":3.5175,"has_fulltext":true,"cited_by_count":2,"citation_normalized_percentile":{"value":0.93096734,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":98,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"23455","last_page":"23472"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10456","display_name":"Multi-Agent Systems and Negotiation","score":0.9193999767303467,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10456","display_name":"Multi-Agent Systems and Negotiation","score":0.9193999767303467,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9193000197410583,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/shot","display_name":"Shot (pellet)","score":0.7343078255653381},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.6900567412376404},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6887890696525574},{"id":"https://openalex.org/keywords/zero","display_name":"Zero (linguistics)","score":0.667185366153717},{"id":"https://openalex.org/keywords/one-shot","display_name":"One shot","score":0.48676085472106934},{"id":"https://openalex.org/keywords/engineering","display_name":"Engineering","score":0.10942569375038147},{"id":"https://openalex.org/keywords/materials-science","display_name":"Materials science","score":0.0805254876613617},{"id":"https://openalex.org/keywords/mechanical-engineering","display_name":"Mechanical engineering","score":0.07742291688919067},{"id":"https://openalex.org/keywords/systems-engineering","display_name":"Systems engineering","score":0.07527175545692444}],"concepts":[{"id":"https://openalex.org/C2778344882","wikidata":"https://www.wikidata.org/wiki/Q278938","display_name":"Shot (pellet)","level":2,"score":0.7343078255653381},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.6900567412376404},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6887890696525574},{"id":"https://openalex.org/C2780813799","wikidata":"https://www.wikidata.org/wiki/Q3274237","display_name":"Zero (linguistics)","level":2,"score":0.667185366153717},{"id":"https://openalex.org/C2992734406","wikidata":"https://www.wikidata.org/wiki/Q413267","display_name":"One shot","level":2,"score":0.48676085472106934},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.10942569375038147},{"id":"https://openalex.org/C192562407","wikidata":"https://www.wikidata.org/wiki/Q228736","display_name":"Materials science","level":0,"score":0.0805254876613617},{"id":"https://openalex.org/C78519656","wikidata":"https://www.wikidata.org/wiki/Q101333","display_name":"Mechanical engineering","level":1,"score":0.07742291688919067},{"id":"https://openalex.org/C201995342","wikidata":"https://www.wikidata.org/wiki/Q682496","display_name":"Systems engineering","level":1,"score":0.07527175545692444},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.0},{"id":"https://openalex.org/C191897082","wikidata":"https://www.wikidata.org/wiki/Q11467","display_name":"Metallurgy","level":1,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.18653/v1/2025.findings-acl.1205","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2025.findings-acl.1205","pdf_url":"https://aclanthology.org/2025.findings-acl.1205.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Findings of the Association for Computational Linguistics: ACL 2025","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.18653/v1/2025.findings-acl.1205","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2025.findings-acl.1205","pdf_url":"https://aclanthology.org/2025.findings-acl.1205.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Findings of the Association for Computational Linguistics: ACL 2025","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":true,"grobid_xml":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4412944686.pdf","grobid_xml":"https://content.openalex.org/works/W4412944686.grobid-xml"},"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W2497720472","https://openalex.org/W4292659306","https://openalex.org/W3044321615","https://openalex.org/W2806221744","https://openalex.org/W2326937258","https://openalex.org/W394267150","https://openalex.org/W2773965352","https://openalex.org/W4294892107","https://openalex.org/W2357748469","https://openalex.org/W2392917037"],"abstract_inverted_index":{"Large":[0],"Language":[1],"Model":[2],"(LLM)-based":[3],"agents":[4,52],"have":[5],"significantly":[6],"impacted":[7],"Task-Oriented":[8],"Dialog":[9],"Systems":[10],"(TODS)":[11],"but":[12],"continue":[13],"to":[14,45,95,152,171],"face":[15],"notable":[16],"performance":[17,28,35,80,100,154],"challenges,":[18],"especially":[19],"in":[20,59,109,178],"zero-shot":[21],"scenarios.While":[22],"prior":[23],"work":[24],"has":[25],"noted":[26],"this":[27,70],"gap,":[29],"the":[30,34,47,79,88,105,113,161,173],"behavioral":[31,165],"factors":[32],"driving":[33],"gap":[36,49,72,90],"remain":[37],"underexplored.This":[38],"study":[39,159],"proposes":[40],"a":[41,74,96,137],"comprehensive":[42,164],"evaluation":[43],"framework":[44],"quantify":[46],"behavior":[48,71,89,149],"between":[50],"AI":[51],"and":[53,64,131,142,167],"human":[54,120],"experts,":[55],"focusing":[56],"on":[57,101,157],"discrepancies":[58],"dialog":[60,127],"acts,":[61],"tool":[62,134],"usage,":[63],"knowledge":[65],"utilization.Our":[66],"findings":[67],"reveal":[68],"that":[69],"is":[73],"critical":[75],"factor":[76],"negatively":[77],"impacting":[78],"of":[81,98,140,145,163,175],"LLM":[82],"agents.Notably,":[83],"as":[84],"task":[85,108],"complexity":[86],"increases,":[87],"widens":[91],"(correlation:":[92],"0.963),":[93],"leading":[94],"degradation":[97],"agent":[99,115],"complex":[102,107,180],"task-oriented":[103],"dialogs.For":[104],"most":[106],"our":[110],"study,":[111],"even":[112],"GPT-4obased":[114],"exhibits":[116],"low":[117,123],"alignment":[118,169],"with":[119,122,136],"behavior,":[121],"F1":[124,138],"scores":[125],"for":[126],"acts":[128],"(0.464),":[129],"excessive":[130],"often":[132],"misaligned":[133],"usage":[135,144],"score":[139],"0.139,":[141],"ineffective":[143],"external":[146],"knowledge.Reducing":[147],"such":[148],"gaps":[150],"leads":[151],"significant":[153],"improvement":[155],"(24.3%":[156],"average).This":[158],"highlights":[160],"importance":[162],"evaluations":[166],"improved":[168],"strategies":[170],"enhance":[172],"effectiveness":[174],"LLMbased":[176],"TODS":[177],"handling":[179],"tasks.":[181],"1":[182]},"counts_by_year":[{"year":2026,"cited_by_count":2}],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-10T00:00:00"}
