{"id":"https://openalex.org/W7154497122","doi":"https://doi.org/10.48550/arxiv.2604.12312","title":"CompliBench: Benchmarking LLM Judges for Compliance Violation Detection in Dialogue Systems","display_name":"CompliBench: Benchmarking LLM Judges for Compliance Violation Detection in Dialogue Systems","publication_year":2026,"publication_date":"2026-04-14","ids":{"openalex":"https://openalex.org/W7154497122","doi":"https://doi.org/10.48550/arxiv.2604.12312"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.12312","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.12312","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.12312","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5133695449","display_name":"Jingbo Yang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Jingbo","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5004714781","display_name":"Guanyu Yao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yao, Guanyu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128680252","display_name":"Bairu Hou","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hou, Bairu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5067954036","display_name":"Xinghan Yang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Xinghan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5040402537","display_name":"Nikolai Glushnev","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Glushnev, Nikolai","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5120757953","display_name":"Iwona Bialynicka-Birula","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bialynicka-Birula, Iwona","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5110183841","display_name":"Duo Ding","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ding, Duo","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5133680859","display_name":"Shiyu Chang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chang, Shiyu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":8,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.45739999413490295,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.45739999413490295,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.07289999723434448,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12128","display_name":"AI in Service Interactions","score":0.04179999977350235,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/benchmarking","display_name":"Benchmarking","score":0.7178000211715698},{"id":"https://openalex.org/keywords/pipeline","display_name":"Pipeline (software)","score":0.5931000113487244},{"id":"https://openalex.org/keywords/process","display_name":"Process (computing)","score":0.5404999852180481},{"id":"https://openalex.org/keywords/conversation","display_name":"Conversation","score":0.5268999934196472},{"id":"https://openalex.org/keywords/adversarial-system","display_name":"Adversarial system","score":0.4970000088214874},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.48559999465942383},{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.4593999981880188},{"id":"https://openalex.org/keywords/guideline","display_name":"Guideline","score":0.41530001163482666}],"concepts":[{"id":"https://openalex.org/C86251818","wikidata":"https://www.wikidata.org/wiki/Q816754","display_name":"Benchmarking","level":2,"score":0.7178000211715698},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6625999808311462},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.5931000113487244},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.5404999852180481},{"id":"https://openalex.org/C2777200299","wikidata":"https://www.wikidata.org/wiki/Q52943","display_name":"Conversation","level":2,"score":0.5268999934196472},{"id":"https://openalex.org/C37736160","wikidata":"https://www.wikidata.org/wiki/Q1801315","display_name":"Adversarial system","level":2,"score":0.4970000088214874},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.48559999465942383},{"id":"https://openalex.org/C112930515","wikidata":"https://www.wikidata.org/wiki/Q4389547","display_name":"Risk analysis (engineering)","level":1,"score":0.46380001306533813},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.4593999981880188},{"id":"https://openalex.org/C2780182762","wikidata":"https://www.wikidata.org/wiki/Q1630279","display_name":"Guideline","level":2,"score":0.41530001163482666},{"id":"https://openalex.org/C38652104","wikidata":"https://www.wikidata.org/wiki/Q3510521","display_name":"Computer security","level":1,"score":0.3889999985694885},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.38589999079704285},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.35659998655319214},{"id":"https://openalex.org/C195094911","wikidata":"https://www.wikidata.org/wiki/Q14167904","display_name":"Process management","level":1,"score":0.3506999909877777},{"id":"https://openalex.org/C2781460075","wikidata":"https://www.wikidata.org/wiki/Q1399332","display_name":"Compliance (psychology)","level":2,"score":0.3481999933719635},{"id":"https://openalex.org/C43214815","wikidata":"https://www.wikidata.org/wiki/Q7310987","display_name":"Reliability (semiconductor)","level":3,"score":0.3465999960899353},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3458000123500824},{"id":"https://openalex.org/C2775941552","wikidata":"https://www.wikidata.org/wiki/Q25212305","display_name":"Isolation (microbiology)","level":2,"score":0.33820000290870667},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.31369999051094055},{"id":"https://openalex.org/C115903868","wikidata":"https://www.wikidata.org/wiki/Q80993","display_name":"Software engineering","level":1,"score":0.31349998712539673},{"id":"https://openalex.org/C2776321320","wikidata":"https://www.wikidata.org/wiki/Q857525","display_name":"Annotation","level":2,"score":0.3057999908924103},{"id":"https://openalex.org/C56739046","wikidata":"https://www.wikidata.org/wiki/Q192060","display_name":"Knowledge management","level":1,"score":0.29319998621940613},{"id":"https://openalex.org/C39890363","wikidata":"https://www.wikidata.org/wiki/Q36108","display_name":"Generative grammar","level":2,"score":0.2865000069141388},{"id":"https://openalex.org/C774472","wikidata":"https://www.wikidata.org/wiki/Q6760393","display_name":"Margin (machine learning)","level":2,"score":0.28119999170303345},{"id":"https://openalex.org/C155911762","wikidata":"https://www.wikidata.org/wiki/Q422321","display_name":"Blueprint","level":2,"score":0.28110000491142273},{"id":"https://openalex.org/C49937458","wikidata":"https://www.wikidata.org/wiki/Q2599292","display_name":"Probabilistic logic","level":2,"score":0.27970001101493835},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.260699987411499},{"id":"https://openalex.org/C99498987","wikidata":"https://www.wikidata.org/wiki/Q2210247","display_name":"Noise (video)","level":3,"score":0.25600001215934753}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.12312","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.12312","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.12312","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.12312","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/16","score":0.8170687556266785,"display_name":"Peace, Justice and strong institutions"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"As":[0],"Large":[1],"Language":[2],"Models":[3],"(LLMs)":[4],"are":[5,153],"increasingly":[6],"deployed":[7],"as":[8,197],"task-oriented":[9],"agents":[10],"in":[11,41,106],"enterprise":[12],"environments,":[13],"ensuring":[14],"their":[15],"strict":[16],"adherence":[17],"to":[18,54,93,100,190],"complex,":[19],"domain-specific":[20],"operational":[21],"guidelines":[22],"is":[23,29,51],"critical.":[24],"While":[25],"utilizing":[26],"an":[27,145,198],"LLM-as-a-Judge":[28],"a":[30,58,89,115,175],"promising":[31],"solution":[32],"for":[33,135,201],"scalable":[34],"evaluation,":[35],"the":[36,55,68,76,95,136,140],"reliability":[37],"of":[38,57,71,78,97],"these":[39,150],"judges":[40,99],"detecting":[42],"specific":[43],"policy":[44],"violations":[45,105],"remains":[46],"largely":[47],"unexplored.":[48],"This":[49],"gap":[50],"primarily":[52],"due":[53],"lack":[56],"systematic":[59],"data":[60,111,118,183],"generation":[61,119],"method,":[62],"which":[63],"has":[64],"been":[65],"hindered":[66],"by":[67],"extensive":[69],"cost":[70],"fine-grained":[72],"human":[73],"annotation":[74],"and":[75,102,139,187],"difficulty":[77],"synthesizing":[79],"realistic":[80],"agent":[81],"violations.":[82],"In":[83,170],"this":[84,168],"paper,":[85],"we":[86,113,172],"introduce":[87],"CompliBench,":[88],"novel":[90],"benchmark":[91],"designed":[92],"evaluate":[94],"ability":[96],"LLM":[98],"detect":[101],"localize":[103],"guideline":[104,138],"multi-turn":[107],"dialogues.":[108],"To":[109],"overcome":[110],"scarcity,":[112],"develop":[114],"scalable,":[116],"automated":[117],"pipeline":[120,196],"that":[121,160,174],"simulates":[122],"user-agent":[123],"interactions.":[124],"Our":[125,156],"controllable":[126],"flaw":[127],"injection":[128],"process":[129],"automatically":[130],"yields":[131],"precise":[132],"ground-truth":[133],"labels":[134],"violated":[137],"exact":[141],"conversation":[142],"turn,":[143],"while":[144],"adversarial":[146],"search":[147],"method":[148],"ensures":[149],"introduced":[151],"perturbations":[152],"highly":[154],"challenging.":[155],"comprehensive":[157],"evaluation":[158],"reveals":[159],"current":[161],"state-of-the-art":[162],"proprietary":[163],"LLMs":[164,186],"struggle":[165],"significantly":[166],"with":[167],"task.":[169],"addition,":[171],"demonstrate":[173],"small-scale":[176],"judge":[177],"model":[178],"fine-tuned":[179],"on":[180],"our":[181,195],"synthesized":[182],"outperforms":[184],"leading":[185],"generalizes":[188],"well":[189],"unseen":[191],"business":[192],"domains,":[193],"highlighting":[194],"effective":[199],"foundation":[200],"training":[202],"robust":[203],"generative":[204],"reward":[205],"models.":[206]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-04-16T00:00:00"}
