{"id":"https://openalex.org/W7151293858","doi":"https://doi.org/10.48550/arxiv.2604.03257","title":"Robust LLM Performance Certification via Constrained Maximum Likelihood Estimation","display_name":"Robust LLM Performance Certification via Constrained Maximum Likelihood Estimation","publication_year":2026,"publication_date":"2026-03-11","ids":{"openalex":"https://openalex.org/W7151293858","doi":"https://doi.org/10.48550/arxiv.2604.03257"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.03257","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.03257","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.03257","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5132598181","display_name":"Minghe Shen","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Shen, Minghe","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133109177","display_name":"Ananth Balashankar","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Balashankar, Ananth","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5079422282","display_name":"Adam Fisch","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Fisch, Adam","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5052778643","display_name":"David Madras","orcid":"https://orcid.org/0000-0001-6817-8743"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Madras, David","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5133116846","display_name":"Miguel Rodrigues","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Rodrigues, Miguel","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5132598181"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.2770000100135803,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.2770000100135803,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.16019999980926514,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13629","display_name":"Text Readability and Simplification","score":0.12150000035762787,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/benchmarking","display_name":"Benchmarking","score":0.6482999920845032},{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.578499972820282},{"id":"https://openalex.org/keywords/set","display_name":"Set (abstract data type)","score":0.5722000002861023},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.5458999872207642},{"id":"https://openalex.org/keywords/calibration","display_name":"Calibration","score":0.5372999906539917},{"id":"https://openalex.org/keywords/estimation","display_name":"Estimation","score":0.41819998621940613},{"id":"https://openalex.org/keywords/certification","display_name":"Certification","score":0.3919000029563904}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7143999934196472},{"id":"https://openalex.org/C86251818","wikidata":"https://www.wikidata.org/wiki/Q816754","display_name":"Benchmarking","level":2,"score":0.6482999920845032},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.578499972820282},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.5722000002861023},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.5458999872207642},{"id":"https://openalex.org/C165838908","wikidata":"https://www.wikidata.org/wiki/Q736777","display_name":"Calibration","level":2,"score":0.5372999906539917},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.42480000853538513},{"id":"https://openalex.org/C96250715","wikidata":"https://www.wikidata.org/wiki/Q965330","display_name":"Estimation","level":2,"score":0.41819998621940613},{"id":"https://openalex.org/C46304622","wikidata":"https://www.wikidata.org/wiki/Q374814","display_name":"Certification","level":2,"score":0.3919000029563904},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.367000013589859},{"id":"https://openalex.org/C167928553","wikidata":"https://www.wikidata.org/wiki/Q1376021","display_name":"Estimation theory","level":2,"score":0.33070001006126404},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3197999894618988},{"id":"https://openalex.org/C77618280","wikidata":"https://www.wikidata.org/wiki/Q1155772","display_name":"Scheme (mathematics)","level":2,"score":0.31349998712539673},{"id":"https://openalex.org/C2779304628","wikidata":"https://www.wikidata.org/wiki/Q3503480","display_name":"Face (sociological concept)","level":2,"score":0.2906999886035919},{"id":"https://openalex.org/C41426520","wikidata":"https://www.wikidata.org/wiki/Q1192065","display_name":"Point estimation","level":2,"score":0.28189998865127563},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.27799999713897705},{"id":"https://openalex.org/C120936955","wikidata":"https://www.wikidata.org/wiki/Q2155640","display_name":"Empirical research","level":2,"score":0.26750001311302185},{"id":"https://openalex.org/C12725497","wikidata":"https://www.wikidata.org/wiki/Q810247","display_name":"Baseline (sea)","level":2,"score":0.2502000033855438}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.03257","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.03257","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.03257","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.03257","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"The":[0],"ability":[1],"to":[2,53,160],"rigorously":[3],"estimate":[4],"the":[5,154],"failure":[6,55,135],"rates":[7,136],"of":[8,82,157],"large":[9,80],"language":[10],"models":[11],"(LLMs)":[12],"is":[13],"a":[14,25,47,72,79,108,161,166],"prerequisite":[15],"for":[16],"their":[17],"safe":[18],"deployment.":[19],"Currently,":[20],"however,":[21],"practitioners":[22],"often":[23],"face":[24],"tradeoff":[26],"between":[27],"expensive":[28],"human":[29],"gold":[30],"standards":[31],"and":[32,50,133,145,169],"potentially":[33],"severely-biased":[34],"automatic":[35],"annotation":[36],"schemes":[37],"such":[38],"as":[39],"\"LLM-as-a-Judge\"":[40],"labeling.":[41],"In":[42],"this":[43],"paper,":[44],"we":[45,164],"propose":[46],"new,":[48],"practical,":[49],"efficient":[51],"approach":[52,106],"LLM":[54,134,173],"rate":[56],"estimation":[57,62],"based":[58],"on":[59,99],"constrained":[60,139],"maximum-likelihood":[61],"(MLE).":[63],"Our":[64],"method":[65],"integrates":[66],"three":[67],"distinct":[68],"signal":[69],"sources:":[70],"(i)":[71],"small,":[73],"high-quality":[74],"human-labeled":[75],"calibration":[76,130],"set,":[77],"(ii)":[78],"corpus":[81],"LLM-judge":[83],"annotations,":[84],"and,":[85],"most":[86],"importantly,":[87],"(iii)":[88],"additional":[89],"side":[90],"information":[91],"via":[92],"domain-specific":[93],"constraints":[94],"derived":[95],"from":[96],"known":[97],"bounds":[98],"judge":[100,128],"performance":[101],"statistics.":[102],"We":[103],"validate":[104],"our":[105,138],"through":[107],"comprehensive":[109],"empirical":[110],"study,":[111],"benchmarking":[112],"it":[113],"against":[114],"state-of-the-art":[115],"baselines":[116],"like":[117],"Prediction-Powered":[118],"Inference":[119],"(PPI).":[120],"Across":[121],"diverse":[122],"experimental":[123],"regimes":[124],"--":[125,137],"spanning":[126],"varying":[127],"accuracies,":[129],"set":[131],"sizes,":[132],"MLE":[140],"consistently":[141],"delivers":[142],"more":[143],"accurate":[144],"lower-variance":[146],"estimates":[147],"than":[148],"existing":[149],"methods.":[150],"By":[151],"moving":[152],"beyond":[153],"\"black-box\"":[155],"use":[156],"automated":[158],"judges":[159],"flexible":[162],"framework,":[163],"provide":[165],"principled,":[167],"interpretable,":[168],"scalable":[170],"pathway":[171],"towards":[172],"failure-rate":[174],"certification.":[175]},"counts_by_year":[],"updated_date":"2026-05-05T08:41:31.759640","created_date":"2026-04-08T00:00:00"}
