{"id":"https://openalex.org/W7127617428","doi":"https://doi.org/10.48550/arxiv.2602.02515","title":"CreditAudit: 2$^\\text{nd}$ Dimension for LLM Evaluation and Selection","display_name":"CreditAudit: 2$^\\text{nd}$ Dimension for LLM Evaluation and Selection","publication_year":2026,"publication_date":"2026-01-23","ids":{"openalex":"https://openalex.org/W7127617428","doi":"https://doi.org/10.48550/arxiv.2602.02515"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2602.02515","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2602.02515","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2602.02515","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5101557905","display_name":"Y Song","orcid":"https://orcid.org/0000-0003-4369-1289"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Song, Yiliang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5045076894","display_name":"Hongjun An","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"An, Hongjun","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5124985787","display_name":"Jiangong Xiao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xiao, Jiangong","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5125051861","display_name":"Haofei Zhao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhao, Haofei","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5124996836","display_name":"Jiawei Shao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shao, Jiawei","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5125036710","display_name":"Xuelong Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Xuelong","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5101557905"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12127","display_name":"Software System Performance and Reliability","score":0.8799999952316284,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12127","display_name":"Software System Performance and Reliability","score":0.8799999952316284,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10714","display_name":"Software-Defined Networks and 5G","score":0.022600000724196434,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.007499999832361937,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/software-deployment","display_name":"Software deployment","score":0.4909999966621399},{"id":"https://openalex.org/keywords/prioritization","display_name":"Prioritization","score":0.4523000121116638},{"id":"https://openalex.org/keywords/selection","display_name":"Selection (genetic algorithm)","score":0.4334999918937683},{"id":"https://openalex.org/keywords/audit","display_name":"Audit","score":0.421099990606308},{"id":"https://openalex.org/keywords/model-selection","display_name":"Model selection","score":0.40790000557899475},{"id":"https://openalex.org/keywords/stability","display_name":"Stability (learning theory)","score":0.3743000030517578},{"id":"https://openalex.org/keywords/reinforcement-learning","display_name":"Reinforcement learning","score":0.36809998750686646},{"id":"https://openalex.org/keywords/adversarial-system","display_name":"Adversarial system","score":0.36410000920295715}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6388999819755554},{"id":"https://openalex.org/C105339364","wikidata":"https://www.wikidata.org/wiki/Q2297740","display_name":"Software deployment","level":2,"score":0.4909999966621399},{"id":"https://openalex.org/C2777615720","wikidata":"https://www.wikidata.org/wiki/Q11888847","display_name":"Prioritization","level":2,"score":0.4523000121116638},{"id":"https://openalex.org/C81917197","wikidata":"https://www.wikidata.org/wiki/Q628760","display_name":"Selection (genetic algorithm)","level":2,"score":0.4334999918937683},{"id":"https://openalex.org/C199521495","wikidata":"https://www.wikidata.org/wiki/Q181487","display_name":"Audit","level":2,"score":0.421099990606308},{"id":"https://openalex.org/C93959086","wikidata":"https://www.wikidata.org/wiki/Q6888345","display_name":"Model selection","level":2,"score":0.40790000557899475},{"id":"https://openalex.org/C112972136","wikidata":"https://www.wikidata.org/wiki/Q7595718","display_name":"Stability (learning theory)","level":2,"score":0.3743000030517578},{"id":"https://openalex.org/C97541855","wikidata":"https://www.wikidata.org/wiki/Q830687","display_name":"Reinforcement learning","level":2,"score":0.36809998750686646},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.36559998989105225},{"id":"https://openalex.org/C37736160","wikidata":"https://www.wikidata.org/wiki/Q1801315","display_name":"Adversarial system","level":2,"score":0.36410000920295715},{"id":"https://openalex.org/C2780385302","wikidata":"https://www.wikidata.org/wiki/Q367158","display_name":"Protocol (science)","level":3,"score":0.35370001196861267},{"id":"https://openalex.org/C118671147","wikidata":"https://www.wikidata.org/wiki/Q578714","display_name":"Quantile","level":2,"score":0.3255999982357025},{"id":"https://openalex.org/C153701036","wikidata":"https://www.wikidata.org/wiki/Q659974","display_name":"Trustworthiness","level":2,"score":0.3098999857902527},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3077000081539154},{"id":"https://openalex.org/C2776654903","wikidata":"https://www.wikidata.org/wiki/Q2601463","display_name":"SAFER","level":2,"score":0.30329999327659607},{"id":"https://openalex.org/C91602232","wikidata":"https://www.wikidata.org/wiki/Q756115","display_name":"Volatility (finance)","level":2,"score":0.28619998693466187},{"id":"https://openalex.org/C33676613","wikidata":"https://www.wikidata.org/wiki/Q13415176","display_name":"Dimension (graph theory)","level":2,"score":0.28060001134872437},{"id":"https://openalex.org/C149782125","wikidata":"https://www.wikidata.org/wiki/Q160039","display_name":"Econometrics","level":1,"score":0.2775999903678894},{"id":"https://openalex.org/C49937458","wikidata":"https://www.wikidata.org/wiki/Q2599292","display_name":"Probabilistic logic","level":2,"score":0.2773999869823456},{"id":"https://openalex.org/C112930515","wikidata":"https://www.wikidata.org/wiki/Q4389547","display_name":"Risk analysis (engineering)","level":1,"score":0.274399995803833},{"id":"https://openalex.org/C2778414658","wikidata":"https://www.wikidata.org/wiki/Q1409206","display_name":"Model risk","level":3,"score":0.2574000060558319},{"id":"https://openalex.org/C81293917","wikidata":"https://www.wikidata.org/wiki/Q4189534","display_name":"System deployment","level":3,"score":0.25429999828338623}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2602.02515","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2602.02515","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2602.02515","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2602.02515","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Leaderboard":[0],"scores":[1,24],"on":[2,137],"public":[3],"benchmarks":[4],"have":[5],"been":[6],"steadily":[7],"rising":[8],"and":[9,39,46,85,102,112,140,155,173,185,191,197],"converging,":[10],"with":[11,128,146],"many":[12],"frontier":[13],"language":[14,176],"models":[15,78,145],"now":[16],"separated":[17],"by":[18],"only":[19],"marginal":[20],"differences.":[21],"However,":[22],"these":[23],"often":[25],"fail":[26],"to":[27,31,65,122],"match":[28],"users'":[29],"day":[30,32],"experience,":[33],"because":[34],"system":[35,88],"prompts,":[36],"output":[37],"protocols,":[38],"interaction":[40],"modes":[41],"evolve":[42],"under":[43,79],"routine":[44],"iteration,":[45],"in":[47,162],"agentic":[48,163],"multi":[49],"step":[50],"pipelines":[51],"small":[52],"protocol":[53],"shifts":[54],"can":[55,150,158],"trigger":[56],"disproportionate":[57],"failures,":[58],"leaving":[59],"practitioners":[60],"uncertain":[61],"about":[62],"which":[63],"model":[64,126,199],"deploy.":[66],"We":[67],"propose":[68],"CreditAudit,":[69],"a":[70,80,108,171],"deployment":[71,184],"oriented":[72],"credit":[73,118],"audit":[74],"framework":[75],"that":[76,130,144],"evaluates":[77],"family":[81],"of":[82,189],"semantically":[83],"aligned":[84],"non":[86],"adversarial":[87],"prompt":[89],"templates":[90],"across":[91,100],"multiple":[92],"benchmarks,":[93],"reporting":[94],"mean":[95,148],"ability":[96,149],"as":[97,107],"average":[98],"performance":[99],"scenarios":[101],"scenario":[103],"induced":[104],"fluctuation":[105],"sigma":[106],"stability":[109,156],"risk":[110,157],"signal,":[111],"further":[113],"mapping":[114],"volatility":[115],"into":[116],"interpretable":[117],"grades":[119],"from":[120],"AAA":[121],"BBB":[123],"via":[124],"cross":[125],"quantiles":[127],"diagnostics":[129],"mitigate":[131],"template":[132],"difficulty":[133],"drift.":[134],"Controlled":[135],"experiments":[136],"GPQA,":[138],"TruthfulQA,":[139],"MMLU":[141],"Pro":[142],"show":[143],"similar":[147],"exhibit":[151],"substantially":[152],"different":[153],"fluctuation,":[154],"overturn":[159],"prioritization":[160],"decisions":[161],"or":[164],"high":[165],"failure":[166],"cost":[167],"regimes.":[168],"By":[169],"providing":[170],"2D":[172],"grade":[174],"based":[175],"for":[177,201],"regime":[178],"specific":[179],"selection,":[180],"CreditAudit":[181],"supports":[182],"tiered":[183],"more":[186,195],"disciplined":[187],"allocation":[188],"testing":[190],"monitoring":[192],"effort,":[193],"enabling":[194],"objective":[196],"trustworthy":[198],"evaluation":[200],"real":[202],"world":[203],"use.":[204]},"counts_by_year":[],"updated_date":"2026-02-07T06:11:34.122080","created_date":"2026-02-06T00:00:00"}
