{"id":"https://openalex.org/W7154483052","doi":"https://doi.org/10.48550/arxiv.2604.12191","title":"Beyond Scores: Diagnostic LLM Evaluation via Fine-Grained Abilities","display_name":"Beyond Scores: Diagnostic LLM Evaluation via Fine-Grained Abilities","publication_year":2026,"publication_date":"2026-04-14","ids":{"openalex":"https://openalex.org/W7154483052","doi":"https://doi.org/10.48550/arxiv.2604.12191"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.12191","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.12191","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.12191","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5133721166","display_name":"Xu Zhang","orcid":"https://orcid.org/0009-0003-6736-4941"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Zhang, Xu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133664528","display_name":"Xudong Gong","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gong, Xudong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133712453","display_name":"Jiacheng Qin","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Qin, Jiacheng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133665221","display_name":"Qiang Wang","orcid":"https://orcid.org/0000-0001-6686-6630"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Qiang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133628233","display_name":"JiaQi Liao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liao, JiaQi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100775481","display_name":"Zhe Wang","orcid":"https://orcid.org/0000-0002-2231-1606"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Zhe","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5040096562","display_name":"Dawei Feng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Feng, Dawei","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5133701059","display_name":"Bo Ding","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ding, Bo","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":8,"corresponding_author_ids":["https://openalex.org/A5133721166"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11902","display_name":"Intelligent Tutoring Systems and Adaptive Learning","score":0.39500001072883606,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11902","display_name":"Intelligent Tutoring Systems and Adaptive Learning","score":0.39500001072883606,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10467","display_name":"Psychometric Methodologies and Testing","score":0.2134999930858612,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.05310000106692314,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.5507000088691711},{"id":"https://openalex.org/keywords/cognition","display_name":"Cognition","score":0.4474000036716461},{"id":"https://openalex.org/keywords/construct","display_name":"Construct (python library)","score":0.4426000118255615},{"id":"https://openalex.org/keywords/limiting","display_name":"Limiting","score":0.43320000171661377},{"id":"https://openalex.org/keywords/selection","display_name":"Selection (genetic algorithm)","score":0.4318000078201294},{"id":"https://openalex.org/keywords/model-selection","display_name":"Model selection","score":0.3939000070095062},{"id":"https://openalex.org/keywords/domain","display_name":"Domain (mathematical analysis)","score":0.36410000920295715},{"id":"https://openalex.org/keywords/taxonomy","display_name":"Taxonomy (biology)","score":0.34599998593330383}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.586899995803833},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.5705000162124634},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.5507000088691711},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5356000065803528},{"id":"https://openalex.org/C169900460","wikidata":"https://www.wikidata.org/wiki/Q2200417","display_name":"Cognition","level":2,"score":0.4474000036716461},{"id":"https://openalex.org/C2780801425","wikidata":"https://www.wikidata.org/wiki/Q5164392","display_name":"Construct (python library)","level":2,"score":0.4426000118255615},{"id":"https://openalex.org/C188198153","wikidata":"https://www.wikidata.org/wiki/Q1613840","display_name":"Limiting","level":2,"score":0.43320000171661377},{"id":"https://openalex.org/C81917197","wikidata":"https://www.wikidata.org/wiki/Q628760","display_name":"Selection (genetic algorithm)","level":2,"score":0.4318000078201294},{"id":"https://openalex.org/C93959086","wikidata":"https://www.wikidata.org/wiki/Q6888345","display_name":"Model selection","level":2,"score":0.3939000070095062},{"id":"https://openalex.org/C36503486","wikidata":"https://www.wikidata.org/wiki/Q11235244","display_name":"Domain (mathematical analysis)","level":2,"score":0.36410000920295715},{"id":"https://openalex.org/C58642233","wikidata":"https://www.wikidata.org/wiki/Q8269924","display_name":"Taxonomy (biology)","level":2,"score":0.34599998593330383},{"id":"https://openalex.org/C4679612","wikidata":"https://www.wikidata.org/wiki/Q866298","display_name":"Aggregate (composite)","level":2,"score":0.34139999747276306},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.31700000166893005},{"id":"https://openalex.org/C12725497","wikidata":"https://www.wikidata.org/wiki/Q810247","display_name":"Baseline (sea)","level":2,"score":0.3073999881744385},{"id":"https://openalex.org/C136197465","wikidata":"https://www.wikidata.org/wiki/Q1729295","display_name":"Variety (cybernetics)","level":2,"score":0.29910001158714294},{"id":"https://openalex.org/C66024118","wikidata":"https://www.wikidata.org/wiki/Q1122506","display_name":"Computational model","level":2,"score":0.29089999198913574},{"id":"https://openalex.org/C168167062","wikidata":"https://www.wikidata.org/wiki/Q1117970","display_name":"Component (thermodynamics)","level":2,"score":0.27619999647140503},{"id":"https://openalex.org/C2776085556","wikidata":"https://www.wikidata.org/wiki/Q183361","display_name":"Chen","level":2,"score":0.275299996137619},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.2702000141143799},{"id":"https://openalex.org/C142853389","wikidata":"https://www.wikidata.org/wiki/Q744778","display_name":"Association (psychology)","level":2,"score":0.26930001378059387},{"id":"https://openalex.org/C119653847","wikidata":"https://www.wikidata.org/wiki/Q1327780","display_name":"Elementary cognitive task","level":3,"score":0.2637999951839447},{"id":"https://openalex.org/C2776434776","wikidata":"https://www.wikidata.org/wiki/Q19246213","display_name":"Domain adaptation","level":3,"score":0.2549999952316284}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.12191","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.12191","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.12191","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.12191","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"score":0.8593518733978271,"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Current":[0],"evaluations":[1],"of":[2,84,90,110,164],"large":[3],"language":[4],"models":[5],"aggregate":[6],"performance":[7,85,142],"across":[8,43,105,127,136],"diverse":[9],"tasks":[10],"into":[11],"single":[12],"scores.":[13],"This":[14,155],"obscures":[15],"fine-grained":[16,45,76,162],"ability":[17,53,77,103],"variation,":[18],"limiting":[19],"targeted":[20,170],"model":[21,41,173],"improvement":[22],"and":[23,59,107,122,150,175],"ability-guided":[24,172],"selection":[25],"for":[26,161],"specific":[27],"tasks.":[28],"Motivated":[29],"by":[30],"this":[31],"gap,":[32],"we":[33,49],"propose":[34],"a":[35,51,158],"cognitive":[36,57],"diagnostic":[37,141],"framework":[38,63,134,160],"that":[39],"estimates":[40,104],"abilities":[42],"multiple":[44],"dimensions.":[46],"For":[47],"mathematics,":[48],"construct":[50],"35-dimensional":[52],"taxonomy":[54],"grounded":[55],"in":[56,80,143,169],"theory":[58],"domain":[60],"knowledge.":[61],"The":[62,133],"employs":[64],"multidimensional":[65],"Item":[66],"Response":[67],"Theory":[68],"with":[69,113,166],"an":[70],"item-ability":[71],"association":[72],"matrix":[73],"to":[74,118,125],"estimate":[75],"levels,":[78],"which":[79],"turn":[81],"enable":[82],"prediction":[83,109],"on":[86,93],"unseen":[87,111],"items":[88,112],"(questions":[89],"benchmark).":[91],"Evaluated":[92],"41":[94],"models,":[95],"our":[96],"approach":[97],"demonstrates":[98],"strong":[99],"criterion":[100],"validity,":[101],"consistent":[102,140],"benchmarks,":[106,128],"accurate":[108],"AUC":[114],"ranging":[115],"from":[116,123],"0.80":[117],"0.89":[119],"within":[120],"benchmarks":[121],"0.77":[124],"0.86":[126],"substantially":[129],"exceeding":[130],"trivial":[131],"baselines.":[132],"generalizes":[135],"scientific":[137],"domains,":[138],"producing":[139],"physics":[144],"(27":[145],"dimensions),":[146,149],"chemistry":[147],"(58":[148],"computer":[151],"science":[152],"(12":[153],"dimensions).":[154],"work":[156],"establishes":[157],"principled":[159],"assessment":[163],"abilities,":[165],"potential":[167],"applications":[168],"training,":[171],"selection,":[174],"ability-aware":[176],"benchmark":[177],"design.":[178]},"counts_by_year":[],"updated_date":"2026-04-29T09:16:38.111599","created_date":"2026-04-16T00:00:00"}
