{"id":"https://openalex.org/W7136145306","doi":"https://doi.org/10.3390/make8030074","title":"Towards Reliable LLM Grading Through Self-Consistency and Selective Human Review: Higher Accuracy, Less Work","display_name":"Towards Reliable LLM Grading Through Self-Consistency and Selective Human Review: Higher Accuracy, Less Work","publication_year":2026,"publication_date":"2026-03-16","ids":{"openalex":"https://openalex.org/W7136145306","doi":"https://doi.org/10.3390/make8030074"},"language":"en","primary_location":{"id":"doi:10.3390/make8030074","is_oa":true,"landing_page_url":"https://doi.org/10.3390/make8030074","pdf_url":"https://www.mdpi.com/2504-4990/8/3/74/pdf?version=1773669313","source":{"id":"https://openalex.org/S4210213891","display_name":"Machine Learning and Knowledge Extraction","issn_l":"2504-4990","issn":["2504-4990"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310310987","host_organization_name":"Multidisciplinary Digital Publishing Institute","host_organization_lineage":["https://openalex.org/P4310310987"],"host_organization_lineage_names":["Multidisciplinary Digital Publishing Institute"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Machine Learning and Knowledge Extraction","raw_type":"journal-article"},"type":"article","indexed_in":["crossref","doaj"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://www.mdpi.com/2504-4990/8/3/74/pdf?version=1773669313","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5114754464","display_name":"Luke Korthals","orcid":"https://orcid.org/0009-0006-2098-8679"},"institutions":[{"id":"https://openalex.org/I55106644","display_name":"Amsterdam University of Applied Sciences","ror":"https://ror.org/00y2z2s03","country_code":"NL","type":"education","lineage":["https://openalex.org/I55106644"]},{"id":"https://openalex.org/I887064364","display_name":"University of Amsterdam","ror":"https://ror.org/04dkp9463","country_code":"NL","type":"education","lineage":["https://openalex.org/I887064364"]}],"countries":["NL"],"is_corresponding":true,"raw_author_name":"Luke Korthals","raw_affiliation_strings":["Faculty of Social and Behavioural Sciences, University of Amsterdam, 1018 WB Amsterdam, The Netherlands"],"raw_orcid":"https://orcid.org/0009-0006-2098-8679","affiliations":[{"raw_affiliation_string":"Faculty of Social and Behavioural Sciences, University of Amsterdam, 1018 WB Amsterdam, The Netherlands","institution_ids":["https://openalex.org/I887064364","https://openalex.org/I55106644"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5120628299","display_name":"Emma Akrong","orcid":"https://orcid.org/0009-0003-1726-7465"},"institutions":[{"id":"https://openalex.org/I55106644","display_name":"Amsterdam University of Applied Sciences","ror":"https://ror.org/00y2z2s03","country_code":"NL","type":"education","lineage":["https://openalex.org/I55106644"]},{"id":"https://openalex.org/I887064364","display_name":"University of Amsterdam","ror":"https://ror.org/04dkp9463","country_code":"NL","type":"education","lineage":["https://openalex.org/I887064364"]}],"countries":["NL"],"is_corresponding":false,"raw_author_name":"Emma Akrong","raw_affiliation_strings":["Faculty of Social and Behavioural Sciences, University of Amsterdam, 1018 WB Amsterdam, The Netherlands"],"raw_orcid":"https://orcid.org/0009-0003-1726-7465","affiliations":[{"raw_affiliation_string":"Faculty of Social and Behavioural Sciences, University of Amsterdam, 1018 WB Amsterdam, The Netherlands","institution_ids":["https://openalex.org/I887064364","https://openalex.org/I55106644"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5125172999","display_name":"Gali Geller","orcid":null},"institutions":[{"id":"https://openalex.org/I55106644","display_name":"Amsterdam University of Applied Sciences","ror":"https://ror.org/00y2z2s03","country_code":"NL","type":"education","lineage":["https://openalex.org/I55106644"]},{"id":"https://openalex.org/I887064364","display_name":"University of Amsterdam","ror":"https://ror.org/04dkp9463","country_code":"NL","type":"education","lineage":["https://openalex.org/I887064364"]}],"countries":["NL"],"is_corresponding":false,"raw_author_name":"Gali Geller","raw_affiliation_strings":["Faculty of Social and Behavioural Sciences, University of Amsterdam, 1018 WB Amsterdam, The Netherlands"],"raw_orcid":"https://orcid.org/0009-0002-9705-1180","affiliations":[{"raw_affiliation_string":"Faculty of Social and Behavioural Sciences, University of Amsterdam, 1018 WB Amsterdam, The Netherlands","institution_ids":["https://openalex.org/I887064364","https://openalex.org/I55106644"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5071071057","display_name":"Hannes Rosenbusch","orcid":"https://orcid.org/0000-0002-4983-3615"},"institutions":[{"id":"https://openalex.org/I55106644","display_name":"Amsterdam University of Applied Sciences","ror":"https://ror.org/00y2z2s03","country_code":"NL","type":"education","lineage":["https://openalex.org/I55106644"]},{"id":"https://openalex.org/I887064364","display_name":"University of Amsterdam","ror":"https://ror.org/04dkp9463","country_code":"NL","type":"education","lineage":["https://openalex.org/I887064364"]}],"countries":["NL"],"is_corresponding":false,"raw_author_name":"Hannes Rosenbusch","raw_affiliation_strings":["Faculty of Social and Behavioural Sciences, University of Amsterdam, 1018 WB Amsterdam, The Netherlands"],"raw_orcid":"https://orcid.org/0000-0002-4983-3615","affiliations":[{"raw_affiliation_string":"Faculty of Social and Behavioural Sciences, University of Amsterdam, 1018 WB Amsterdam, The Netherlands","institution_ids":["https://openalex.org/I887064364","https://openalex.org/I55106644"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5067427738","display_name":"Raoul P. P. P. Grasman","orcid":"https://orcid.org/0000-0001-7458-1272"},"institutions":[{"id":"https://openalex.org/I55106644","display_name":"Amsterdam University of Applied Sciences","ror":"https://ror.org/00y2z2s03","country_code":"NL","type":"education","lineage":["https://openalex.org/I55106644"]},{"id":"https://openalex.org/I887064364","display_name":"University of Amsterdam","ror":"https://ror.org/04dkp9463","country_code":"NL","type":"education","lineage":["https://openalex.org/I887064364"]}],"countries":["NL"],"is_corresponding":false,"raw_author_name":"Raoul Grasman","raw_affiliation_strings":["Faculty of Social and Behavioural Sciences, University of Amsterdam, 1018 WB Amsterdam, The Netherlands"],"raw_orcid":"https://orcid.org/0000-0001-7458-1272","affiliations":[{"raw_affiliation_string":"Faculty of Social and Behavioural Sciences, University of Amsterdam, 1018 WB Amsterdam, The Netherlands","institution_ids":["https://openalex.org/I887064364","https://openalex.org/I55106644"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5064943915","display_name":"I. Visser","orcid":null},"institutions":[{"id":"https://openalex.org/I55106644","display_name":"Amsterdam University of Applied Sciences","ror":"https://ror.org/00y2z2s03","country_code":"NL","type":"education","lineage":["https://openalex.org/I55106644"]},{"id":"https://openalex.org/I887064364","display_name":"University of Amsterdam","ror":"https://ror.org/04dkp9463","country_code":"NL","type":"education","lineage":["https://openalex.org/I887064364"]}],"countries":["NL"],"is_corresponding":false,"raw_author_name":"Ingmar Visser","raw_affiliation_strings":["Faculty of Social and Behavioural Sciences, University of Amsterdam, 1018 WB Amsterdam, The Netherlands"],"raw_orcid":"https://orcid.org/0000-0003-3855-2778","affiliations":[{"raw_affiliation_string":"Faculty of Social and Behavioural Sciences, University of Amsterdam, 1018 WB Amsterdam, The Netherlands","institution_ids":["https://openalex.org/I887064364","https://openalex.org/I55106644"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5114754464"],"corresponding_institution_ids":["https://openalex.org/I55106644","https://openalex.org/I887064364"],"apc_list":{"value":1400,"currency":"CHF","value_usd":1515},"apc_paid":{"value":1400,"currency":"CHF","value_usd":1515},"fwci":0.0,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":{"value":0.38968182,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"8","issue":"3","first_page":"74","last_page":"74"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11636","display_name":"Artificial Intelligence in Healthcare and Education","score":0.14480000734329224,"subfield":{"id":"https://openalex.org/subfields/2718","display_name":"Health Informatics"},"field":{"id":"https://openalex.org/fields/27","display_name":"Medicine"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}},"topics":[{"id":"https://openalex.org/T11636","display_name":"Artificial Intelligence in Healthcare and Education","score":0.14480000734329224,"subfield":{"id":"https://openalex.org/subfields/2718","display_name":"Health Informatics"},"field":{"id":"https://openalex.org/fields/27","display_name":"Medicine"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}},{"id":"https://openalex.org/T11948","display_name":"Machine Learning in Materials Science","score":0.14309999346733093,"subfield":{"id":"https://openalex.org/subfields/2505","display_name":"Materials Chemistry"},"field":{"id":"https://openalex.org/fields/25","display_name":"Materials Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.11969999969005585,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/flagging","display_name":"Flagging","score":0.9413999915122986},{"id":"https://openalex.org/keywords/grading","display_name":"Grading (engineering)","score":0.7925000190734863},{"id":"https://openalex.org/keywords/rubric","display_name":"Rubric","score":0.7414000034332275},{"id":"https://openalex.org/keywords/certainty","display_name":"Certainty","score":0.5577999949455261},{"id":"https://openalex.org/keywords/troubleshooting","display_name":"Troubleshooting","score":0.3659000098705292},{"id":"https://openalex.org/keywords/inter-rater-reliability","display_name":"Inter-rater reliability","score":0.30630001425743103}],"concepts":[{"id":"https://openalex.org/C2777548347","wikidata":"https://www.wikidata.org/wiki/Q5456937","display_name":"Flagging","level":2,"score":0.9413999915122986},{"id":"https://openalex.org/C2777286243","wikidata":"https://www.wikidata.org/wiki/Q5591926","display_name":"Grading (engineering)","level":2,"score":0.7925000190734863},{"id":"https://openalex.org/C111640148","wikidata":"https://www.wikidata.org/wiki/Q847349","display_name":"Rubric","level":2,"score":0.7414000034332275},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5885999798774719},{"id":"https://openalex.org/C7493553","wikidata":"https://www.wikidata.org/wiki/Q1520777","display_name":"Certainty","level":2,"score":0.5577999949455261},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4113999903202057},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.37599998712539673},{"id":"https://openalex.org/C147494362","wikidata":"https://www.wikidata.org/wiki/Q2078905","display_name":"Troubleshooting","level":2,"score":0.3659000098705292},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3222000002861023},{"id":"https://openalex.org/C61863361","wikidata":"https://www.wikidata.org/wiki/Q470749","display_name":"Inter-rater reliability","level":3,"score":0.30630001425743103},{"id":"https://openalex.org/C179518139","wikidata":"https://www.wikidata.org/wiki/Q5140297","display_name":"Coding (social sciences)","level":2,"score":0.2939999997615814},{"id":"https://openalex.org/C2781463719","wikidata":"https://www.wikidata.org/wiki/Q2002022","display_name":"Grade inflation","level":3,"score":0.2842000126838684},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.26829999685287476},{"id":"https://openalex.org/C2777230681","wikidata":"https://www.wikidata.org/wiki/Q7923820","display_name":"Vetting","level":2,"score":0.25459998846054077},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.25189998745918274}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.3390/make8030074","is_oa":true,"landing_page_url":"https://doi.org/10.3390/make8030074","pdf_url":"https://www.mdpi.com/2504-4990/8/3/74/pdf?version=1773669313","source":{"id":"https://openalex.org/S4210213891","display_name":"Machine Learning and Knowledge Extraction","issn_l":"2504-4990","issn":["2504-4990"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310310987","host_organization_name":"Multidisciplinary Digital Publishing Institute","host_organization_lineage":["https://openalex.org/P4310310987"],"host_organization_lineage_names":["Multidisciplinary Digital Publishing Institute"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Machine Learning and Knowledge Extraction","raw_type":"journal-article"},{"id":"pmh:oai:doaj.org/article:ecb5abb4f18d47249fc080caa04145d1","is_oa":true,"landing_page_url":"https://doaj.org/article/ecb5abb4f18d47249fc080caa04145d1","pdf_url":null,"source":{"id":"https://openalex.org/S4306401280","display_name":"DOAJ (DOAJ: Directory of Open Access Journals)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by-sa","license_id":"https://openalex.org/licenses/cc-by-sa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"Machine Learning and Knowledge Extraction, Vol 8, Iss 3, p 74 (2026)","raw_type":"article"}],"best_oa_location":{"id":"doi:10.3390/make8030074","is_oa":true,"landing_page_url":"https://doi.org/10.3390/make8030074","pdf_url":"https://www.mdpi.com/2504-4990/8/3/74/pdf?version=1773669313","source":{"id":"https://openalex.org/S4210213891","display_name":"Machine Learning and Knowledge Extraction","issn_l":"2504-4990","issn":["2504-4990"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310310987","host_organization_name":"Multidisciplinary Digital Publishing Institute","host_organization_lineage":["https://openalex.org/P4310310987"],"host_organization_lineage_names":["Multidisciplinary Digital Publishing Institute"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Machine Learning and Knowledge Extraction","raw_type":"journal-article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education","score":0.7909566760063171}],"awards":[],"funders":[],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W7136145306.pdf","grobid_xml":"https://content.openalex.org/works/W7136145306.grobid-xml"},"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Large":[0],"language":[1],"models":[2,259],"(LLMs)":[3],"show":[4],"promise":[5],"for":[6,108,267],"grading":[7,154,194,199,272],"open-ended":[8],"assessments":[9],"but":[10,170,253],"still":[11],"exhibit":[12],"inconsistent":[13],"accuracy,":[14,220],"systematic":[15],"biases,":[16],"and":[17,42,49,61,78,90,112,127,136,163,215,248,281,290],"limited":[18],"reliability":[19,289],"across":[20,64],"assignments.":[21,66],"To":[22],"address":[23],"these":[24,251],"concerns,":[25],"we":[26],"introduce":[27],"SURE":[28,139],"(Selective":[29],"Uncertainty-based":[30],"Re-Evaluation),":[31],"a":[32,105,146,156,239],"human-in-the-loop":[33],"pipeline":[34],"that":[35,192,256,277],"combines":[36],"repeated":[37],"LLM":[38,115,186,242,271],"prompting,":[39],"uncertainty-based":[40],"flagging,":[41],"selective":[43,282],"human":[44,84,97,190,268,283],"regrading.":[45],"Three":[46],"LLMs\u2014gpt-4.1-nano,":[47],"gpt-5-nano,":[48],"the":[50,101,134,141,224,265,288],"open-source":[51],"gpt-oss-20b\u2014graded":[52],"answers":[53],"of":[54,138,184,223,245,270,292],"46":[55],"students":[56,226],"to":[57,74,113],"130":[58],"open":[59],"questions":[60],"coding":[62],"exercises":[63],"five":[65],"Each":[67],"student":[68],"answer":[69],"was":[70,182],"scored":[71],"20":[72,167],"times":[73],"derive":[75],"majority-voted":[76],"predictions":[77],"self-consistency-based":[79,278],"certainty":[80,110,148,177],"estimates.":[81],"We":[82,99,131],"simulated":[83],"regrading":[85,191],"by":[86,201],"flagging":[87,214],"low-certainty":[88],"cases":[89],"replacing":[91],"them":[92],"with":[93,155,221],"scores":[94],"from":[95,205,238],"four":[96,143],"graders.":[98],"used":[100],"first":[102],"assignment":[103],"as":[104,261],"training":[106],"set":[107],"tuning":[109],"thresholds":[111],"explore":[114],"output":[116,180],"diversification":[117],"via":[118],"sampling":[119],"parameters,":[120],"rubric":[121],"shuffling,":[122],"varied":[123],"personas,":[124],"multilingual":[125],"prompts,":[126],"post":[128],"hoc":[129],"ensembles.":[130],"then":[132],"evaluated":[133],"effectiveness":[135],"efficiency":[137,291],"on":[140,166,236],"other":[142],"assignments":[144],"using":[145],"fixed":[147],"threshold.":[149],"Across":[150],"assignments,":[151],"fully":[152],"automated":[153],"single":[157],"prompt":[158],"resulted":[159],"in":[160,209],"substantial":[161],"underscoring,":[162],"majority-voting":[164],"based":[165,235],"prompts":[168],"improved":[169,193,212],"did":[171],"not":[172],"eliminate":[173,264],"this":[174],"bias.":[175],"Low":[176],"(i.e.,":[178],"high":[179],"diversity)":[181],"diagnostic":[183],"incorrect":[185],"scores,":[187],"enabling":[188],"targeted":[189],"accuracy":[195],"while":[196],"reducing":[197],"manual":[198],"time":[200],"40\u201390%.":[202],"Aggregating":[203],"responses":[204],"all":[206],"three":[207],"LLMs":[208],"an":[210],"ensemble":[211,243],"certainty-based":[213],"most":[216],"consistently":[217],"approached":[218],"human-level":[219],"70\u201390%":[222],"grades":[225],"would":[227],"receive":[228],"falling":[229],"inside":[230],"human-grader":[231],"ranges.":[232],"A":[233],"reanalysis":[234],"outputs":[237],"more":[240],"diversified":[241],"comprised":[244],"gpt-5,":[246],"codestral-25.01,":[247],"llama-3.3-70b-instruct":[249],"replicated":[250],"findings":[252,275],"also":[254],"suggested":[255],"large":[257],"reasoning":[258],"such":[260],"gpt-5":[262],"might":[263],"need":[266],"oversight":[269,284],"entirely.":[273],"These":[274],"demonstrate":[276],"uncertainty":[279],"estimation":[280],"can":[285],"substantially":[286],"improve":[287],"AI-assisted":[293],"grading.":[294]},"counts_by_year":[],"updated_date":"2026-05-21T06:26:12.895304","created_date":"2026-03-17T00:00:00"}
