{"id":"https://openalex.org/W4414497827","doi":"https://doi.org/10.1016/j.caeai.2025.100481","title":"Evaluating large language models as raters in large-scale writing assessments: A psychometric framework for reliability and validity","display_name":"Evaluating large language models as raters in large-scale writing assessments: A psychometric framework for reliability and validity","publication_year":2025,"publication_date":"2025-09-25","ids":{"openalex":"https://openalex.org/W4414497827","doi":"https://doi.org/10.1016/j.caeai.2025.100481"},"language":"en","primary_location":{"id":"doi:10.1016/j.caeai.2025.100481","is_oa":true,"landing_page_url":"https://doi.org/10.1016/j.caeai.2025.100481","pdf_url":null,"source":{"id":"https://openalex.org/S4210183364","display_name":"Computers and Education Artificial Intelligence","issn_l":"2666-920X","issn":["2666-920X"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310320990","host_organization_name":"Elsevier BV","host_organization_lineage":["https://openalex.org/P4310320990"],"host_organization_lineage_names":["Elsevier BV"],"type":"journal"},"license":"cc-by-nc","license_id":"https://openalex.org/licenses/cc-by-nc","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Computers and Education: Artificial Intelligence","raw_type":"journal-article"},"type":"article","indexed_in":["crossref","doaj"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://doi.org/10.1016/j.caeai.2025.100481","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5101778577","display_name":"Yuehan Wang","orcid":"https://orcid.org/0000-0001-8651-4720"},"institutions":[{"id":"https://openalex.org/I115592961","display_name":"Jiangsu University","ror":"https://ror.org/03jc41j30","country_code":"CN","type":"education","lineage":["https://openalex.org/I115592961"]},{"id":"https://openalex.org/I4210122545","display_name":"Center for Assessment","ror":"https://ror.org/02pdqrx22","country_code":"US","type":"nonprofit","lineage":["https://openalex.org/I4210122545"]}],"countries":["CN","US"],"is_corresponding":false,"raw_author_name":"Yuehan Wang","raw_affiliation_strings":["Evidence-based Research Center for Educational Assessment (ERCEA), Jiangsu University, Zhenjiang, China","School of Teacher Education, Jiangsu University, Zhenjiang, China"],"affiliations":[{"raw_affiliation_string":"Evidence-based Research Center for Educational Assessment (ERCEA), Jiangsu University, Zhenjiang, China","institution_ids":["https://openalex.org/I4210122545"]},{"raw_affiliation_string":"School of Teacher Education, Jiangsu University, Zhenjiang, China","institution_ids":["https://openalex.org/I115592961"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5044277116","display_name":"Jinyan Huang","orcid":"https://orcid.org/0000-0003-1641-6580"},"institutions":[{"id":"https://openalex.org/I115592961","display_name":"Jiangsu University","ror":"https://ror.org/03jc41j30","country_code":"CN","type":"education","lineage":["https://openalex.org/I115592961"]},{"id":"https://openalex.org/I4210122545","display_name":"Center for Assessment","ror":"https://ror.org/02pdqrx22","country_code":"US","type":"nonprofit","lineage":["https://openalex.org/I4210122545"]}],"countries":["CN","US"],"is_corresponding":true,"raw_author_name":"Jinyan Huang","raw_affiliation_strings":["Evidence-based Research Center for Educational Assessment (ERCEA), Jiangsu University, Zhenjiang, China","School of Teacher Education, Jiangsu University, Zhenjiang, China"],"affiliations":[{"raw_affiliation_string":"Evidence-based Research Center for Educational Assessment (ERCEA), Jiangsu University, Zhenjiang, China","institution_ids":["https://openalex.org/I4210122545"]},{"raw_affiliation_string":"School of Teacher Education, Jiangsu University, Zhenjiang, China","institution_ids":["https://openalex.org/I115592961"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5052534353","display_name":"Liang Du","orcid":"https://orcid.org/0000-0002-2663-0751"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lun Du","raw_affiliation_strings":["Ant Research, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Ant Research, Beijing, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101090264","display_name":"Yuxin Guo","orcid":null},"institutions":[{"id":"https://openalex.org/I115592961","display_name":"Jiangsu University","ror":"https://ror.org/03jc41j30","country_code":"CN","type":"education","lineage":["https://openalex.org/I115592961"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yuxin Guo","raw_affiliation_strings":["School of Teacher Education, Jiangsu University, Zhenjiang, China"],"affiliations":[{"raw_affiliation_string":"School of Teacher Education, Jiangsu University, Zhenjiang, China","institution_ids":["https://openalex.org/I115592961"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5044610505","display_name":"Xingcheng Liu","orcid":"https://orcid.org/0000-0003-1836-2205"},"institutions":[{"id":"https://openalex.org/I115592961","display_name":"Jiangsu University","ror":"https://ror.org/03jc41j30","country_code":"CN","type":"education","lineage":["https://openalex.org/I115592961"]},{"id":"https://openalex.org/I4210122545","display_name":"Center for Assessment","ror":"https://ror.org/02pdqrx22","country_code":"US","type":"nonprofit","lineage":["https://openalex.org/I4210122545"]}],"countries":["CN","US"],"is_corresponding":false,"raw_author_name":"Ying Liu","raw_affiliation_strings":["Evidence-based Research Center for Educational Assessment (ERCEA), Jiangsu University, Zhenjiang, China","School of Teacher Education, Jiangsu University, Zhenjiang, China"],"affiliations":[{"raw_affiliation_string":"Evidence-based Research Center for Educational Assessment (ERCEA), Jiangsu University, Zhenjiang, China","institution_ids":["https://openalex.org/I4210122545"]},{"raw_affiliation_string":"School of Teacher Education, Jiangsu University, Zhenjiang, China","institution_ids":["https://openalex.org/I115592961"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100324885","display_name":"Rong Wang","orcid":"https://orcid.org/0000-0002-2307-709X"},"institutions":[{"id":"https://openalex.org/I115592961","display_name":"Jiangsu University","ror":"https://ror.org/03jc41j30","country_code":"CN","type":"education","lineage":["https://openalex.org/I115592961"]},{"id":"https://openalex.org/I4210122545","display_name":"Center for Assessment","ror":"https://ror.org/02pdqrx22","country_code":"US","type":"nonprofit","lineage":["https://openalex.org/I4210122545"]}],"countries":["CN","US"],"is_corresponding":false,"raw_author_name":"Rong Wang","raw_affiliation_strings":["Evidence-based Research Center for Educational Assessment (ERCEA), Jiangsu University, Zhenjiang, China","School of Teacher Education, Jiangsu University, Zhenjiang, China"],"affiliations":[{"raw_affiliation_string":"Evidence-based Research Center for Educational Assessment (ERCEA), Jiangsu University, Zhenjiang, China","institution_ids":["https://openalex.org/I4210122545"]},{"raw_affiliation_string":"School of Teacher Education, Jiangsu University, Zhenjiang, China","institution_ids":["https://openalex.org/I115592961"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5044277116"],"corresponding_institution_ids":["https://openalex.org/I115592961","https://openalex.org/I4210122545"],"apc_list":{"value":1800,"currency":"USD","value_usd":1800},"apc_paid":{"value":1800,"currency":"USD","value_usd":1800},"fwci":10.4495,"has_fulltext":false,"cited_by_count":4,"citation_normalized_percentile":{"value":0.97990877,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":95,"max":99},"biblio":{"volume":"9","issue":null,"first_page":"100481","last_page":"100481"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9726999998092651,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9726999998092651,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13629","display_name":"Text Readability and Simplification","score":0.9722999930381775,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9713000059127808,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/rubric","display_name":"Rubric","score":0.8154000043869019},{"id":"https://openalex.org/keywords/generalizability-theory","display_name":"Generalizability theory","score":0.6862000226974487},{"id":"https://openalex.org/keywords/ranking","display_name":"Ranking (information retrieval)","score":0.6226000189781189},{"id":"https://openalex.org/keywords/reliability","display_name":"Reliability (semiconductor)","score":0.5853999853134155},{"id":"https://openalex.org/keywords/consistency","display_name":"Consistency (knowledge bases)","score":0.5065000057220459},{"id":"https://openalex.org/keywords/rasch-model","display_name":"Rasch model","score":0.4007999897003174}],"concepts":[{"id":"https://openalex.org/C111640148","wikidata":"https://www.wikidata.org/wiki/Q847349","display_name":"Rubric","level":2,"score":0.8154000043869019},{"id":"https://openalex.org/C27158222","wikidata":"https://www.wikidata.org/wiki/Q5532422","display_name":"Generalizability theory","level":2,"score":0.6862000226974487},{"id":"https://openalex.org/C189430467","wikidata":"https://www.wikidata.org/wiki/Q7293293","display_name":"Ranking (information retrieval)","level":2,"score":0.6226000189781189},{"id":"https://openalex.org/C43214815","wikidata":"https://www.wikidata.org/wiki/Q7310987","display_name":"Reliability (semiconductor)","level":3,"score":0.5853999853134155},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.5230000019073486},{"id":"https://openalex.org/C2776436953","wikidata":"https://www.wikidata.org/wiki/Q5163215","display_name":"Consistency (knowledge bases)","level":2,"score":0.5065000057220459},{"id":"https://openalex.org/C180747234","wikidata":"https://www.wikidata.org/wiki/Q23373","display_name":"Cognitive psychology","level":1,"score":0.4271000027656555},{"id":"https://openalex.org/C101266164","wikidata":"https://www.wikidata.org/wiki/Q2131821","display_name":"Rasch model","level":2,"score":0.4007999897003174},{"id":"https://openalex.org/C77805123","wikidata":"https://www.wikidata.org/wiki/Q161272","display_name":"Social psychology","level":1,"score":0.36959999799728394},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.3521000146865845},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3483000099658966},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3346000015735626},{"id":"https://openalex.org/C2777898490","wikidata":"https://www.wikidata.org/wiki/Q17157236","display_name":"Writing assessment","level":2,"score":0.29109999537467957},{"id":"https://openalex.org/C75630572","wikidata":"https://www.wikidata.org/wiki/Q538904","display_name":"Applied psychology","level":1,"score":0.28949999809265137},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.2883000075817108},{"id":"https://openalex.org/C12174686","wikidata":"https://www.wikidata.org/wiki/Q1058438","display_name":"Risk assessment","level":2,"score":0.2727000117301941},{"id":"https://openalex.org/C61863361","wikidata":"https://www.wikidata.org/wiki/Q470749","display_name":"Inter-rater reliability","level":3,"score":0.25949999690055847},{"id":"https://openalex.org/C105795698","wikidata":"https://www.wikidata.org/wiki/Q12483","display_name":"Statistics","level":1,"score":0.2531000077724457},{"id":"https://openalex.org/C171606756","wikidata":"https://www.wikidata.org/wiki/Q506132","display_name":"Psychometrics","level":2,"score":0.2515000104904175}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1016/j.caeai.2025.100481","is_oa":true,"landing_page_url":"https://doi.org/10.1016/j.caeai.2025.100481","pdf_url":null,"source":{"id":"https://openalex.org/S4210183364","display_name":"Computers and Education Artificial Intelligence","issn_l":"2666-920X","issn":["2666-920X"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310320990","host_organization_name":"Elsevier BV","host_organization_lineage":["https://openalex.org/P4310320990"],"host_organization_lineage_names":["Elsevier BV"],"type":"journal"},"license":"cc-by-nc","license_id":"https://openalex.org/licenses/cc-by-nc","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Computers and Education: Artificial Intelligence","raw_type":"journal-article"},{"id":"pmh:oai:doaj.org/article:6075a4f863714eb891a981b9e48ef2fe","is_oa":true,"landing_page_url":"https://doaj.org/article/6075a4f863714eb891a981b9e48ef2fe","pdf_url":null,"source":{"id":"https://openalex.org/S112646816","display_name":"SHILAP Revista de lepidopterolog\u00eda","issn_l":"0300-5267","issn":["0300-5267","2340-4078"],"is_oa":true,"is_in_doaj":true,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"Computers and Education: Artificial Intelligence, Vol 9, Iss , Pp 100481- (2025)","raw_type":"article"}],"best_oa_location":{"id":"doi:10.1016/j.caeai.2025.100481","is_oa":true,"landing_page_url":"https://doi.org/10.1016/j.caeai.2025.100481","pdf_url":null,"source":{"id":"https://openalex.org/S4210183364","display_name":"Computers and Education Artificial Intelligence","issn_l":"2666-920X","issn":["2666-920X"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310320990","host_organization_name":"Elsevier BV","host_organization_lineage":["https://openalex.org/P4310320990"],"host_organization_lineage_names":["Elsevier BV"],"type":"journal"},"license":"cc-by-nc","license_id":"https://openalex.org/licenses/cc-by-nc","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Computers and Education: Artificial Intelligence","raw_type":"journal-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":51,"referenced_works":["https://openalex.org/W104765354","https://openalex.org/W1578803171","https://openalex.org/W1987548299","https://openalex.org/W2007255935","https://openalex.org/W2036410403","https://openalex.org/W2088910496","https://openalex.org/W2096879700","https://openalex.org/W2123484281","https://openalex.org/W2123555614","https://openalex.org/W2186524655","https://openalex.org/W2243274251","https://openalex.org/W2494203807","https://openalex.org/W2567547739","https://openalex.org/W2573498121","https://openalex.org/W2610254499","https://openalex.org/W3163354334","https://openalex.org/W3201077663","https://openalex.org/W3217154021","https://openalex.org/W4224017529","https://openalex.org/W4287448183","https://openalex.org/W4289100992","https://openalex.org/W4366403195","https://openalex.org/W4380353763","https://openalex.org/W4382618460","https://openalex.org/W4384807869","https://openalex.org/W4387848774","https://openalex.org/W4389395904","https://openalex.org/W4389431401","https://openalex.org/W4393952013","https://openalex.org/W4396806295","https://openalex.org/W4398143508","https://openalex.org/W4399334961","https://openalex.org/W4399363436","https://openalex.org/W4399364148","https://openalex.org/W4399528455","https://openalex.org/W4400138640","https://openalex.org/W4400608580","https://openalex.org/W4400668042","https://openalex.org/W4402036612","https://openalex.org/W4402819599","https://openalex.org/W4402923129","https://openalex.org/W4404281933","https://openalex.org/W4404784146","https://openalex.org/W4407830893","https://openalex.org/W4407842016","https://openalex.org/W4408292186","https://openalex.org/W4411112974","https://openalex.org/W4411630288","https://openalex.org/W4412888616","https://openalex.org/W4412897867","https://openalex.org/W4413942294"],"related_works":[],"abstract_inverted_index":{"In":[0],"large-scale":[1],"international":[2],"writing":[3],"assessments,":[4,267],"human":[5,39,127,177],"raters":[6,42,130,178],"often":[7],"exhibit":[8],"inconsistency,":[9],"undermining":[10],"reliability":[11,25,109],"and":[12,33,40,128,176,226,299,304],"validity.":[13],"Large":[14],"language":[15],"models":[16,98,175,222,269,293,300],"(LLMs)":[17],"offer":[18],"a":[19,239],"potential":[20],"solution,":[21],"but":[22,71],"their":[23],"assessment":[24,236],"remains":[26],"underexplored.":[27],"This":[28],"study":[29],"employed":[30],"generalizability":[31],"theory":[32],"many-facet":[34],"Rasch":[35],"modeling":[36],"to":[37,104,118,234],"compare":[38],"LLM":[41,129],"across":[43],"three":[44],"essay":[45],"genres":[46],"(4,315":[47],"samples).":[48],"Findings":[49],"reveal":[50],"that":[51],"human-LLM":[52],"discrepancies":[53],"stem":[54],"from":[55,159],"fundamental":[56],"evaluation":[57,207],"differences,":[58],"with":[59,73,135,151,162,220,251,278],"minimal":[60],"divergence":[61],"in":[62,67,110,253,259,265,270,289,294],"key-point":[63,111],"scoring.":[64,112],"Humans":[65,263,286],"excel":[66,204,264],"holistic":[68],"scoring":[69,101,133,156,245,260,303],"scenarios":[70],"struggle":[72,150],"complex":[74,153],"analytical":[75],"rubrics":[76,115],"where":[77,183],"LLMs":[78,82,117,138,203,248],"demonstrate":[79],"advantages.":[80],"While":[81],"perform":[83],"adequately":[84],"for":[85,93,242],"relative":[86],"ranking":[87,254],"tasks,":[88],"they":[89],"remain":[90],"less":[91],"reliable":[92],"absolute":[94],"standard":[95],"judgments.":[96],"Claude":[97,227],"exhibited":[99,180],"superior":[100],"stability":[102],"compared":[103],"GPT":[105,174,221],"models,":[106],"approaching":[107],"perfect":[108],"Detailed":[113],"hierarchical":[114],"enabled":[116],"achieve":[119],"human-comparable":[120],"consistency":[121],"even":[122],"on":[123,140],"subjective":[124],"dimensions.":[125,192],"Both":[126,297],"demonstrated":[131],"random":[132],"behaviors":[134],"different":[136],"patterns.":[137],"rely":[139],"surface":[141],"similarities":[142],"rather":[143],"than":[144,218,283],"deep":[145],"semantic":[146],"understanding,":[147],"while":[148,202,256,268],"humans":[149,197,213,252,279,298],"lengthy,":[152],"rubrics.":[154],"All":[155],"systems":[157],"suffered":[158],"restriction-of-range":[160],"effects,":[161,182],"model":[163],"scores":[164,171,185,217],"clustering":[165],"around":[166],"specific":[167],"rating":[168],"levels":[169],"(particularly":[170],"2-4).":[172],"Additionally,":[173],"both":[179],"halo":[181,305],"overall":[184],"were":[186],"heavily":[187],"influenced":[188],"by":[189,237],"single":[190],"dominant":[191],"Information":[193],"function":[194],"analysis":[195],"indicated":[196],"better":[198],"suit":[199],"broad-spectrum":[200],"assessment,":[201],"at":[205],"fine-grained":[206,271],"within":[208],"narrow":[209],"intervals.":[210],"Regarding":[211],"severity,":[212],"typically":[214],"assigned":[215],"higher":[216],"LLMs,":[219],"being":[223],"most":[224],"stringent":[225],"positioned":[228],"intermediately.":[229],"These":[230],"findings":[231],"contribute":[232],"significantly":[233],"educational":[235],"establishing":[238],"systematic":[240],"framework":[241],"evaluating":[243],"automated":[244],"systems.":[246],"\u2022":[247,262,273,285,296],"align":[249],"more":[250,281],"tasks":[255],"differ":[257],"greatly":[258],"tasks.":[261],"broad":[266],"distinctions.":[272],"Scoring":[274],"severity":[275],"differs":[276],"greatly,":[277],"generally":[280],"lenient":[282],"models.":[284],"score":[287],"randomly":[288],"multi-trait":[290],"analytic":[291],"scoring,":[292],"point-based.":[295],"show":[301],"conservative":[302],"effect":[306],"bias.":[307]},"counts_by_year":[{"year":2026,"cited_by_count":2},{"year":2025,"cited_by_count":2}],"updated_date":"2026-03-20T23:20:44.827607","created_date":"2025-10-10T00:00:00"}
