{"id":"https://openalex.org/W7158550291","doi":"https://doi.org/10.48550/arxiv.2604.26607","title":"Human-in-the-Loop Benchmarking of Heterogeneous LLMs for Automated Competency Assessment in Secondary Level Mathematics","display_name":"Human-in-the-Loop Benchmarking of Heterogeneous LLMs for Automated Competency Assessment in Secondary Level Mathematics","publication_year":2026,"publication_date":"2026-04-29","ids":{"openalex":"https://openalex.org/W7158550291","doi":"https://doi.org/10.48550/arxiv.2604.26607"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.26607","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.26607","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.26607","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5121674723","display_name":"Jatin Bhusal","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Bhusal, Jatin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134878407","display_name":"Nancy Mahatha","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Mahatha, Nancy","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134893617","display_name":"Aayush Acharya","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Acharya, Aayush","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5130897960","display_name":"Raunak Regmi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Regmi, Raunak","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5121674723"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11902","display_name":"Intelligent Tutoring Systems and Adaptive Learning","score":0.42800000309944153,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11902","display_name":"Intelligent Tutoring Systems and Adaptive Learning","score":0.42800000309944153,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10467","display_name":"Psychometric Methodologies and Testing","score":0.04470000043511391,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T13447","display_name":"Educational and Psychological Assessments","score":0.03449999913573265,"subfield":{"id":"https://openalex.org/subfields/3204","display_name":"Developmental and Educational Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/rubric","display_name":"Rubric","score":0.8015000224113464},{"id":"https://openalex.org/keywords/benchmarking","display_name":"Benchmarking","score":0.7555999755859375},{"id":"https://openalex.org/keywords/bottleneck","display_name":"Bottleneck","score":0.5379999876022339},{"id":"https://openalex.org/keywords/curriculum","display_name":"Curriculum","score":0.3734000027179718},{"id":"https://openalex.org/keywords/numeracy","display_name":"Numeracy","score":0.3578000068664551},{"id":"https://openalex.org/keywords/blueprint","display_name":"Blueprint","score":0.34769999980926514},{"id":"https://openalex.org/keywords/competence","display_name":"Competence (human resources)","score":0.3244999945163727},{"id":"https://openalex.org/keywords/internship","display_name":"Internship","score":0.31529998779296875},{"id":"https://openalex.org/keywords/grading","display_name":"Grading (engineering)","score":0.3142000138759613}],"concepts":[{"id":"https://openalex.org/C111640148","wikidata":"https://www.wikidata.org/wiki/Q847349","display_name":"Rubric","level":2,"score":0.8015000224113464},{"id":"https://openalex.org/C86251818","wikidata":"https://www.wikidata.org/wiki/Q816754","display_name":"Benchmarking","level":2,"score":0.7555999755859375},{"id":"https://openalex.org/C2780513914","wikidata":"https://www.wikidata.org/wiki/Q18210350","display_name":"Bottleneck","level":2,"score":0.5379999876022339},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.43059998750686646},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.37779998779296875},{"id":"https://openalex.org/C145420912","wikidata":"https://www.wikidata.org/wiki/Q853077","display_name":"Mathematics education","level":1,"score":0.37770000100135803},{"id":"https://openalex.org/C47177190","wikidata":"https://www.wikidata.org/wiki/Q207137","display_name":"Curriculum","level":2,"score":0.3734000027179718},{"id":"https://openalex.org/C53537400","wikidata":"https://www.wikidata.org/wiki/Q140637","display_name":"Numeracy","level":3,"score":0.3578000068664551},{"id":"https://openalex.org/C155911762","wikidata":"https://www.wikidata.org/wiki/Q422321","display_name":"Blueprint","level":2,"score":0.34769999980926514},{"id":"https://openalex.org/C100521375","wikidata":"https://www.wikidata.org/wiki/Q2015382","display_name":"Competence (human resources)","level":2,"score":0.3244999945163727},{"id":"https://openalex.org/C540751848","wikidata":"https://www.wikidata.org/wiki/Q6500754","display_name":"Internship","level":2,"score":0.31529998779296875},{"id":"https://openalex.org/C2777286243","wikidata":"https://www.wikidata.org/wiki/Q5591926","display_name":"Grading (engineering)","level":2,"score":0.3142000138759613},{"id":"https://openalex.org/C2778755073","wikidata":"https://www.wikidata.org/wiki/Q10858537","display_name":"Scale (ratio)","level":2,"score":0.30889999866485596},{"id":"https://openalex.org/C2777655017","wikidata":"https://www.wikidata.org/wiki/Q1501161","display_name":"Toolbox","level":2,"score":0.30480000376701355},{"id":"https://openalex.org/C64543145","wikidata":"https://www.wikidata.org/wiki/Q162942","display_name":"Intersection (aeronautics)","level":2,"score":0.30300000309944153},{"id":"https://openalex.org/C194051981","wikidata":"https://www.wikidata.org/wiki/Q1337691","display_name":"Economic shortage","level":3,"score":0.29670000076293945},{"id":"https://openalex.org/C42475967","wikidata":"https://www.wikidata.org/wiki/Q194292","display_name":"Operations research","level":1,"score":0.2962000072002411},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.28790000081062317},{"id":"https://openalex.org/C169087156","wikidata":"https://www.wikidata.org/wiki/Q2131593","display_name":"Framing (construction)","level":2,"score":0.28439998626708984},{"id":"https://openalex.org/C202269582","wikidata":"https://www.wikidata.org/wiki/Q2644277","display_name":"Complementarity (molecular biology)","level":2,"score":0.2833999991416931},{"id":"https://openalex.org/C190248442","wikidata":"https://www.wikidata.org/wiki/Q839486","display_name":"Qualitative research","level":2,"score":0.2741999924182892},{"id":"https://openalex.org/C173633133","wikidata":"https://www.wikidata.org/wiki/Q2110223","display_name":"Raw score","level":3,"score":0.262800008058548},{"id":"https://openalex.org/C168725872","wikidata":"https://www.wikidata.org/wiki/Q991663","display_name":"Sophistication","level":2,"score":0.2615000009536743},{"id":"https://openalex.org/C56739046","wikidata":"https://www.wikidata.org/wiki/Q192060","display_name":"Knowledge management","level":1,"score":0.2578999996185303},{"id":"https://openalex.org/C95423123","wikidata":"https://www.wikidata.org/wiki/Q622178","display_name":"Workgroup","level":2,"score":0.2547000050544739},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.25360000133514404},{"id":"https://openalex.org/C2777526511","wikidata":"https://www.wikidata.org/wiki/Q691543","display_name":"Pace","level":2,"score":0.25189998745918274},{"id":"https://openalex.org/C2778139618","wikidata":"https://www.wikidata.org/wiki/Q13440398","display_name":"Workforce","level":2,"score":0.25029999017715454}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.26607","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.26607","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.26607","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.26607","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"score":0.8374456167221069,"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"As":[0],"Competency-Based":[1],"Education":[2],"(CBE)":[3],"is":[4,19],"gaining":[5],"traction":[6],"around":[7],"the":[8,10,28,39,51,132,144,163],"world,":[9],"shift":[11],"from":[12],"marks-based":[13],"assessment":[14],"to":[15,37],"qualitative":[16],"competency":[17],"mapping":[18],"a":[20,33,61,111,127,193],"manual":[21],"challenge":[22],"for":[23,64,180,188],"educators.":[24],"This":[25],"paper":[26],"tackles":[27],"bottleneck":[29],"issue":[30],"by":[31,115],"suggesting":[32,155],"\"Human-in-the-Loop\"":[34,194],"benchmarking":[35],"framework":[36],"assess":[38],"effectiveness":[40],"of":[41,83,165],"multiple":[42],"LLMs":[43,175],"in":[44,57,168],"automating":[45],"secondary-level":[46],"mathematics":[47,118],"assessment.":[48],"Based":[49],"on":[50],"Grade":[52],"10":[53],"Optional":[54],"Mathematics":[55],"curriculum":[56],"Nepal,":[58],"we":[59],"created":[60],"multi-dimensional":[62],"rubric":[63],"four":[65,68],"topics":[66],"and":[67,75,77,90,95,103],"cross-cutting":[69],"competencies:":[70],"Comprehension,":[71],"Knowledge,":[72],"Operational":[73],"Fluency,":[74],"Behavior":[76],"Correlation.":[78],"The":[79,124],"multi-provider":[80],"ensemble,":[81],"consisted":[82],"open-weight":[84],"models":[85,98,137],"--":[86,94],"Eagle":[87],"(Llama":[88,92],"3.1-8B)":[89],"Orion":[91,146],"3.3-70B)":[93],"proprietary":[96],"frontier":[97],"Nova":[99],"(Gemini":[100,105],"2.5":[101],"Flash)":[102],"Lyra":[104],"3":[106],"Pro),":[107],"was":[108],"benchmarked":[109],"against":[110],"ground":[112],"truth":[113],"defined":[114],"two":[116],"senior":[117],"faculty":[119],"members":[120],"(kappa_w":[121,141,152],"=":[122,153],"0.8652).":[123],"findings":[125],"show":[126],"marked":[128],"\"Architecture-compatibility":[129],"gap\".":[130],"Although":[131],"Gemini-based":[133],"Mixture-of-Experts":[134],"(Sparse":[135],"MoE)":[136],"achieved":[138],"\"Fair":[139],"Agreement\"":[140,151],"~":[142],"0.38),":[143],"larger":[145],"(70B)":[147],"model":[148],"exhibited":[149],"\"No":[150],"-0.0261),":[154],"that":[156,173],"architectural":[157],"compliance":[158],"with":[159],"instruction":[160],"constraints":[161],"outweighs":[162],"scale":[164],"raw":[166],"parameters":[167],"rubric-constrained":[169],"tasks.":[170],"We":[171],"conclude":[172],"while":[174],"are":[176],"not":[177],"yet":[178],"suitable":[179],"autonomous":[181],"certification,":[182],"they":[183],"provide":[184],"high-value":[185],"assistive":[186],"support":[187],"preliminary":[189],"evidence":[190],"extraction":[191],"within":[192],"framework.":[195]},"counts_by_year":[],"updated_date":"2026-05-01T06:10:29.291645","created_date":"2026-05-01T00:00:00"}
