{"id":"https://openalex.org/W7080324067","doi":"https://doi.org/10.48550/arxiv.2509.04676","title":"An Approach to Grounding AI Model Evaluations in Human-derived Criteria","display_name":"An Approach to Grounding AI Model Evaluations in Human-derived Criteria","publication_year":2025,"publication_date":"2025-09-04","ids":{"openalex":"https://openalex.org/W7080324067","doi":"https://doi.org/10.48550/arxiv.2509.04676"},"language":"en","primary_location":{"id":"doi:10.48550/arxiv.2509.04676","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2509.04676","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2509.04676","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Mitts, Sasha","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Mitts, Sasha","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":1,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":true,"primary_topic":{"id":"https://openalex.org/T12157","display_name":"Geochemistry and Geologic Mapping","score":0.4016000032424927,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12157","display_name":"Geochemistry and Geologic Mapping","score":0.4016000032424927,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13177","display_name":"Geological and Geophysical Studies","score":0.05050000175833702,"subfield":{"id":"https://openalex.org/subfields/1907","display_name":"Geology"},"field":{"id":"https://openalex.org/fields/19","display_name":"Earth and Planetary Sciences"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13067","display_name":"Geological Modeling and Analysis","score":0.031700000166893005,"subfield":{"id":"https://openalex.org/subfields/1906","display_name":"Geochemistry and Petrology"},"field":{"id":"https://openalex.org/fields/19","display_name":"Earth and Planetary Sciences"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/interpretability","display_name":"Interpretability","score":0.8478999733924866},{"id":"https://openalex.org/keywords/benchmarking","display_name":"Benchmarking","score":0.7228999733924866},{"id":"https://openalex.org/keywords/field","display_name":"Field (mathematics)","score":0.5156000256538391},{"id":"https://openalex.org/keywords/cognition","display_name":"Cognition","score":0.4853000044822693},{"id":"https://openalex.org/keywords/perception","display_name":"Perception","score":0.45239999890327454},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.4392000138759613},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.414900004863739}],"concepts":[{"id":"https://openalex.org/C2781067378","wikidata":"https://www.wikidata.org/wiki/Q17027399","display_name":"Interpretability","level":2,"score":0.8478999733924866},{"id":"https://openalex.org/C86251818","wikidata":"https://www.wikidata.org/wiki/Q816754","display_name":"Benchmarking","level":2,"score":0.7228999733924866},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.576200008392334},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5728999972343445},{"id":"https://openalex.org/C9652623","wikidata":"https://www.wikidata.org/wiki/Q190109","display_name":"Field (mathematics)","level":2,"score":0.5156000256538391},{"id":"https://openalex.org/C169900460","wikidata":"https://www.wikidata.org/wiki/Q2200417","display_name":"Cognition","level":2,"score":0.4853000044822693},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.45239999890327454},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.4392000138759613},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.414900004863739},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.3919999897480011},{"id":"https://openalex.org/C157170001","wikidata":"https://www.wikidata.org/wiki/Q4781507","display_name":"Applications of artificial intelligence","level":2,"score":0.3653999865055084},{"id":"https://openalex.org/C105409693","wikidata":"https://www.wikidata.org/wiki/Q5937824","display_name":"Human intelligence","level":2,"score":0.36239999532699585},{"id":"https://openalex.org/C192209626","wikidata":"https://www.wikidata.org/wiki/Q190909","display_name":"Focus (optics)","level":2,"score":0.35659998655319214},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.35280001163482666},{"id":"https://openalex.org/C2777267654","wikidata":"https://www.wikidata.org/wiki/Q3519023","display_name":"Test (biology)","level":2,"score":0.3111000061035156},{"id":"https://openalex.org/C56739046","wikidata":"https://www.wikidata.org/wiki/Q192060","display_name":"Knowledge management","level":1,"score":0.3109999895095825},{"id":"https://openalex.org/C161407221","wikidata":"https://www.wikidata.org/wiki/Q4382939","display_name":"Cognitive model","level":3,"score":0.3012000024318695},{"id":"https://openalex.org/C134290984","wikidata":"https://www.wikidata.org/wiki/Q5141241","display_name":"Cognitive skill","level":3,"score":0.29440000653266907},{"id":"https://openalex.org/C207453521","wikidata":"https://www.wikidata.org/wiki/Q4801079","display_name":"Artificial intelligence, situated approach","level":2,"score":0.2928999960422516},{"id":"https://openalex.org/C539667460","wikidata":"https://www.wikidata.org/wiki/Q2414942","display_name":"Management science","level":1,"score":0.28209999203681946},{"id":"https://openalex.org/C18762648","wikidata":"https://www.wikidata.org/wiki/Q42213","display_name":"Work (physics)","level":2,"score":0.2709999978542328}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2509.04676","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2509.04676","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2509.04676","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2509.04676","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"display_name":"Quality Education","score":0.662165641784668,"id":"https://metadata.un.org/sdg/4"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"In":[0],"the":[1,18,27,49,60,141,174],"rapidly":[2],"evolving":[3],"field":[4],"of":[5,21,29,53,133,143],"artificial":[6],"intelligence":[7],"(AI),":[8],"traditional":[9],"benchmarks":[10,41],"can":[11],"fall":[12],"short":[13],"in":[14,59,103,146,179],"attempting":[15],"to":[16,38,47,73,157],"capture":[17],"nuanced":[19],"capabilities":[20,160],"AI":[22,90,100,113,147,159,180],"models.":[23],"We":[24],"focus":[25],"on":[26],"case":[28],"physical":[30],"world":[31],"modeling":[32],"and":[33,51,63,70,83,91,105,135,154,172],"propose":[34],"a":[35,126],"novel":[36],"approach":[37,166],"augment":[39],"existing":[40],"with":[42,161],"human-derived":[43],"evaluation":[44,145],"criteria,":[45],"aiming":[46,156],"enhance":[48],"interpretability":[50],"applicability":[52],"model":[54,181],"behaviors.":[55],"Grounding":[56],"our":[57,119],"study":[58],"Perception":[61],"Test":[62],"OpenEQA":[64],"benchmarks,":[65],"we":[66,124],"conducted":[67],"in-depth":[68],"interviews":[69],"large-scale":[71],"surveys":[72],"identify":[74],"key":[75],"cognitive":[76,163],"skills,":[77],"such":[78],"as":[79,101],"Prioritization,":[80],"Memorizing,":[81],"Discerning,":[82],"Contextualizing,":[84],"that":[85,97],"are":[86],"critical":[87],"for":[88,112,128,152,176],"both":[89,167],"human":[92,162],"reasoning.":[93],"Our":[94,165],"findings":[95,120],"reveal":[96],"participants":[98],"perceive":[99],"lacking":[102],"interpretive":[104],"empathetic":[106],"skills":[107],"yet":[108],"hold":[109],"high":[110],"expectations":[111],"performance.":[114],"By":[115],"integrating":[116],"insights":[117],"from":[118],"into":[121],"benchmark":[122],"design,":[123],"offer":[125],"framework":[127],"developing":[129],"more":[130],"human-aligned":[131],"means":[132],"defining":[134],"measuring":[136],"progress.":[137],"This":[138],"work":[139],"underscores":[140],"importance":[142],"user-centered":[144],"development,":[148],"providing":[149],"actionable":[150],"guidelines":[151],"researchers":[153],"practitioners":[155],"align":[158],"processes.":[164],"enhances":[168],"current":[169],"benchmarking":[170],"practices":[171],"sets":[173],"stage":[175],"future":[177],"advancements":[178],"evaluation.":[182]},"counts_by_year":[],"updated_date":"2025-11-06T06:51:31.235846","created_date":"2025-10-10T00:00:00"}
