{"id":"https://openalex.org/W7127170724","doi":"https://doi.org/10.48550/arxiv.2601.22548","title":"Are LLM Evaluators Really Narcissists? Sanity Checking Self-Preference Evaluations","display_name":"Are LLM Evaluators Really Narcissists? Sanity Checking Self-Preference Evaluations","publication_year":2026,"publication_date":"2026-01-30","ids":{"openalex":"https://openalex.org/W7127170724","doi":"https://doi.org/10.48550/arxiv.2601.22548"},"language":null,"primary_location":{"id":"pmh:doi:10.48550/arxiv.2601.22548","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":null,"any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5124837751","display_name":"Dani Roytburg","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Roytburg, Dani","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5117630218","display_name":"Matthew Bozoukov","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bozoukov, Matthew","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5124818088","display_name":"Matthew Nguyen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Nguyen, Matthew","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Barzdukas, Jou","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Barzdukas, Jou","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Puig-Hall, Mackenzie","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Puig-Hall, Mackenzie","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5125966297","display_name":"Narmeen Oozeer","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Oozeer, Narmeen","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":6,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.32409998774528503,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.32409998774528503,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10883","display_name":"Ethics and Social Impacts of AI","score":0.12479999661445618,"subfield":{"id":"https://openalex.org/subfields/3311","display_name":"Safety Research"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T11704","display_name":"Mobile Crowdsensing and Crowdsourcing","score":0.1177000030875206,"subfield":{"id":"https://openalex.org/subfields/1706","display_name":"Computer Science Applications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/entropy","display_name":"Entropy (arrow of time)","score":0.46299999952316284},{"id":"https://openalex.org/keywords/baseline","display_name":"Baseline (sea)","score":0.45249998569488525},{"id":"https://openalex.org/keywords/quality","display_name":"Quality (philosophy)","score":0.4480000138282776},{"id":"https://openalex.org/keywords/narcissism","display_name":"Narcissism","score":0.4072999954223633},{"id":"https://openalex.org/keywords/voting","display_name":"Voting","score":0.37229999899864197},{"id":"https://openalex.org/keywords/data-quality","display_name":"Data quality","score":0.3677000105381012},{"id":"https://openalex.org/keywords/noise","display_name":"Noise (video)","score":0.3292999863624573}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6219000220298767},{"id":"https://openalex.org/C106301342","wikidata":"https://www.wikidata.org/wiki/Q4117933","display_name":"Entropy (arrow of time)","level":2,"score":0.46299999952316284},{"id":"https://openalex.org/C12725497","wikidata":"https://www.wikidata.org/wiki/Q810247","display_name":"Baseline (sea)","level":2,"score":0.45249998569488525},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.4480000138282776},{"id":"https://openalex.org/C505070042","wikidata":"https://www.wikidata.org/wiki/Q186529","display_name":"Narcissism","level":2,"score":0.4072999954223633},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.4009000062942505},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3822000026702881},{"id":"https://openalex.org/C520049643","wikidata":"https://www.wikidata.org/wiki/Q189760","display_name":"Voting","level":3,"score":0.37229999899864197},{"id":"https://openalex.org/C24756922","wikidata":"https://www.wikidata.org/wiki/Q1757694","display_name":"Data quality","level":3,"score":0.3677000105381012},{"id":"https://openalex.org/C99498987","wikidata":"https://www.wikidata.org/wiki/Q2210247","display_name":"Noise (video)","level":3,"score":0.3292999863624573},{"id":"https://openalex.org/C2164484","wikidata":"https://www.wikidata.org/wiki/Q5170150","display_name":"Core (optical fiber)","level":2,"score":0.3287000060081482},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3165000081062317},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.2922999858856201},{"id":"https://openalex.org/C96608239","wikidata":"https://www.wikidata.org/wiki/Q1199823","display_name":"Statistical power","level":2,"score":0.2854999899864197},{"id":"https://openalex.org/C18762648","wikidata":"https://www.wikidata.org/wiki/Q42213","display_name":"Work (physics)","level":2,"score":0.27390000224113464},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.27219998836517334},{"id":"https://openalex.org/C2777607469","wikidata":"https://www.wikidata.org/wiki/Q2915214","display_name":"Sanity","level":2,"score":0.2678999900817871},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.2554999887943268},{"id":"https://openalex.org/C87007009","wikidata":"https://www.wikidata.org/wiki/Q210832","display_name":"Statistical hypothesis testing","level":2,"score":0.25270000100135803}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:doi:10.48550/arxiv.2601.22548","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},{"id":"doi:10.48550/arxiv.2601.22548","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2601.22548","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:doi:10.48550/arxiv.2601.22548","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Recent":[0],"research":[1,169],"has":[2],"shown":[3],"that":[4,113,123],"large":[5],"language":[6],"models":[7],"(LLMs)":[8],"favor":[9],"their":[10,88,91],"own":[11],"outputs":[12,99],"when":[13,68],"acting":[14],"as":[15],"judges,":[16],"undermining":[17],"the":[18,69,111,121,153,185],"integrity":[19],"of":[20,45,84,87,142,155,188],"automated":[21],"post-training":[22],"and":[23,192],"evaluation":[24,33,159],"workflows.":[25],"However,":[26],"it":[27,124],"is":[28,90],"difficult":[29],"to":[30,72,184],"disentangle":[31],"which":[32,54,74,109],"biases":[34],"are":[35],"explained":[36],"by":[37,59,172],"narcissism":[38],"versus":[39,157],"general":[40],"experimental":[41],"confounds,":[42],"distorting":[43],"measurements":[44],"self-preference":[46,95,171],"bias.":[47],"We":[48],"discover":[49],"a":[50,114],"core":[51],"methodological":[52],"confound":[53],"could":[55],"reduce":[56],"measurement":[57],"error":[58],"89.6%.":[60],"Specifically,":[61],"LLM":[62,162],"evaluators":[63],"may":[64],"deliver":[65],"self-preferring":[66],"verdicts":[67],"judge":[70,115],"responds":[71],"queries":[73],"they":[75],"completed":[76],"incorrectly":[77,116],"themselves;":[78],"this":[79,134,181],"would":[80],"be":[81],"true":[82],"regardless":[83],"whether":[85],"one":[86],"responses":[89],"own.":[92],"To":[93],"decouple":[94],"signals":[96],"from":[97,130,161,176],"noisy":[98,174],"on":[100,137,170,190],"hard":[101],"problems,":[102],"we":[103,149],"introduce":[104],"an":[105,127],"Evaluator":[106],"Quality":[107],"Baseline,":[108],"compares":[110],"probability":[112,122],"votes":[117,125,160],"for":[118,126],"itself":[119],"against":[120],"incorrect":[128],"response":[129],"another":[131],"model.":[132],"Evaluating":[133],"simple":[135],"baseline":[136,166],"37,448":[138],"queries,":[139],"only":[140],"51%":[141],"initial":[143],"findings":[144],"retain":[145],"statistical":[146],"significance.":[147],"Finally,":[148],"turn":[150],"towards":[151],"characterizing":[152],"entropy":[154],"\"easy\"":[156],"\"hard\"":[158],"judges.":[163],"Our":[164],"corrective":[165],"enables":[167],"future":[168],"eliminating":[173],"data":[175],"potential":[177],"solutions.":[178],"More":[179],"widely,":[180],"work":[182,189],"contributes":[183],"growing":[186],"body":[187],"cataloging":[191],"isolating":[193],"judge-bias":[194],"effects.":[195]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-02-03T00:00:00"}
