{"id":"https://openalex.org/W7161143249","doi":"https://doi.org/10.48550/arxiv.2605.12530","title":"In-Situ Behavioral Evaluation for LLM Fairness, Not Standardized-Test Scores","display_name":"In-Situ Behavioral Evaluation for LLM Fairness, Not Standardized-Test Scores","publication_year":2026,"publication_date":"2026-04-21","ids":{"openalex":"https://openalex.org/W7161143249","doi":"https://doi.org/10.48550/arxiv.2605.12530"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.12530","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.12530","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.12530","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5006403327","display_name":"Zeyu Tang","orcid":"https://orcid.org/0000-0002-4423-4728"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tang, Zeyu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136130315","display_name":"Sang T. Truong","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Truong, Sang T.","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136103898","display_name":"Deonna Owens","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Owens, Deonna","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5032066429","display_name":"Shreyas Sharma","orcid":"https://orcid.org/0009-0000-4348-1287"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sharma, Shreyas","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5030278011","display_name":"Yibo Jacky Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Yibo Jacky","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5007251379","display_name":"Brando Miranda","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Miranda, Brando","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5136144601","display_name":"Sanmi Koyejo","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Koyejo, Sanmi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":7,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10883","display_name":"Ethics and Social Impacts of AI","score":0.2703999876976013,"subfield":{"id":"https://openalex.org/subfields/3311","display_name":"Safety Research"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T10883","display_name":"Ethics and Social Impacts of AI","score":0.2703999876976013,"subfield":{"id":"https://openalex.org/subfields/3311","display_name":"Safety Research"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.18240000307559967,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12128","display_name":"AI in Service Interactions","score":0.06650000065565109,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/conversation","display_name":"Conversation","score":0.8851000070571899},{"id":"https://openalex.org/keywords/variation","display_name":"Variation (astronomy)","score":0.585099995136261},{"id":"https://openalex.org/keywords/identity","display_name":"Identity (music)","score":0.52920001745224},{"id":"https://openalex.org/keywords/position","display_name":"Position (finance)","score":0.3831000030040741},{"id":"https://openalex.org/keywords/position-paper","display_name":"Position paper","score":0.3677000105381012},{"id":"https://openalex.org/keywords/persistence","display_name":"Persistence (discontinuity)","score":0.3671000003814697}],"concepts":[{"id":"https://openalex.org/C2777200299","wikidata":"https://www.wikidata.org/wiki/Q52943","display_name":"Conversation","level":2,"score":0.8851000070571899},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.6358000040054321},{"id":"https://openalex.org/C2778334786","wikidata":"https://www.wikidata.org/wiki/Q1586270","display_name":"Variation (astronomy)","level":2,"score":0.585099995136261},{"id":"https://openalex.org/C2778355321","wikidata":"https://www.wikidata.org/wiki/Q17079427","display_name":"Identity (music)","level":2,"score":0.52920001745224},{"id":"https://openalex.org/C77805123","wikidata":"https://www.wikidata.org/wiki/Q161272","display_name":"Social psychology","level":1,"score":0.4587000012397766},{"id":"https://openalex.org/C180747234","wikidata":"https://www.wikidata.org/wiki/Q23373","display_name":"Cognitive psychology","level":1,"score":0.412200003862381},{"id":"https://openalex.org/C198082294","wikidata":"https://www.wikidata.org/wiki/Q3399648","display_name":"Position (finance)","level":2,"score":0.3831000030040741},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.37540000677108765},{"id":"https://openalex.org/C78780964","wikidata":"https://www.wikidata.org/wiki/Q7233193","display_name":"Position paper","level":2,"score":0.3677000105381012},{"id":"https://openalex.org/C2781009140","wikidata":"https://www.wikidata.org/wiki/Q7170389","display_name":"Persistence (discontinuity)","level":2,"score":0.3671000003814697},{"id":"https://openalex.org/C98447023","wikidata":"https://www.wikidata.org/wiki/Q1540351","display_name":"Social identity theory","level":3,"score":0.3652999997138977},{"id":"https://openalex.org/C2776608160","wikidata":"https://www.wikidata.org/wiki/Q4785462","display_name":"Natural (archaeology)","level":2,"score":0.32109999656677246},{"id":"https://openalex.org/C14262774","wikidata":"https://www.wikidata.org/wiki/Q4880695","display_name":"Behavior change","level":2,"score":0.3109000027179718},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.29809999465942383},{"id":"https://openalex.org/C46312422","wikidata":"https://www.wikidata.org/wiki/Q11024","display_name":"Communication","level":1,"score":0.29249998927116394},{"id":"https://openalex.org/C138496976","wikidata":"https://www.wikidata.org/wiki/Q175002","display_name":"Developmental psychology","level":1,"score":0.272599995136261},{"id":"https://openalex.org/C75630572","wikidata":"https://www.wikidata.org/wiki/Q538904","display_name":"Applied psychology","level":1,"score":0.2646999955177307},{"id":"https://openalex.org/C5570062","wikidata":"https://www.wikidata.org/wiki/Q3919817","display_name":"Behavioural sciences","level":2,"score":0.2632000148296356}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.12530","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.12530","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.12530","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.12530","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","score":0.5431960225105286,"display_name":"Quality Education"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"LLM":[0],"fairness":[1,33,45,159],"should":[2],"be":[3,21],"evaluated":[4],"through":[5],"in-situ":[6,78],"conversational":[7,67,84],"behavior":[8,79,85],"rather":[9,103],"than":[10,104],"standardized-test":[11,18,98,169],"Q&amp;A":[12],"benchmarks.":[13],"We":[14,62],"show":[15],"that":[16,69,152],"the":[17,32,39,49,52,106,118,130,168],"paradigm":[19,170],"can":[20],"structurally":[22],"unreliable:":[23],"surface-level":[24],"prompt":[25],"construction":[26],"choices,":[27],"although":[28],"entirely":[29],"orthogonal":[30],"to":[31,127],"question":[34],"being":[35],"tested,":[36],"account":[37],"for":[38,77],"majority":[40],"of":[41,93,166],"score":[42],"variance,":[43],"shift":[44],"conclusions":[46],"in":[47,56,59,158],"both":[48],"direction":[50],"and":[51,54,120,140,161],"magnitude,":[53],"result":[55],"severe":[57],"discordance":[58],"model":[60],"rankings.":[61],"develop":[63],"MAC-Fairness,":[64],"a":[65,164],"multi-agent":[66,95],"framework":[68],"embeds":[70],"controlled":[71],"variation":[72],"factors":[73],"into":[74],"multi-round":[75],"dialogue":[76],"evaluation,":[80],"examining":[81],"how":[82],"models'":[83],"shifts":[86],"when":[87],"identity":[88,141],"is":[89],"varied":[90],"as":[91,100,105],"part":[92],"natural":[94],"interaction.":[96],"Repurposing":[97],"questions":[99],"conversation":[101,135],"seeds":[102],"evaluation":[107,146,162],"instrument,":[108],"we":[109],"evaluate":[110],"position":[111],"persistence":[112],"(how":[113,123],"they":[114,125],"hold":[115],"positions,":[116],"from":[117,129],"self-perspective)":[119],"peer":[121],"receptiveness":[122],"receptive":[124],"are":[126],"peers,":[128],"other-perspective)":[131],"across":[132,155],"8":[133],"million":[134],"transcripts":[136],"spanning":[137],"multiple":[138],"models":[139],"presence":[142],"configurations.":[143],"In-situ":[144],"behavioral":[145,150],"reveals":[147],"stable,":[148],"model-specific":[149],"signatures":[151],"could":[153],"generalize":[154],"benchmarks":[156],"differing":[157],"targets":[160],"methodologies,":[163],"form":[165],"evidence":[167],"does":[171],"not":[172],"offer.":[173]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-05-15T00:00:00"}
