{"id":"https://openalex.org/W4416036986","doi":"https://doi.org/10.18653/v1/2025.emnlp-main.342","title":"Measuring Bias or Measuring the Task: Understanding the Brittle Nature of LLM Gender Biases","display_name":"Measuring Bias or Measuring the Task: Understanding the Brittle Nature of LLM Gender Biases","publication_year":2025,"publication_date":"2025-01-01","ids":{"openalex":"https://openalex.org/W4416036986","doi":"https://doi.org/10.18653/v1/2025.emnlp-main.342"},"language":null,"primary_location":{"id":"doi:10.18653/v1/2025.emnlp-main.342","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2025.emnlp-main.342","pdf_url":"https://aclanthology.org/2025.emnlp-main.342.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://aclanthology.org/2025.emnlp-main.342.pdf","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5119800376","display_name":"Bufan Gao","orcid":null},"institutions":[{"id":"https://openalex.org/I161318765","display_name":"University of California, Los Angeles","ror":"https://ror.org/046rm7j60","country_code":"US","type":"education","lineage":["https://openalex.org/I161318765"]},{"id":"https://openalex.org/I40347166","display_name":"University of Chicago","ror":"https://ror.org/024mw5h28","country_code":"US","type":"education","lineage":["https://openalex.org/I40347166"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Bufan Gao","raw_affiliation_strings":["The University of Chicago University of California , Los Angeles"],"affiliations":[{"raw_affiliation_string":"The University of Chicago University of California , Los Angeles","institution_ids":["https://openalex.org/I40347166","https://openalex.org/I161318765"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5073949341","display_name":"Elisa Kreiss","orcid":null},"institutions":[{"id":"https://openalex.org/I161318765","display_name":"University of California, Los Angeles","ror":"https://ror.org/046rm7j60","country_code":"US","type":"education","lineage":["https://openalex.org/I161318765"]},{"id":"https://openalex.org/I40347166","display_name":"University of Chicago","ror":"https://ror.org/024mw5h28","country_code":"US","type":"education","lineage":["https://openalex.org/I40347166"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Elisa Kreiss","raw_affiliation_strings":["The University of Chicago University of California , Los Angeles"],"affiliations":[{"raw_affiliation_string":"The University of Chicago University of California , Los Angeles","institution_ids":["https://openalex.org/I40347166","https://openalex.org/I161318765"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":2,"corresponding_author_ids":["https://openalex.org/A5119800376"],"corresponding_institution_ids":["https://openalex.org/I161318765","https://openalex.org/I40347166"],"apc_list":null,"apc_paid":null,"fwci":4.206,"has_fulltext":true,"cited_by_count":1,"citation_normalized_percentile":{"value":0.94802114,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":97,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"6745","last_page":"6761"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12515","display_name":"Gender and Technology in Education","score":0.06040000170469284,"subfield":{"id":"https://openalex.org/subfields/3318","display_name":"Gender Studies"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T12515","display_name":"Gender and Technology in Education","score":0.06040000170469284,"subfield":{"id":"https://openalex.org/subfields/3318","display_name":"Gender Studies"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T10843","display_name":"Diversity and Career in Medicine","score":0.0560000017285347,"subfield":{"id":"https://openalex.org/subfields/3318","display_name":"Gender Studies"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T11047","display_name":"Gender Diversity and Inequality","score":0.054499998688697815,"subfield":{"id":"https://openalex.org/subfields/3318","display_name":"Gender Studies"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/gender-bias","display_name":"Gender bias","score":0.322299987077713},{"id":"https://openalex.org/keywords/measure","display_name":"Measure (data warehouse)","score":0.31130000948905945},{"id":"https://openalex.org/keywords/noise","display_name":"Noise (video)","score":0.2937000095844269},{"id":"https://openalex.org/keywords/cognitive-bias","display_name":"Cognitive bias","score":0.2700999975204468},{"id":"https://openalex.org/keywords/scale","display_name":"Scale (ratio)","score":0.2660999894142151}],"concepts":[{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.48809999227523804},{"id":"https://openalex.org/C77805123","wikidata":"https://www.wikidata.org/wiki/Q161272","display_name":"Social psychology","level":1,"score":0.398499995470047},{"id":"https://openalex.org/C180747234","wikidata":"https://www.wikidata.org/wiki/Q23373","display_name":"Cognitive psychology","level":1,"score":0.34950000047683716},{"id":"https://openalex.org/C2983427547","wikidata":"https://www.wikidata.org/wiki/Q93200","display_name":"Gender bias","level":2,"score":0.322299987077713},{"id":"https://openalex.org/C2780009758","wikidata":"https://www.wikidata.org/wiki/Q6804172","display_name":"Measure (data warehouse)","level":2,"score":0.31130000948905945},{"id":"https://openalex.org/C99498987","wikidata":"https://www.wikidata.org/wiki/Q2210247","display_name":"Noise (video)","level":3,"score":0.2937000095844269},{"id":"https://openalex.org/C189216375","wikidata":"https://www.wikidata.org/wiki/Q1127759","display_name":"Cognitive bias","level":3,"score":0.2700999975204468},{"id":"https://openalex.org/C2778755073","wikidata":"https://www.wikidata.org/wiki/Q10858537","display_name":"Scale (ratio)","level":2,"score":0.2660999894142151},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.26589998602867126},{"id":"https://openalex.org/C2780791683","wikidata":"https://www.wikidata.org/wiki/Q846785","display_name":"Action (physics)","level":2,"score":0.2554999887943268},{"id":"https://openalex.org/C18762648","wikidata":"https://www.wikidata.org/wiki/Q42213","display_name":"Work (physics)","level":2,"score":0.2531000077724457},{"id":"https://openalex.org/C61797465","wikidata":"https://www.wikidata.org/wiki/Q1188986","display_name":"Term (time)","level":2,"score":0.25099998712539673}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.18653/v1/2025.emnlp-main.342","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2025.emnlp-main.342","pdf_url":"https://aclanthology.org/2025.emnlp-main.342.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.18653/v1/2025.emnlp-main.342","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2025.emnlp-main.342","pdf_url":"https://aclanthology.org/2025.emnlp-main.342.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4416036986.pdf","grobid_xml":"https://content.openalex.org/works/W4416036986.grobid-xml"},"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"As":[0],"LLMs":[1],"are":[2],"increasingly":[3],"applied":[4],"in":[5,71],"socially":[6],"impactful":[7],"settings,":[8],"concerns":[9],"about":[10],"gender":[11,52,69,120,147],"bias":[12,70,133,148],"have":[13],"prompted":[14],"growing":[15],"efforts":[16,24],"both":[17,101],"to":[18,124,131,135],"measure":[19],"and":[20,86,103,159,174],"mitigate":[21],"such":[22],"bias.These":[23],"often":[25],"rely":[26],"on":[27],"evaluation":[28,116],"tasks":[29],"that":[30,44,79,107,109],"differ":[31],"from":[32],"natural":[33],"language":[34],"distributions,":[35],"as":[36],"they":[37],"typically":[38],"involve":[39],"carefully":[40],"constructed":[41],"task":[42,66,98],"prompts":[43,108],"overtly":[45],"or":[46],"covertly":[47],"signal":[48],"the":[49,61,82,143,156,180],"presence":[50],"of":[51,64,145,183],"bias-related":[53],"content.In":[54],"this":[55,177],"paper,":[56],"we":[57,73],"examine":[58],"how":[59],"signaling":[60],"evaluative":[62],"purpose":[63],"a":[65,152],"impacts":[67],"measured":[68],"LLMs.Concretely,":[72],"test":[74],"models":[75],"under":[76],"prompt":[77,94],"conditions":[78],"(1)":[80],"make":[81,88],"testing":[83,167],"context":[84],"salient,":[85],"(2)":[87],"gender-focused":[89],"content":[90],"salient.We":[91],"then":[92],"assess":[93],"sensitivity":[95],"across":[96],"four":[97],"formats":[99],"with":[100,113],"token-probability":[102],"discrete-choice":[104],"metrics.We":[105],"find":[106],"more":[110],"clearly":[111],"align":[112],"(gender":[114],"bias)":[115],"framing":[117],"elicit":[118],"distinct":[119],"output":[121],"distributions":[122],"compared":[123],"less":[125],"evaluation-framed":[126],"prompts.Discrete-choice":[127],"metrics":[128],"further":[129],"tend":[130],"amplify":[132],"relative":[134],"probabilistic":[136],"measures.These":[137],"findings":[138],"do":[139],"not":[140],"only":[141],"highlight":[142],"brittleness":[144],"LLM":[146,170],"evaluations":[149],"but":[150],"open":[151],"new":[153],"puzzle":[154],"for":[155,179],"NLP":[157],"benchmarking":[158],"development":[160],"community:":[161],"To":[162],"what":[163,175],"extent":[164],"can":[165],"well-controlled":[166],"designs":[168],"trigger":[169],"\"testing":[171],"mode\"":[172],"performance,":[173],"does":[176],"mean":[178],"ecological":[181],"validity":[182],"future":[184],"benchmarks.":[185]},"counts_by_year":[{"year":2026,"cited_by_count":1}],"updated_date":"2026-03-11T06:11:40.159057","created_date":"2025-11-08T00:00:00"}
