{"id":"https://openalex.org/W7161058983","doi":"https://doi.org/10.48550/arxiv.2605.11954","title":"Assessing and Mitigating Miscalibration in LLM-Based Social Science Measurement","display_name":"Assessing and Mitigating Miscalibration in LLM-Based Social Science Measurement","publication_year":2026,"publication_date":"2026-05-12","ids":{"openalex":"https://openalex.org/W7161058983","doi":"https://doi.org/10.48550/arxiv.2605.11954"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.11954","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.11954","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Preprint"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.11954","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5125356184","display_name":"Huilin Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Jinyuan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136042022","display_name":"Ningyuan Deng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Deng, Ningyuan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5136040401","display_name":"Yi Yang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Yi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":0,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T13910","display_name":"Computational and Text Analysis Methods","score":0.8780999779701233,"subfield":{"id":"https://openalex.org/subfields/3300","display_name":"General Social Sciences"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T13910","display_name":"Computational and Text Analysis Methods","score":0.8780999779701233,"subfield":{"id":"https://openalex.org/subfields/3300","display_name":"General Social Sciences"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T11948","display_name":"Machine Learning in Materials Science","score":0.015599999576807022,"subfield":{"id":"https://openalex.org/subfields/2505","display_name":"Materials Chemistry"},"field":{"id":"https://openalex.org/fields/25","display_name":"Materials Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13398","display_name":"Data Analysis with R","score":0.010300000198185444,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/brier-score","display_name":"Brier score","score":0.5673999786376953},{"id":"https://openalex.org/keywords/calibration","display_name":"Calibration","score":0.5566999912261963},{"id":"https://openalex.org/keywords/discriminative-model","display_name":"Discriminative model","score":0.4830000102519989},{"id":"https://openalex.org/keywords/confidence-interval","display_name":"Confidence interval","score":0.41429999470710754},{"id":"https://openalex.org/keywords/classifier","display_name":"Classifier (UML)","score":0.39500001072883606},{"id":"https://openalex.org/keywords/pipeline","display_name":"Pipeline (software)","score":0.38940000534057617},{"id":"https://openalex.org/keywords/probabilistic-logic","display_name":"Probabilistic logic","score":0.3828999996185303},{"id":"https://openalex.org/keywords/empirical-research","display_name":"Empirical research","score":0.37130001187324524},{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.3635999858379364}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.608299970626831},{"id":"https://openalex.org/C35405484","wikidata":"https://www.wikidata.org/wiki/Q4967066","display_name":"Brier score","level":2,"score":0.5673999786376953},{"id":"https://openalex.org/C165838908","wikidata":"https://www.wikidata.org/wiki/Q736777","display_name":"Calibration","level":2,"score":0.5566999912261963},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.5110999941825867},{"id":"https://openalex.org/C97931131","wikidata":"https://www.wikidata.org/wiki/Q5282087","display_name":"Discriminative model","level":2,"score":0.4830000102519989},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.47839999198913574},{"id":"https://openalex.org/C44249647","wikidata":"https://www.wikidata.org/wiki/Q208498","display_name":"Confidence interval","level":2,"score":0.41429999470710754},{"id":"https://openalex.org/C95623464","wikidata":"https://www.wikidata.org/wiki/Q1096149","display_name":"Classifier (UML)","level":2,"score":0.39500001072883606},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.38940000534057617},{"id":"https://openalex.org/C149782125","wikidata":"https://www.wikidata.org/wiki/Q160039","display_name":"Econometrics","level":1,"score":0.38519999384880066},{"id":"https://openalex.org/C49937458","wikidata":"https://www.wikidata.org/wiki/Q2599292","display_name":"Probabilistic logic","level":2,"score":0.3828999996185303},{"id":"https://openalex.org/C120936955","wikidata":"https://www.wikidata.org/wiki/Q2155640","display_name":"Empirical research","level":2,"score":0.37130001187324524},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.3635999858379364},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.35019999742507935},{"id":"https://openalex.org/C175309249","wikidata":"https://www.wikidata.org/wiki/Q725864","display_name":"Pipeline transport","level":2,"score":0.34529998898506165},{"id":"https://openalex.org/C9357733","wikidata":"https://www.wikidata.org/wiki/Q6878417","display_name":"Missing data","level":2,"score":0.33160001039505005},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.32510000467300415},{"id":"https://openalex.org/C137209882","wikidata":"https://www.wikidata.org/wiki/Q1403517","display_name":"Measurement uncertainty","level":2,"score":0.3222000002861023},{"id":"https://openalex.org/C12725497","wikidata":"https://www.wikidata.org/wiki/Q810247","display_name":"Baseline (sea)","level":2,"score":0.3149999976158142},{"id":"https://openalex.org/C166052673","wikidata":"https://www.wikidata.org/wiki/Q83021","display_name":"Empirical evidence","level":2,"score":0.31130000948905945},{"id":"https://openalex.org/C32230216","wikidata":"https://www.wikidata.org/wiki/Q7882499","display_name":"Uncertainty quantification","level":2,"score":0.29339998960494995},{"id":"https://openalex.org/C105795698","wikidata":"https://www.wikidata.org/wiki/Q12483","display_name":"Statistics","level":1,"score":0.29190000891685486},{"id":"https://openalex.org/C140529851","wikidata":"https://www.wikidata.org/wiki/Q5160083","display_name":"Confidence and prediction bands","level":3,"score":0.2827000021934509},{"id":"https://openalex.org/C133462117","wikidata":"https://www.wikidata.org/wiki/Q4929239","display_name":"Data collection","level":2,"score":0.2741999924182892},{"id":"https://openalex.org/C75684735","wikidata":"https://www.wikidata.org/wiki/Q858810","display_name":"Big data","level":2,"score":0.26809999346733093},{"id":"https://openalex.org/C83546350","wikidata":"https://www.wikidata.org/wiki/Q1139051","display_name":"Regression","level":2,"score":0.26489999890327454},{"id":"https://openalex.org/C58041806","wikidata":"https://www.wikidata.org/wiki/Q1660484","display_name":"Imputation (statistics)","level":3,"score":0.2612000107765198},{"id":"https://openalex.org/C45804977","wikidata":"https://www.wikidata.org/wiki/Q7239673","display_name":"Predictive modelling","level":2,"score":0.26010000705718994},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.257999986410141},{"id":"https://openalex.org/C45942800","wikidata":"https://www.wikidata.org/wiki/Q245652","display_name":"Ensemble learning","level":2,"score":0.25780001282691956},{"id":"https://openalex.org/C19619285","wikidata":"https://www.wikidata.org/wiki/Q196372","display_name":"Observational error","level":2,"score":0.2551000118255615},{"id":"https://openalex.org/C174348530","wikidata":"https://www.wikidata.org/wiki/Q188635","display_name":"Bridging (networking)","level":2,"score":0.25290000438690186}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.11954","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.11954","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"Preprint"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.11954","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.11954","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Preprint"},"sustainable_development_goals":[{"score":0.7720377445220947,"id":"https://metadata.un.org/sdg/10","display_name":"Reduced inequalities"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Large":[0],"language":[1],"models":[2,157],"(LLMs)":[3],"are":[4],"increasingly":[5],"used":[6],"in":[7,56],"social":[8,58,91,179],"science":[9,59,92,180],"as":[10,185,192],"scalable":[11],"measurement":[12,47,188],"tools":[13],"for":[14,129,158],"converting":[15],"unstructured":[16],"text":[17],"into":[18,144],"variables":[19],"that":[20,39,71,177],"can":[21,75],"enter":[22],"standard":[23],"empirical":[24,43],"designs.":[25],"Measurement":[26],"validity":[27],"demands":[28],"more":[29],"than":[30,191],"high":[31],"average":[32],"accuracy,":[33],"which":[34],"requires":[35],"well":[36],"calibrated":[37],"confidence":[38,72,82,111,143],"faithfully":[40],"reflects":[41],"the":[42,53],"probability":[44],"of":[45,187],"each":[46],"being":[48],"correct.":[49],"This":[50],"paper":[51],"studies":[52],"model":[54,108],"miscalibration":[55],"LLM-based":[57,178],"measurement.":[60],"We":[61,85],"begin":[62],"with":[63,115,132],"a":[64,119,124,145,151],"case":[65],"study":[66],"on":[67,155],"FOMC":[68],"and":[69,101,107,140,170],"show":[70],"based":[73],"filtering":[74],"change":[76],"downstream":[77],"regression":[78],"estimates":[79],"when":[80],"LLM":[81,138],"is":[83,112],"miscalibrated.":[84],"then":[86,149],"audit":[87],"calibration":[88,184],"across":[89,162],"14":[90],"constructs":[93],"covering":[94],"both":[95],"proprietary":[96],"models,":[97],"including":[98],"GPT-5-mini,":[99],"DeepSeek-V3.2,":[100],"open":[102],"source":[103],"models.":[104],"Across":[105],"tasks":[106],"families,":[109],"reported":[110],"poorly":[113],"aligned":[114],"tolerance-based":[116],"correctness.":[117],"As":[118],"simple":[120],"mitigation,":[121],"we":[122],"propose":[123],"soft":[125,146],"label":[126],"distillation":[127],"pipeline":[128],"calibrating":[130],"Bert":[131],"LLM.":[133],"The":[134],"method":[135],"converts":[136],"an":[137,193],"score":[139],"its":[141],"verbalized":[142],"target":[147],"distribution,":[148],"trains":[150],"smaller":[152],"discriminative":[153],"classifier":[154],"encoder":[156],"these":[159],"targets.":[160],"Averaged":[161],"datasets,":[163],"this":[164],"approach":[165],"reduces":[166],"ECE":[167],"by":[168,172],"43.2\\%":[169],"Brier":[171],"34.0\\%.":[173],"These":[174],"results":[175],"suggest":[176],"pipelines":[181],"should":[182],"treat":[183],"part":[186],"validity,":[189],"rather":[190],"optional":[194],"post-processing":[195],"concern.":[196]},"counts_by_year":[],"updated_date":"2026-07-01T06:00:48.157686","created_date":"2026-05-14T00:00:00"}
