{"id":"https://openalex.org/W7156363542","doi":"https://doi.org/10.48550/arxiv.2604.22555","title":"Using Embedding Models to Improve Probabilistic Race Prediction","display_name":"Using Embedding Models to Improve Probabilistic Race Prediction","publication_year":2026,"publication_date":"2026-04-24","ids":{"openalex":"https://openalex.org/W7156363542","doi":"https://doi.org/10.48550/arxiv.2604.22555"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.22555","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.22555","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.22555","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5134665357","display_name":"Noah Dasanaike","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Dasanaike, Noah","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5015451961","display_name":"Kosuke Imai","orcid":"https://orcid.org/0000-0002-2748-1022"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Imai, 
Kosuke","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":0,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12380","display_name":"Authorship Attribution and Profiling","score":0.3653999865055084,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12380","display_name":"Authorship Attribution and Profiling","score":0.3653999865055084,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12970","display_name":"Names, Identity, and Discrimination Research","score":0.2946000099182129,"subfield":{"id":"https://openalex.org/subfields/3312","display_name":"Sociology and Political Science"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T11719","display_name":"Data Quality and Management","score":0.05429999902844429,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision 
Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/geocoding","display_name":"Geocoding","score":0.816100001335144},{"id":"https://openalex.org/keywords/race","display_name":"Race (biology)","score":0.6912999749183655},{"id":"https://openalex.org/keywords/embedding","display_name":"Embedding","score":0.6335999965667725},{"id":"https://openalex.org/keywords/probabilistic-logic","display_name":"Probabilistic logic","score":0.5878999829292297},{"id":"https://openalex.org/keywords/census","display_name":"Census","score":0.546999990940094},{"id":"https://openalex.org/keywords/bayesian-probability","display_name":"Bayesian probability","score":0.4968999922275543},{"id":"https://openalex.org/keywords/range","display_name":"Range (aeronautics)","score":0.36059999465942383}],"concepts":[{"id":"https://openalex.org/C42629822","wikidata":"https://www.wikidata.org/wiki/Q1346408","display_name":"Geocoding","level":2,"score":0.816100001335144},{"id":"https://openalex.org/C76509639","wikidata":"https://www.wikidata.org/wiki/Q918036","display_name":"Race (biology)","level":2,"score":0.6912999749183655},{"id":"https://openalex.org/C41608201","wikidata":"https://www.wikidata.org/wiki/Q980509","display_name":"Embedding","level":2,"score":0.6335999965667725},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5942000150680542},{"id":"https://openalex.org/C49937458","wikidata":"https://www.wikidata.org/wiki/Q2599292","display_name":"Probabilistic logic","level":2,"score":0.5878999829292297},{"id":"https://openalex.org/C52130261","wikidata":"https://www.wikidata.org/wiki/Q39825","display_name":"Census","level":3,"score":0.546999990940094},{"id":"https://openalex.org/C107673813","wikidata":"https://www.wikidata.org/wiki/Q812534","display_name":"Bayesian 
probability","level":2,"score":0.4968999922275543},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.45339998602867126},{"id":"https://openalex.org/C204323151","wikidata":"https://www.wikidata.org/wiki/Q905424","display_name":"Range (aeronautics)","level":2,"score":0.36059999465942383},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3529999852180481},{"id":"https://openalex.org/C205649164","wikidata":"https://www.wikidata.org/wiki/Q1071","display_name":"Geography","level":0,"score":0.3109000027179718},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.3089999854564667},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.295199990272522},{"id":"https://openalex.org/C114289077","wikidata":"https://www.wikidata.org/wiki/Q3284399","display_name":"Statistical model","level":2,"score":0.2939999997615814},{"id":"https://openalex.org/C160234255","wikidata":"https://www.wikidata.org/wiki/Q812535","display_name":"Bayesian inference","level":3,"score":0.2867000102996826},{"id":"https://openalex.org/C61797465","wikidata":"https://www.wikidata.org/wiki/Q1188986","display_name":"Term (time)","level":2,"score":0.27869999408721924},{"id":"https://openalex.org/C63584917","wikidata":"https://www.wikidata.org/wiki/Q333286","display_name":"Bounding overwatch","level":2,"score":0.2720000147819519},{"id":"https://openalex.org/C33724603","wikidata":"https://www.wikidata.org/wiki/Q812540","display_name":"Bayesian network","level":2,"score":0.2687999904155731},{"id":"https://openalex.org/C207609745","wikidata":"https://www.wikidata.org/wiki/Q4944086","display_name":"Bootstrapping 
(finance)","level":2,"score":0.25290000438690186}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.22555","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.22555","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.22555","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.22555","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"score":0.6025013327598572,"id":"https://metadata.un.org/sdg/10","display_name":"Reduced 
inequalities"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Estimating":[0],"racial":[1],"disparity":[2],"requires":[3],"individual-level":[4],"race":[5,116,177],"data,":[6],"which":[7,31,92],"are":[8,195],"often":[9],"unavailable":[10],"due":[11],"to":[12,97,114],"the":[13,53,123,180,184,198],"sensitivity":[14],"of":[15,52],"collecting":[16],"such":[17,66],"information.":[18],"To":[19,83],"address":[20,84],"this":[21,85],"problem,":[22],"many":[23],"researchers":[24],"utilize":[25],"Bayesian":[26],"Improved":[27],"Surname":[28],"Geocoding":[29],"(BISG),":[30],"have":[32],"critically":[33],"relied":[34],"on":[35,75,107,156],"Census":[36,109,199],"surname":[37,110,139,144],"data.":[38],"Unfortunately,":[39],"these":[40,81],"data":[41,113,159],"capture":[42],"race-surname":[43],"relationships":[44],"only":[45,132],"for":[46,63,118,141,188],"common":[47],"surnames,":[48,133],"omitting":[49],"approximately":[50],"10%":[51],"US":[54],"population.":[55],"We":[56,125,169],"show":[57,170],"that":[58,163,171],"predictive":[59],"performance":[60],"degrades":[61],"substantially":[62],"individuals":[64],"with":[65,179],"omitted,":[67],"uncommon":[68],"surnames":[69,194],"because":[70],"standard":[71,129],"BISG":[72,90,130],"implementation":[73],"relies":[74],"a":[76,152],"uninformative":[77],"generic":[78],"prior":[79],"in":[80,122],"cases.":[82],"limitation,":[86],"we":[87],"propose":[88],"embedding-powered":[89],"(eBISG),":[91],"uses":[93],"pre-trained":[94],"text":[95],"embeddings":[96],"represent":[98],"names":[99,119],"as":[100],"dense":[101],"vectors":[102],"and":[103,111,145,151,190],"trains":[104],"neural":[105],"networks":[106],"2020":[108],"first-name":[112],"estimate":[115],"probabilities":[117],"not":[120],"covered":[121],"Census.":[124],"compare":[126],"five":[127],"approaches:":[128],"using":[131],"BIFSG":[134],"incorporat
ing":[135],"first":[136,146],"name":[137,147,167],"probabilities,":[138],"embedding":[140,148,154,182],"unlisted":[142],"names,":[143],"combining":[149],"both,":[150],"full-name":[153,181],"trained":[155],"voter":[157],"file":[158],"from":[160,197],"Southern":[161],"states":[162],"captures":[164],"interactions":[165],"between":[166],"components.":[168],"each":[172],"successive":[173],"eBISG":[174],"approach":[175],"improves":[176],"prediction,":[178],"yielding":[183],"largest":[185],"gains,":[186],"particularly":[187],"Hispanic":[189],"Asian":[191],"voters":[192],"whose":[193],"absent":[196],"list.":[200]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-04-28T00:00:00"}
