{"id":"https://openalex.org/W7135218670","doi":"https://doi.org/10.48550/arxiv.2603.11051","title":"OpenSanctions Pairs: Large-Scale Entity Matching with LLMs","display_name":"OpenSanctions Pairs: Large-Scale Entity Matching with LLMs","publication_year":2026,"publication_date":"2026-02-24","ids":{"openalex":"https://openalex.org/W7135218670","doi":"https://doi.org/10.48550/arxiv.2603.11051"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.11051","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.11051","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.11051","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5046099818","display_name":"Chandler Smith","orcid":"https://orcid.org/0009-0005-5410-0247"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Smith, Chandler","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129099421","display_name":"Magnus Sesodia","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sesodia, Magnus","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128995949","display_name":"Friedrich Lindenberg","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lindenberg, Friedrich","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5112436473","display_name":"Christian Schroeder de Witt","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"de Witt, Christian Schroeder","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5046099818"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11719","display_name":"Data Quality and Management","score":0.9749000072479248,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T11719","display_name":"Data Quality and Management","score":0.9749000072479248,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.006099999882280827,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12380","display_name":"Authorship Attribution and Profiling","score":0.003000000026077032,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.6535999774932861},{"id":"https://openalex.org/keywords/pairwise-comparison","display_name":"Pairwise comparison","score":0.6295999884605408},{"id":"https://openalex.org/keywords/matching","display_name":"Matching (statistics)","score":0.5771999955177307},{"id":"https://openalex.org/keywords/pipeline","display_name":"Pipeline (software)","score":0.4961000084877014},{"id":"https://openalex.org/keywords/baseline","display_name":"Baseline (sea)","score":0.4235999882221222},{"id":"https://openalex.org/keywords/sanctions","display_name":"Sanctions","score":0.41609999537467957}],"concepts":[{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.6535999774932861},{"id":"https://openalex.org/C184898388","wikidata":"https://www.wikidata.org/wiki/Q1435712","display_name":"Pairwise comparison","level":2,"score":0.6295999884605408},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.5771999955177307},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5558000206947327},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.4961000084877014},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.44679999351501465},{"id":"https://openalex.org/C12725497","wikidata":"https://www.wikidata.org/wiki/Q810247","display_name":"Baseline (sea)","level":2,"score":0.4235999882221222},{"id":"https://openalex.org/C2778069335","wikidata":"https://www.wikidata.org/wiki/Q32098","display_name":"Sanctions","level":2,"score":0.41609999537467957},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.3668999969959259},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3596999943256378},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.3174000084400177},{"id":"https://openalex.org/C2778348673","wikidata":"https://www.wikidata.org/wiki/Q739302","display_name":"Production (economics)","level":2,"score":0.31520000100135803},{"id":"https://openalex.org/C38652104","wikidata":"https://www.wikidata.org/wiki/Q3510521","display_name":"Computer security","level":1,"score":0.3075999915599823},{"id":"https://openalex.org/C774472","wikidata":"https://www.wikidata.org/wiki/Q6760393","display_name":"Margin (machine learning)","level":2,"score":0.3003999888896942},{"id":"https://openalex.org/C130440534","wikidata":"https://www.wikidata.org/wiki/Q14946528","display_name":"Conflation","level":2,"score":0.2705000042915344},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.25380000472068787}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.11051","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.11051","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.11051","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.11051","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"score":0.5099948644638062,"display_name":"Partnerships for the goals","id":"https://metadata.un.org/sdg/17"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"We":[0,47],"release":[1],"OpenSanctions":[2],"Pairs,":[3],"a":[4,49,86,146],"large-scale":[5],"entity":[6],"matching":[7,142],"benchmark":[8,48],"derived":[9],"from":[10],"real-world":[11],"international":[12],"sanctions":[13],"aggregation":[14],"and":[15,33,37,40,58,63,82,109,133,152,163],"analyst":[16],"deduplication.":[17],"The":[18],"dataset":[19],"contains":[20],"755,540":[21],"labeled":[22],"pairs":[23],"spanning":[24],"293":[25],"heterogeneous":[26],"sources":[27],"across":[28],"31":[29],"countries,":[30],"with":[31,85],"multilingual":[32],"cross-script":[34,131],"names,":[35],"noisy":[36],"missing":[38],"attributes,":[39],"set-valued":[41],"fields":[42],"typical":[43],"of":[44],"compliance":[45],"workflows.":[46],"production":[50,71],"rule-based":[51,72,120],"matcher":[52],"(nomenklatura":[53],"RegressionV1":[54],"algorithm)":[55],"against":[56],"open-":[57],"closed-source":[59],"LLMs":[60,67,127],"in":[61,149],"zero-":[62],"few-shot":[64],"settings.":[65],"Off-the-shelf":[66],"substantially":[68],"outperform":[69],"the":[70,119],"baseline":[73],"(91.33\\%":[74],"F1),":[75],"reaching":[76],"up":[77],"to":[78],"98.95\\%":[79],"F1":[80,84],"(GPT-4o)":[81],"98.23\\%":[83],"locally":[87],"deployable":[88],"open":[89],"model":[90],"(DeepSeek-R1-Distill-Qwen-14B).":[91],"DSPy":[92],"MIPROv2":[93],"prompt":[94],"optimization":[95],"yields":[96],"consistent":[97],"but":[98],"modest":[99],"gains,":[100],"while":[101],"adding":[102],"in-context":[103],"examples":[104],"provides":[105],"little":[106],"additional":[107],"benefit":[108],"can":[110],"degrade":[111],"performance.":[112],"Error":[113],"analysis":[114],"shows":[115],"complementary":[116],"failure":[117],"modes:":[118],"system":[121],"over-matches":[122],"(high":[123],"false":[124],"positives),":[125],"whereas":[126],"primarily":[128],"fail":[129],"on":[130],"transliteration":[132],"minor":[134],"identifier/date":[135],"inconsistencies.":[136],"These":[137],"results":[138],"indicate":[139],"that":[140],"pairwise":[141],"performance":[143],"is":[144],"approaching":[145],"practical":[147],"ceiling":[148],"this":[150],"setting,":[151],"motivate":[153],"shifting":[154],"effort":[155],"toward":[156],"pipeline":[157],"components":[158],"such":[159],"as":[160],"blocking,":[161],"clustering,":[162],"uncertainty-aware":[164],"review.":[165],"Code":[166],"available":[167],"at":[168],"https://github.com/chansmi/OSINT_entity_resolution":[169]},"counts_by_year":[],"updated_date":"2026-03-14T06:46:50.379900","created_date":"2026-03-14T00:00:00"}
