{"id":"https://openalex.org/W2900207013","doi":"https://doi.org/10.3233/mas-180447","title":"Data preparation and fuzzy matching techniques for improved statistical modeling","display_name":"Data preparation and fuzzy matching techniques for improved statistical modeling","publication_year":2018,"publication_date":"2018-10-31","ids":{"openalex":"https://openalex.org/W2900207013","doi":"https://doi.org/10.3233/mas-180447","mag":"2900207013"},"language":"en","primary_location":{"id":"doi:10.3233/mas-180447","is_oa":false,"landing_page_url":"https://doi.org/10.3233/mas-180447","pdf_url":null,"source":{"id":"https://openalex.org/S2765066696","display_name":"Model Assisted Statistics and Applications","issn_l":"1574-1699","issn":["1574-1699","1875-9068"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310318577","host_organization_name":"IOS Press","host_organization_lineage":["https://openalex.org/P4310318577"],"host_organization_lineage_names":["IOS Press"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Model Assisted Statistics and Applications","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5012742236","display_name":"Stephen Sloan","orcid":"https://orcid.org/0000-0002-4910-3487"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Stephen Sloan","raw_affiliation_strings":["Cream Ridge, NJ 08514, USA"],"affiliations":[{"raw_affiliation_string":"Cream Ridge, NJ 08514, USA","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5026255237","display_name":"Kirk Paul Lafler","orcid":null},"institutions":[{"id":"https://openalex.org/I2802826586","display_name":"Spring Valley Hospital","ror":"https://ror.org/03bh3se42","country_code":"US","type":"healthcare","lineage":["https://openalex.org/I2802826586"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Kirk Paul Lafler","raw_affiliation_strings":["Spring Valley, CA 91978, USA","E-mail:","Fax: +1 609 758 5240"],"affiliations":[{"raw_affiliation_string":"Spring Valley, CA 91978, USA","institution_ids":["https://openalex.org/I2802826586"]},{"raw_affiliation_string":"E-mail:","institution_ids":[]},{"raw_affiliation_string":"Fax: +1 609 758 5240","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":2,"corresponding_author_ids":["https://openalex.org/A5012742236"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.1629,"has_fulltext":false,"cited_by_count":3,"citation_normalized_percentile":{"value":0.60043929,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":89,"max":94},"biblio":{"volume":"13","issue":"4","first_page":"367","last_page":"375"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11269","display_name":"Algorithms and Data Compression","score":0.9979000091552734,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11269","display_name":"Algorithms and Data Compression","score":0.9979000091552734,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13820","display_name":"SAS software applications and methods","score":0.9908999800682068,"subfield":{"id":"https://openalex.org/subfields/2204","display_name":"Biomedical Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9879000186920166,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.828482985496521},{"id":"https://openalex.org/keywords/identifier","display_name":"Identifier","score":0.7502599954605103},{"id":"https://openalex.org/keywords/data-mining","display_name":"Data mining","score":0.6686879992485046},{"id":"https://openalex.org/keywords/merge","display_name":"Merge (version control)","score":0.6247490644454956},{"id":"https://openalex.org/keywords/software","display_name":"Software","score":0.5630701780319214},{"id":"https://openalex.org/keywords/matching","display_name":"Matching (statistics)","score":0.5319759249687195},{"id":"https://openalex.org/keywords/schema-matching","display_name":"Schema matching","score":0.45722758769989014},{"id":"https://openalex.org/keywords/identification","display_name":"Identification (biology)","score":0.4338954985141754},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.3800385594367981},{"id":"https://openalex.org/keywords/database","display_name":"Database","score":0.3659037947654724},{"id":"https://openalex.org/keywords/data-integration","display_name":"Data integration","score":0.247873455286026},{"id":"https://openalex.org/keywords/programming-language","display_name":"Programming language","score":0.14151909947395325}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.828482985496521},{"id":"https://openalex.org/C154504017","wikidata":"https://www.wikidata.org/wiki/Q853614","display_name":"Identifier","level":2,"score":0.7502599954605103},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.6686879992485046},{"id":"https://openalex.org/C197129107","wikidata":"https://www.wikidata.org/wiki/Q1921621","display_name":"Merge (version control)","level":2,"score":0.6247490644454956},{"id":"https://openalex.org/C2777904410","wikidata":"https://www.wikidata.org/wiki/Q7397","display_name":"Software","level":2,"score":0.5630701780319214},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.5319759249687195},{"id":"https://openalex.org/C2777327318","wikidata":"https://www.wikidata.org/wiki/Q1408390","display_name":"Schema matching","level":3,"score":0.45722758769989014},{"id":"https://openalex.org/C116834253","wikidata":"https://www.wikidata.org/wiki/Q2039217","display_name":"Identification (biology)","level":2,"score":0.4338954985141754},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.3800385594367981},{"id":"https://openalex.org/C77088390","wikidata":"https://www.wikidata.org/wiki/Q8513","display_name":"Database","level":1,"score":0.3659037947654724},{"id":"https://openalex.org/C72634772","wikidata":"https://www.wikidata.org/wiki/Q386824","display_name":"Data integration","level":2,"score":0.247873455286026},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.14151909947395325},{"id":"https://openalex.org/C86803240","wikidata":"https://www.wikidata.org/wiki/Q420","display_name":"Biology","level":0,"score":0.0},{"id":"https://openalex.org/C59822182","wikidata":"https://www.wikidata.org/wiki/Q441","display_name":"Botany","level":1,"score":0.0},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.0},{"id":"https://openalex.org/C105795698","wikidata":"https://www.wikidata.org/wiki/Q12483","display_name":"Statistics","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.3233/mas-180447","is_oa":false,"landing_page_url":"https://doi.org/10.3233/mas-180447","pdf_url":null,"source":{"id":"https://openalex.org/S2765066696","display_name":"Model Assisted Statistics and Applications","issn_l":"1574-1699","issn":["1574-1699","1875-9068"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310318577","host_organization_name":"IOS Press","host_organization_lineage":["https://openalex.org/P4310318577"],"host_organization_lineage_names":["IOS Press"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Model Assisted Statistics and Applications","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":2,"referenced_works":["https://openalex.org/W2023843553","https://openalex.org/W3124980638"],"related_works":["https://openalex.org/W4378651134","https://openalex.org/W1979633005","https://openalex.org/W2471940468","https://openalex.org/W147534183","https://openalex.org/W2352307597","https://openalex.org/W2163724607","https://openalex.org/W1980092392","https://openalex.org/W4226075635","https://openalex.org/W4318812462","https://openalex.org/W2118976126"],"abstract_inverted_index":{"Data":[0],"comes":[1],"in":[2,10,251],"all":[3,18],"forms,":[4],"shapes,":[5],"sizes":[6],"and":[7,12,25,29,58,64,115,128,146,172,185,194,199,205,208,217,237,241],"complexities.":[8],"Stored":[9],"files":[11,98,110,120],"data":[13,22,62,97,109,170,192,195,197,201,224],"sets,":[14],"SAS\u00ae":[15],"users":[16,90],"know":[17],"too":[19],"well":[20],"that":[21,93,273],"can":[23,99,266],"be,":[24],"often":[26],"is,":[27],"problematic":[28],"plagued":[30],"with":[31,113],"a":[32,60,105,164],"variety":[33],"of":[34,84,187,211,263],"issues.":[35],"Although":[36,246],"today\u2019s":[37],"statistical":[38,73,88,178],"software":[39,89,271],"programs":[40],"are":[41,45,77,91,254],"extremely":[42],"powerful,":[43],"they":[44],"typically":[46],"not":[47,122,261],"design":[48],"ed":[49],"to":[50,68,70,130,143,168,213,269],"overcome":[51],"poor":[52],"quality":[53],"data.":[54],"This":[55,161],"paper":[56,162,253],"describes":[57,163],"recommends":[59],"comprehensive":[61],"preparation":[63,171],"fuzzy":[65,173,243],"matching":[66,174,189,228,244],"process":[67],"follow":[69],"enable":[71],"improved":[72,177],"modeling.":[74,179],"Statistical":[75],"techniques":[76,212,249,265],"also":[78],"available":[79],"for":[80,176],"comparing":[81],"the":[82,85,108,144,183,234,238,247,264],"results":[83],"process.":[86],"Most":[87],"aware":[92],"two":[94],"or":[95,102,126,138,154,158,222],"more":[96],"be":[100,131,151,267],"joined,":[101],"combined,":[103],"without":[104],"problem":[106],"when":[107],"have":[111,123],"identifiers":[112,149],"unique":[114,124],"reliable":[116],"values.":[117],"However,":[118],"many":[119],"do":[121],"identifiers,":[125],"\u201ckeys\u201d,":[127],"need":[129],"joined":[132],"using":[133,229,256],"character":[134],"values,":[135],"like":[136,233],"names":[137],"E-mail":[139],"addresses.":[140],"To":[141],"add":[142],"difficulty":[145],"confusion,":[147],"these":[148],"might":[150],"spelled":[152],"differently,":[153],"use":[155],"different":[156],"abbreviation":[157],"capitalization":[159],"protocols.":[160],"versatile":[165],"6-step":[166],"approach":[167],"handling":[169],"issues":[175],"The":[180],"steps":[181],"include":[182],"identification":[184],"understanding":[186],"potential":[188],"scenarios;":[190],"exploring":[191],"values":[193],"types;":[196],"cleaning":[198],"validation;":[200],"transformation;":[202],"traditional":[203],"merge":[204],"join":[206,216],"techniques;":[207],"an":[209],"assortment":[210],"successfully":[214],"merge,":[215],"match":[218],"less":[219],"than":[220],"perfect,":[221],"\u201cmessy\u201d,":[223],"by":[225],"doing":[226],"phonetic":[227],"special-purpose":[230],"character-handling":[231,275],"functions":[232],"SOUNDEX":[235],"algorithm,":[236],"SPEDIS,":[239],"COMPLEV,":[240],"COMPGED":[242],"functions.":[245],"programming":[248],"described":[250],"this":[252],"illustrated":[255],"SAS":[257],"code,":[258],"many,":[259],"if":[260],"most,":[262],"applied":[268],"any":[270],"platform":[272],"supports":[274],"capabilities.":[276]},"counts_by_year":[{"year":2023,"cited_by_count":1},{"year":2022,"cited_by_count":1},{"year":2020,"cited_by_count":1}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
