{"id":"https://openalex.org/W3177965705","doi":"https://doi.org/10.1145/3451471.3451489","title":"Analysis of Clustering Algorithms to Clean and Normalize Early Modern European Book Titles","display_name":"Analysis of Clustering Algorithms to Clean and Normalize Early Modern European Book Titles","publication_year":2021,"publication_date":"2021-01-16","ids":{"openalex":"https://openalex.org/W3177965705","doi":"https://doi.org/10.1145/3451471.3451489","mag":"3177965705"},"language":"en","primary_location":{"id":"doi:10.1145/3451471.3451489","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3451471.3451489","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2021 The 4th International Conference on Software Engineering and Information Management","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5020684488","display_name":"Evan Bryer","orcid":null},"institutions":[{"id":"https://openalex.org/I155781252","display_name":"University of South Carolina","ror":"https://ror.org/02b6qw903","country_code":"US","type":"education","lineage":["https://openalex.org/I155781252"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Evan Bryer","raw_affiliation_strings":["University of South Carolina, United States"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"University of South Carolina, United States","institution_ids":["https://openalex.org/I155781252"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5069165687","display_name":"Theppatorn Rhujittawiwat","orcid":"https://orcid.org/0009-0004-5800-4554"},"institutions":[{"id":"https://openalex.org/I155781252","display_name":"University of South Carolina","ror":"https://ror.org/02b6qw903","country_code":"US","type":"education","lineage":["https://openalex.org/I155781252"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Theppatorn Rhujittawiwat","raw_affiliation_strings":["University of South Carolina, United States"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"University of South Carolina, United States","institution_ids":["https://openalex.org/I155781252"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5062232588","display_name":"Samyu Comandur","orcid":null},"institutions":[{"id":"https://openalex.org/I155781252","display_name":"University of South Carolina","ror":"https://ror.org/02b6qw903","country_code":"US","type":"education","lineage":["https://openalex.org/I155781252"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Samyu Comandur","raw_affiliation_strings":["University of South Carolina, United States"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"University of South Carolina, United States","institution_ids":["https://openalex.org/I155781252"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5067928808","display_name":"Vasco Madrid","orcid":null},"institutions":[{"id":"https://openalex.org/I155781252","display_name":"University of South Carolina","ror":"https://ror.org/02b6qw903","country_code":"US","type":"education","lineage":["https://openalex.org/I155781252"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Vasco Madrid","raw_affiliation_strings":["University of South Carolina, United States"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"University of South Carolina, United States","institution_ids":["https://openalex.org/I155781252"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5037142237","display_name":"S. A. Riley","orcid":null},"institutions":[{"id":"https://openalex.org/I155781252","display_name":"University of South Carolina","ror":"https://ror.org/02b6qw903","country_code":"US","type":"education","lineage":["https://openalex.org/I155781252"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Stephanie Riley","raw_affiliation_strings":["University of South Carolina, United States"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"University of South Carolina, United States","institution_ids":["https://openalex.org/I155781252"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5112054359","display_name":"John R. Rose","orcid":"https://orcid.org/0000-0001-7600-7215"},"institutions":[{"id":"https://openalex.org/I155781252","display_name":"University of South Carolina","ror":"https://ror.org/02b6qw903","country_code":"US","type":"education","lineage":["https://openalex.org/I155781252"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"John Rose","raw_affiliation_strings":["University of South Carolina, United States"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"University of South Carolina, United States","institution_ids":["https://openalex.org/I155781252"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5009754568","display_name":"Colin Wilder","orcid":null},"institutions":[{"id":"https://openalex.org/I155781252","display_name":"University of South Carolina","ror":"https://ror.org/02b6qw903","country_code":"US","type":"education","lineage":["https://openalex.org/I155781252"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Colin Wilder","raw_affiliation_strings":["University of South Carolina, United States"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"University of South Carolina, United States","institution_ids":["https://openalex.org/I155781252"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5020684488"],"corresponding_institution_ids":["https://openalex.org/I155781252"],"apc_list":null,"apc_paid":null,"fwci":0.5373,"has_fulltext":false,"cited_by_count":4,"citation_normalized_percentile":{"value":0.68762855,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":90,"max":98},"biblio":{"volume":null,"issue":null,"first_page":"106","last_page":"112"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11719","display_name":"Data Quality and Management","score":0.9959999918937683,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T11719","display_name":"Data Quality and Management","score":0.9959999918937683,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T11269","display_name":"Algorithms and Data Compression","score":0.989799976348877,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10601","display_name":"Handwritten Text Recognition Techniques","score":0.9495999813079834,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/cluster-analysis","display_name":"Cluster analysis","score":0.7967958450317383},{"id":"https://openalex.org/keywords/consistency","display_name":"Consistency (knowledge bases)","score":0.7261101007461548},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7250655889511108},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.6017245054244995},{"id":"https://openalex.org/keywords/cataloging","display_name":"Cataloging","score":0.5268594026565552},{"id":"https://openalex.org/keywords/table","display_name":"Table (database)","score":0.5007684230804443},{"id":"https://openalex.org/keywords/data-mining","display_name":"Data mining","score":0.4241235852241516},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.22041112184524536},{"id":"https://openalex.org/keywords/library-science","display_name":"Library science","score":0.19478237628936768}],"concepts":[{"id":"https://openalex.org/C73555534","wikidata":"https://www.wikidata.org/wiki/Q622825","display_name":"Cluster analysis","level":2,"score":0.7967958450317383},{"id":"https://openalex.org/C2776436953","wikidata":"https://www.wikidata.org/wiki/Q5163215","display_name":"Consistency (knowledge bases)","level":2,"score":0.7261101007461548},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7250655889511108},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.6017245054244995},{"id":"https://openalex.org/C2777922441","wikidata":"https://www.wikidata.org/wiki/Q3535655","display_name":"Cataloging","level":2,"score":0.5268594026565552},{"id":"https://openalex.org/C45235069","wikidata":"https://www.wikidata.org/wiki/Q278425","display_name":"Table (database)","level":2,"score":0.5007684230804443},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.4241235852241516},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.22041112184524536},{"id":"https://openalex.org/C161191863","wikidata":"https://www.wikidata.org/wiki/Q199655","display_name":"Library science","level":1,"score":0.19478237628936768}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3451471.3451489","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3451471.3451489","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2021 The 4th International Conference on Software Engineering and Information Management","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"score":0.550000011920929,"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":13,"referenced_works":["https://openalex.org/W1991324888","https://openalex.org/W2009289067","https://openalex.org/W2043465293","https://openalex.org/W2058339485","https://openalex.org/W2062397881","https://openalex.org/W2101694047","https://openalex.org/W2142110756","https://openalex.org/W2513013314","https://openalex.org/W2574604241","https://openalex.org/W2600966912","https://openalex.org/W2946592782","https://openalex.org/W2946921280","https://openalex.org/W2981727907"],"related_works":["https://openalex.org/W2011236053","https://openalex.org/W2349695282","https://openalex.org/W2079670513","https://openalex.org/W2380481257","https://openalex.org/W2082220406","https://openalex.org/W2035691235","https://openalex.org/W2139226347","https://openalex.org/W2069391836","https://openalex.org/W4254603770","https://openalex.org/W2362822043"],"abstract_inverted_index":{"In":[0],"this":[1,174,182,307],"paper,":[2],"we":[3,43,176,302],"identify":[4],"the":[5,13,61,80,91,104,111,119,131,143,147,150,194,197,209,216,245,250,269,278,300],"most":[6],"accurate":[7],"method":[8,206],"of":[9,25,31,50,121,149,155,196],"clustering":[10,179,202,227,286,295],"to":[11,146,163,168,180,244,292],"deduplicate":[12,181],"past":[14,112],"centuries":[15,113],"book":[16,75,93],"records":[17,33,49,157,213,260],"from":[18,67],"multiple":[19],"libraries":[20,69],"for":[21,207],"data":[22,39,65,116,178],"analysis":[23],"out":[24],"five":[26],"common":[27],"algorithms.":[28],"The":[29,41,233,253,294],"presence":[30],"duplicate":[32,156],"is":[34,161,186,203],"a":[35,86,97,122,136,165,230,240],"major":[36],"concern":[37],"in":[38,53,60,70,89,96,106,212,249,261,306],"analysis.":[40],"dataset":[42,137,151,235,242,255,301],"studied":[44,303],"contains":[45,256],"over":[46,110],"5":[47],"million":[48],"books":[51],"published":[52],"European":[54,191],"languages":[55],"between":[56,101],"1500":[57],"and":[58,108,124,152,193,265,273],"1800":[59],"Machine-Readable":[62],"Cataloging":[63],"(MARC)":[64],"format":[66],"17,983":[68],"123":[71],"countries.":[72],"However,":[73],"each":[74,170],"record":[76,275],"was":[77,94,222,236],"archived":[78,95],"by":[79,188,215,238],"library":[81],"owning":[82],"it.":[83],"This":[84],"creates":[85],"consistency":[87,117],"problem":[88],"which":[90],"same":[92,132,246,270],"slightly":[98,127],"different":[99,128],"way":[100],"libraries.":[102],"Moreover,":[103],"change":[105],"geography":[107],"language":[109,271],"also":[114],"affects":[115],"regarding":[118],"name":[120],"person":[123],"place.":[125],"Many":[126],"names":[129],"represent":[130],"record.":[133,171],"Analyzing":[134],"such":[135],"without":[138],"proper":[139],"cleaning":[140],"will":[141],"misrepresent":[142],"result.":[144],"Due":[145],"size":[148],"unknown":[153],"number":[154],"with":[158,224,267],"variation,":[159],"it":[160],"impractical":[162],"create":[164],"lookup":[166],"table":[167],"replace":[169],"To":[172],"solve":[173],"problem,":[175],"use":[177],"dataset.":[183,232,252,280],"Our":[184,220,281],"work":[185],"informed":[187],"scholarship":[189],"on":[190,229,299],"History":[192,195],"Book.":[198],"We":[199],"find":[200],"that":[201,284],"an":[204],"effective":[205],"detecting":[208],"slight":[210],"differences":[211],"caused":[214],"above-mentioned":[217],"cataloging":[218],"inconsistencies.":[219],"foundation":[221],"experimentation":[223],"several":[225],"candidate":[226],"methods":[228],"test":[231,234],"prepared":[237],"corrupting":[239],"clean":[241,254],"according":[243],"characteristics":[247],"found":[248],"whole":[251,279],"roughly":[257],"1,000":[258],"random":[259],"English,":[262],"German,":[263],"French,":[264],"Latin":[266],"approximately":[268],"distribution":[272],"average":[274],"lengths":[276],"as":[277,304],"evaluation":[282],"reveals":[283],"some":[285],"algorithms":[287],"can":[288],"achieve":[289],"accuracy":[290],"up":[291],"0.97072.":[293],"techniques":[296],"perform":[297],"well":[298],"demonstrated":[305],"paper.":[308]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2022,"cited_by_count":1},{"year":2021,"cited_by_count":2}],"updated_date":"2026-05-08T15:41:06.802602","created_date":"2025-10-10T00:00:00"}
