{"id":"https://openalex.org/W2053107662","doi":"https://doi.org/10.1145/2063576.2063647","title":"Partial duplicate detection for large book collections","display_name":"Partial duplicate detection for large book collections","publication_year":2011,"publication_date":"2011-10-24","ids":{"openalex":"https://openalex.org/W2053107662","doi":"https://doi.org/10.1145/2063576.2063647","mag":"2053107662"},"language":"en","primary_location":{"id":"doi:10.1145/2063576.2063647","is_oa":false,"landing_page_url":"https://doi.org/10.1145/2063576.2063647","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 20th ACM international conference on Information and knowledge management","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5078197885","display_name":"\u0130smet Zeki Yaln\u0131z","orcid":null},"institutions":[{"id":"https://openalex.org/I24603500","display_name":"University of Massachusetts Amherst","ror":"https://ror.org/0072zz521","country_code":"US","type":"education","lineage":["https://openalex.org/I24603500"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Ismet Zeki Yalniz","raw_affiliation_strings":["University of Massachusetts-Amherst, Amherst, MA, USA","University Of Massachusetts Amherst, Amherst, MA, USA#TAB#"],"affiliations":[{"raw_affiliation_string":"University of Massachusetts-Amherst, Amherst, MA, USA","institution_ids":["https://openalex.org/I24603500"]},{"raw_affiliation_string":"University Of Massachusetts Amherst, Amherst, MA, USA#TAB#","institution_ids":["https://openalex.org/I24603500"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5003287631","display_name":"Ethem F. Can","orcid":null},"institutions":[{"id":"https://openalex.org/I24603500","display_name":"University of Massachusetts Amherst","ror":"https://ror.org/0072zz521","country_code":"US","type":"education","lineage":["https://openalex.org/I24603500"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Ethem F. Can","raw_affiliation_strings":["University of Massachusetts-Amherst, Amherst, MA, USA","University Of Massachusetts Amherst, Amherst, MA, USA#TAB#"],"affiliations":[{"raw_affiliation_string":"University of Massachusetts-Amherst, Amherst, MA, USA","institution_ids":["https://openalex.org/I24603500"]},{"raw_affiliation_string":"University Of Massachusetts Amherst, Amherst, MA, USA#TAB#","institution_ids":["https://openalex.org/I24603500"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5055459215","display_name":"R. Manmatha","orcid":"https://orcid.org/0000-0003-2315-8583"},"institutions":[{"id":"https://openalex.org/I24603500","display_name":"University of Massachusetts Amherst","ror":"https://ror.org/0072zz521","country_code":"US","type":"education","lineage":["https://openalex.org/I24603500"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"R. Manmatha","raw_affiliation_strings":["University of Massachusetts-Amherst, Amherst, MA, USA","University Of Massachusetts Amherst, Amherst, MA, USA#TAB#"],"affiliations":[{"raw_affiliation_string":"University of Massachusetts-Amherst, Amherst, MA, USA","institution_ids":["https://openalex.org/I24603500"]},{"raw_affiliation_string":"University Of Massachusetts Amherst, Amherst, MA, USA#TAB#","institution_ids":["https://openalex.org/I24603500"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5078197885"],"corresponding_institution_ids":["https://openalex.org/I24603500"],"apc_list":null,"apc_paid":null,"fwci":3.4007,"has_fulltext":false,"cited_by_count":26,"citation_normalized_percentile":{"value":0.93736341,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":90,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"469","last_page":"474"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10601","display_name":"Handwritten Text Recognition Techniques","score":0.9995999932289124,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10601","display_name":"Handwritten Text Recognition Techniques","score":0.9995999932289124,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12357","display_name":"Digital Media Forensic Detection","score":0.9940999746322632,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12016","display_name":"Web Data Mining and Analysis","score":0.9782999753952026,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7808430194854736},{"id":"https://openalex.org/keywords/subsequence","display_name":"Subsequence","score":0.6403782367706299},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.6309192180633545},{"id":"https://openalex.org/keywords/set","display_name":"Set (abstract data type)","score":0.6102685928344727},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.5892294049263},{"id":"https://openalex.org/keywords/sequence","display_name":"Sequence (biology)","score":0.5525583028793335},{"id":"https://openalex.org/keywords/precision-and-recall","display_name":"Precision and recall","score":0.5214490294456482},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.5209011435508728},{"id":"https://openalex.org/keywords/recall","display_name":"Recall","score":0.501305103302002},{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.4929063320159912},{"id":"https://openalex.org/keywords/optical-character-recognition","display_name":"Optical character recognition","score":0.48956069350242615},{"id":"https://openalex.org/keywords/character","display_name":"Character (mathematics)","score":0.4189334511756897},{"id":"https://openalex.org/keywords/linguistics","display_name":"Linguistics","score":0.13597577810287476},{"id":"https://openalex.org/keywords/image","display_name":"Image (mathematics)","score":0.12511807680130005},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.10294947028160095},{"id":"https://openalex.org/keywords/programming-language","display_name":"Programming language","score":0.07758322358131409}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7808430194854736},{"id":"https://openalex.org/C137877099","wikidata":"https://www.wikidata.org/wiki/Q1332977","display_name":"Subsequence","level":3,"score":0.6403782367706299},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.6309192180633545},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.6102685928344727},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.5892294049263},{"id":"https://openalex.org/C2778112365","wikidata":"https://www.wikidata.org/wiki/Q3511065","display_name":"Sequence (biology)","level":2,"score":0.5525583028793335},{"id":"https://openalex.org/C81669768","wikidata":"https://www.wikidata.org/wiki/Q2359161","display_name":"Precision and recall","level":2,"score":0.5214490294456482},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5209011435508728},{"id":"https://openalex.org/C100660578","wikidata":"https://www.wikidata.org/wiki/Q18733","display_name":"Recall","level":2,"score":0.501305103302002},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.4929063320159912},{"id":"https://openalex.org/C546480517","wikidata":"https://www.wikidata.org/wiki/Q167555","display_name":"Optical character recognition","level":3,"score":0.48956069350242615},{"id":"https://openalex.org/C2780861071","wikidata":"https://www.wikidata.org/wiki/Q1062934","display_name":"Character (mathematics)","level":2,"score":0.4189334511756897},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.13597577810287476},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.12511807680130005},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.10294947028160095},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.07758322358131409},{"id":"https://openalex.org/C34388435","wikidata":"https://www.wikidata.org/wiki/Q2267362","display_name":"Bounded function","level":2,"score":0.0},{"id":"https://openalex.org/C54355233","wikidata":"https://www.wikidata.org/wiki/Q7162","display_name":"Genetics","level":1,"score":0.0},{"id":"https://openalex.org/C94625758","wikidata":"https://www.wikidata.org/wiki/Q7163","display_name":"Politics","level":2,"score":0.0},{"id":"https://openalex.org/C199539241","wikidata":"https://www.wikidata.org/wiki/Q7748","display_name":"Law","level":1,"score":0.0},{"id":"https://openalex.org/C2524010","wikidata":"https://www.wikidata.org/wiki/Q8087","display_name":"Geometry","level":1,"score":0.0},{"id":"https://openalex.org/C86803240","wikidata":"https://www.wikidata.org/wiki/Q420","display_name":"Biology","level":0,"score":0.0},{"id":"https://openalex.org/C17744445","wikidata":"https://www.wikidata.org/wiki/Q36442","display_name":"Political science","level":0,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C134306372","wikidata":"https://www.wikidata.org/wiki/Q7754","display_name":"Mathematical analysis","level":1,"score":0.0}],"mesh":[],"locations_count":4,"locations":[{"id":"doi:10.1145/2063576.2063647","is_oa":false,"landing_page_url":"https://doi.org/10.1145/2063576.2063647","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 20th ACM international conference on Information and knowledge management","raw_type":"proceedings-article"},{"id":"pmh:oai:CiteSeerX.psu:10.1.1.366.6942","is_oa":false,"landing_page_url":"http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.366.6942","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"http://ciir-publications.cs.umass.edu/pub/web/getpdf.php?id=970","raw_type":"text"},{"id":"pmh:oai:CiteSeerX.psu:10.1.1.469.1428","is_oa":false,"landing_page_url":"http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.469.1428","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"http://people.cs.umass.edu/~zeki/pubs/yalniz_et_al_CIKM_11.pdf","raw_type":"text"},{"id":"pmh:oai:works.bepress.com:r_manmatha-1081","is_oa":false,"landing_page_url":"https://works.bepress.com/r_manmatha/41","pdf_url":null,"source":{"id":"https://openalex.org/S4306402240","display_name":"ScholarWorks@UMassAmherst (University of Massachusetts Amherst)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I24603500","host_organization_name":"University of Massachusetts Amherst","host_organization_lineage":["https://openalex.org/I24603500"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"R. Manmatha","raw_type":"text"}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Quality Education","score":0.7699999809265137,"id":"https://metadata.un.org/sdg/4"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":26,"referenced_works":["https://openalex.org/W1509213251","https://openalex.org/W1536700701","https://openalex.org/W1596691921","https://openalex.org/W1609518033","https://openalex.org/W1647729745","https://openalex.org/W1971282369","https://openalex.org/W1998161216","https://openalex.org/W2005898500","https://openalex.org/W2007842132","https://openalex.org/W2012833704","https://openalex.org/W2027503915","https://openalex.org/W2027506564","https://openalex.org/W2067432306","https://openalex.org/W2068374154","https://openalex.org/W2085922539","https://openalex.org/W2110166424","https://openalex.org/W2111295912","https://openalex.org/W2127363101","https://openalex.org/W2129070169","https://openalex.org/W2148578434","https://openalex.org/W2149182189","https://openalex.org/W2149551320","https://openalex.org/W2152565070","https://openalex.org/W2158331042","https://openalex.org/W2164634022","https://openalex.org/W4285719527"],"related_works":["https://openalex.org/W2132372003","https://openalex.org/W83344948","https://openalex.org/W2042129940","https://openalex.org/W2536723159","https://openalex.org/W2041190900","https://openalex.org/W2001121861","https://openalex.org/W2897178939","https://openalex.org/W2886703719","https://openalex.org/W2140538336","https://openalex.org/W2282329238"],"abstract_inverted_index":{"A":[0],"framework":[1],"is":[2,25,84,129,142,187],"presented":[3],"for":[4,135,189],"discovering":[5],"partial":[6,154],"duplicates":[7,155],"in":[8,22,37,44,65,95,156],"large":[9],"collections":[10],"of":[11,30,61,76,87,93,102,147],"scanned":[12,149],"books":[13,107,119,151],"with":[14,70,172],"optical":[15],"character":[16],"recognition":[17],"(OCR)":[18],"errors.":[19],"Each":[20],"book":[21],"the":[23,28,33,38,45,63,71,74,88,91,96,100,109],"collection":[24,146],"represented":[26],"by":[27],"sequence":[29,101],"words":[31,48,64,78,104],"(in":[32],"order":[34,72],"they":[35,56],"appear":[36,41],"text)":[39],"which":[40,83],"only":[42],"once":[43],"book.":[46,68,97],"These":[47],"are":[49,120],"referred":[50],"to":[51,170],"as":[52,139,184],"\"unique":[53],"words\"":[54],"and":[55,90,141,162,166,175,186],"constitute":[57],"a":[58,66,80,145,190],"small":[59],"percentage":[60],"all":[62],"typical":[67],"Along":[69],"information":[73],"set":[75],"unique":[77,103],"provides":[79],"compact":[81],"representation":[82],"highly":[85],"descriptive":[86],"content":[89],"flow":[92],"ideas":[94],"By":[98],"aligning":[99],"from":[105],"two":[106,118],"using":[108,159],"longest":[110],"common":[111],"subsequence":[112],"(LCS)":[113],"one":[114],"can":[115],"discover":[116],"whether":[117],"duplicates.":[121],"Experiments":[122],"on":[123,181],"several":[124],"datasets":[125],"show":[126],"that":[127],"DUPNIQ":[128,152],"more":[130],"accurate":[131],"than":[132],"traditional":[133],"methods":[134],"duplicate":[136],"detection":[137],"such":[138],"shingling":[140,171],"fast.":[143],"On":[144],"100K":[148],"English":[150],"detects":[153],"30":[157],"min":[158],"350":[160],"cores":[161],"has":[163],"precision":[164,173],"0.996":[165],"recall":[167,176],"0.833":[168],"compared":[169],"0.992":[174],"0.720.":[177],"The":[178],"technique":[179],"works":[180],"other":[182],"languages":[183],"well":[185],"demonstrated":[188],"French":[191],"dataset.":[192]},"counts_by_year":[{"year":2025,"cited_by_count":1},{"year":2021,"cited_by_count":2},{"year":2019,"cited_by_count":2},{"year":2018,"cited_by_count":2},{"year":2017,"cited_by_count":2},{"year":2015,"cited_by_count":4},{"year":2014,"cited_by_count":6},{"year":2013,"cited_by_count":3},{"year":2012,"cited_by_count":1}],"updated_date":"2026-04-05T17:49:38.594831","created_date":"2025-10-10T00:00:00"}
