{"id":"https://openalex.org/W2153180170","doi":"https://doi.org/10.1145/1014052.1014127","title":"Improved robustness of signature-based near-replica detection via lexicon randomization","display_name":"Improved robustness of signature-based near-replica detection via lexicon randomization","publication_year":2004,"publication_date":"2004-08-22","ids":{"openalex":"https://openalex.org/W2153180170","doi":"https://doi.org/10.1145/1014052.1014127","mag":"2153180170"},"language":"en","primary_location":{"id":"doi:10.1145/1014052.1014127","is_oa":false,"landing_page_url":"https://doi.org/10.1145/1014052.1014127","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the tenth ACM SIGKDD international conference on Knowledge discovery and data mining","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5111761879","display_name":"Aleksander Ko\u0142cz","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Aleksander Ko\u0142cz","raw_affiliation_strings":["AOL, Inc., Dulles, VA"],"affiliations":[{"raw_affiliation_string":"AOL, Inc., Dulles, VA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103951527","display_name":"Abdur Chowdhury","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Abdur Chowdhury","raw_affiliation_strings":["AOL, Inc., Dulles, VA"],"affiliations":[{"raw_affiliation_string":"AOL, Inc., Dulles, VA","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5110113289","display_name":"J. Alspector","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Joshua Alspector","raw_affiliation_strings":["AOL, Inc., Dulles, VA"],"affiliations":[{"raw_affiliation_string":"AOL, Inc., Dulles, VA","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5111761879"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":4.5043,"has_fulltext":false,"cited_by_count":71,"citation_normalized_percentile":{"value":0.94463558,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":90,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"605","last_page":"610"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11719","display_name":"Data Quality and Management","score":0.9987000226974487,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T11719","display_name":"Data Quality and Management","score":0.9987000226974487,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9980000257492065,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11550","display_name":"Text and Document Classification Technologies","score":0.9937000274658203,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8186094760894775},{"id":"https://openalex.org/keywords/robustness","display_name":"Robustness (evolution)","score":0.7420312166213989},{"id":"https://openalex.org/keywords/replica","display_name":"Replica","score":0.6145340800285339},{"id":"https://openalex.org/keywords/lexicon","display_name":"Lexicon","score":0.5343915820121765},{"id":"https://openalex.org/keywords/data-mining","display_name":"Data mining","score":0.5215861201286316},{"id":"https://openalex.org/keywords/offset","display_name":"Offset (computer science)","score":0.4277162551879883},{"id":"https://openalex.org/keywords/computation","display_name":"Computation","score":0.41339805722236633},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.39857399463653564},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.3857155740261078},{"id":"https://openalex.org/keywords/algorithm","display_name":"Algorithm","score":0.2509160041809082}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8186094760894775},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.7420312166213989},{"id":"https://openalex.org/C2775937380","wikidata":"https://www.wikidata.org/wiki/Q1232589","display_name":"Replica","level":2,"score":0.6145340800285339},{"id":"https://openalex.org/C2778121359","wikidata":"https://www.wikidata.org/wiki/Q8096","display_name":"Lexicon","level":2,"score":0.5343915820121765},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.5215861201286316},{"id":"https://openalex.org/C175291020","wikidata":"https://www.wikidata.org/wiki/Q1156822","display_name":"Offset (computer science)","level":2,"score":0.4277162551879883},{"id":"https://openalex.org/C45374587","wikidata":"https://www.wikidata.org/wiki/Q12525525","display_name":"Computation","level":2,"score":0.41339805722236633},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.39857399463653564},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.3857155740261078},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.2509160041809082},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.0},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.0},{"id":"https://openalex.org/C142362112","wikidata":"https://www.wikidata.org/wiki/Q735","display_name":"Art","level":0,"score":0.0},{"id":"https://openalex.org/C55493867","wikidata":"https://www.wikidata.org/wiki/Q7094","display_name":"Biochemistry","level":1,"score":0.0},{"id":"https://openalex.org/C104317684","wikidata":"https://www.wikidata.org/wiki/Q7187","display_name":"Gene","level":2,"score":0.0},{"id":"https://openalex.org/C153349607","wikidata":"https://www.wikidata.org/wiki/Q36649","display_name":"Visual arts","level":1,"score":0.0}],"mesh":[],"locations_count":3,"locations":[{"id":"doi:10.1145/1014052.1014127","is_oa":false,"landing_page_url":"https://doi.org/10.1145/1014052.1014127","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the tenth ACM SIGKDD international conference on Knowledge discovery and data mining","raw_type":"proceedings-article"},{"id":"pmh:oai:CiteSeerX.psu:10.1.1.134.4842","is_oa":false,"landing_page_url":"http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.134.4842","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"http://ir.iit.edu/~abdur/publications/470-kolcz.pdf","raw_type":"text"},{"id":"pmh:oai:CiteSeerX.psu:10.1.1.68.606","is_oa":false,"landing_page_url":"http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.68.606","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"http://ir.iit.edu/~alek/470-kolcz2.pdf","raw_type":"text"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":23,"referenced_works":["https://openalex.org/W192724328","https://openalex.org/W1481258775","https://openalex.org/W1502916507","https://openalex.org/W1509991416","https://openalex.org/W1515671861","https://openalex.org/W1536700701","https://openalex.org/W1941707302","https://openalex.org/W1995173295","https://openalex.org/W2007842132","https://openalex.org/W2048791264","https://openalex.org/W2067432306","https://openalex.org/W2074297512","https://openalex.org/W2111549955","https://openalex.org/W2118131693","https://openalex.org/W2132069633","https://openalex.org/W2132072504","https://openalex.org/W2139532006","https://openalex.org/W2152565070","https://openalex.org/W2164634022","https://openalex.org/W4285719527","https://openalex.org/W6607776381","https://openalex.org/W6629956336","https://openalex.org/W6630516739"],"related_works":["https://openalex.org/W3013979739","https://openalex.org/W2655578171","https://openalex.org/W2577913821","https://openalex.org/W2460131733","https://openalex.org/W2953070151","https://openalex.org/W4296976839","https://openalex.org/W2372946558","https://openalex.org/W3126044086","https://openalex.org/W4394646256","https://openalex.org/W4236952075"],"abstract_inverted_index":{"Detection":[0],"of":[1,22,93],"near":[2],"duplicate":[3,25],"documents":[4],"is":[5,112],"an":[6],"important":[7],"problem":[8],"in":[9,123,134,143],"many":[10],"data":[11],"mining":[12],"and":[13,46,88,106],"information":[14],"filtering":[15],"applications.":[16],"When":[17],"faced":[18],"with":[19,67,103,119],"massive":[20],"quantities":[21],"data,":[23],"traditional":[24,117],"detection":[26,81,135],"techniques":[27],"relying":[28],"on":[29,77],"direct":[30],"inter-document":[31],"similarity":[32],"computation":[33],"(e.g.,":[34],"using":[35],"the":[36,44,51,109,120],"cosine":[37],"measure)":[38],"are":[39,59,83,137],"often":[40],"not":[41],"feasible":[42],"given":[43],"time":[45],"memory":[47],"performance":[48],"constraints.":[49],"On":[50],"other":[52],"hand,":[53],"fingerprint-based":[54],"methods,":[55],"such":[56],"as":[57,127,129],"I-Match,":[58,118],"very":[60],"attractive":[61],"computationally":[62],"but":[63],"may":[64],"be":[65],"brittle":[66],"respect":[68],"to":[69,72,79,114],"small":[70,141],"changes":[71],"document":[73],"content.":[74],"We":[75],"focus":[76],"approaches":[78],"near-replica":[80],"that":[82],"based":[84],"upon":[85],"large-collection":[86],"statistics":[87],"present":[89],"a":[90],"general":[91],"technique":[92],"increasing":[94],"their":[95],"robustness":[96],"via":[97],"multiple":[98],"lexicon":[99],"randomization.":[100],"In":[101],"experiments":[102],"large":[104,132],"web-page":[105],"spam-email":[107],"datasets":[108],"proposed":[110],"method":[111],"shown":[113],"consistently":[115],"outperform":[116],"relative":[121],"improvement":[122],"duplicate-document":[124],"recall":[125],"reaching":[126],"high":[128],"40-60%.":[130],"The":[131],"gains":[133],"accuracy":[136],"offset":[138],"by":[139],"only":[140],"increases":[142],"computational":[144],"requirements.":[145]},"counts_by_year":[{"year":2023,"cited_by_count":1},{"year":2020,"cited_by_count":2},{"year":2019,"cited_by_count":6},{"year":2018,"cited_by_count":1},{"year":2017,"cited_by_count":2},{"year":2016,"cited_by_count":1},{"year":2015,"cited_by_count":2},{"year":2014,"cited_by_count":5},{"year":2013,"cited_by_count":7},{"year":2012,"cited_by_count":8}],"updated_date":"2026-04-04T16:13:02.066488","created_date":"2025-10-10T00:00:00"}
