{"id":"https://openalex.org/W2013761541","doi":"https://doi.org/10.1145/1462148.1462151","title":"Do not crawl in the DUST","display_name":"Do not crawl in the DUST","publication_year":2009,"publication_date":"2009-01-01","ids":{"openalex":"https://openalex.org/W2013761541","doi":"https://doi.org/10.1145/1462148.1462151","mag":"2013761541"},"language":"en","primary_location":{"id":"doi:10.1145/1462148.1462151","is_oa":false,"landing_page_url":"https://doi.org/10.1145/1462148.1462151","pdf_url":null,"source":{"id":"https://openalex.org/S131231701","display_name":"ACM Transactions on the Web","issn_l":"1559-1131","issn":["1559-1131","1559-114X"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ACM Transactions on the Web","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5054834255","display_name":"Ziv Bar-Yossef","orcid":null},"institutions":[{"id":"https://openalex.org/I174306211","display_name":"Technion \u2013 Israel Institute of Technology","ror":"https://ror.org/03qryx823","country_code":"IL","type":"education","lineage":["https://openalex.org/I174306211"]}],"countries":["IL"],"is_corresponding":true,"raw_author_name":"Ziv Bar-Yossef","raw_affiliation_strings":["Technion Israel Institute of Technology, Haifa, Israel"],"affiliations":[{"raw_affiliation_string":"Technion Israel Institute of Technology, Haifa, Israel","institution_ids":["https://openalex.org/I174306211"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5059238786","display_name":"Idit Keidar","orcid":"https://orcid.org/0000-0002-6417-1250"},"institutions":[{"id":"https://openalex.org/I174306211","display_name":"Technion \u2013 Israel Institute of Technology","ror":"https://ror.org/03qryx823","country_code":"IL","type":"education","lineage":["https://openalex.org/I174306211"]}],"countries":["IL"],"is_corresponding":false,"raw_author_name":"Idit Keidar","raw_affiliation_strings":["Technion Israel Institute of Technology, Haifa, Israel"],"affiliations":[{"raw_affiliation_string":"Technion Israel Institute of Technology, Haifa, Israel","institution_ids":["https://openalex.org/I174306211"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5052102699","display_name":"Uri Schonfeld","orcid":null},"institutions":[{"id":"https://openalex.org/I161318765","display_name":"University of California, Los Angeles","ror":"https://ror.org/046rm7j60","country_code":"US","type":"education","lineage":["https://openalex.org/I161318765"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Uri Schonfeld","raw_affiliation_strings":["University of California Los Angeles, CA","University of California, Los Angeles. CA"],"affiliations":[{"raw_affiliation_string":"University of California Los Angeles, CA","institution_ids":["https://openalex.org/I161318765"]},{"raw_affiliation_string":"University of California, Los Angeles. CA","institution_ids":["https://openalex.org/I161318765"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5054834255"],"corresponding_institution_ids":["https://openalex.org/I174306211"],"apc_list":null,"apc_paid":null,"fwci":3.7015,"has_fulltext":false,"cited_by_count":36,"citation_normalized_percentile":{"value":0.93943458,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":93,"max":98},"biblio":{"volume":"3","issue":"1","first_page":"1","last_page":"31"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12016","display_name":"Web Data Mining and Analysis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12016","display_name":"Web Data Mining and Analysis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11478","display_name":"Caching and Content Delivery","score":0.998199999332428,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10064","display_name":"Complex Network Analysis Techniques","score":0.9908000230789185,"subfield":{"id":"https://openalex.org/subfields/3109","display_name":"Statistical and Nonlinear Physics"},"field":{"id":"https://openalex.org/fields/31","display_name":"Physics and Astronomy"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/crawling","display_name":"Crawling","score":0.8373327255249023},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7806118726730347},{"id":"https://openalex.org/keywords/web-crawler","display_name":"Web crawler","score":0.7514499425888062},{"id":"https://openalex.org/keywords/pagerank","display_name":"PageRank","score":0.7387464046478271},{"id":"https://openalex.org/keywords/search-engine-indexing","display_name":"Search engine indexing","score":0.6526644229888916},{"id":"https://openalex.org/keywords/world-wide-web","display_name":"World Wide Web","score":0.5676099061965942},{"id":"https://openalex.org/keywords/web-page","display_name":"Web page","score":0.539401650428772},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.5361551642417908},{"id":"https://openalex.org/keywords/overhead","display_name":"Overhead (engineering)","score":0.5298687815666199},{"id":"https://openalex.org/keywords/popularity","display_name":"Popularity","score":0.5085400342941284},{"id":"https://openalex.org/keywords/sampling","display_name":"Sampling (signal processing)","score":0.47980859875679016},{"id":"https://openalex.org/keywords/search-engine","display_name":"Search engine","score":0.47245195508003235},{"id":"https://openalex.org/keywords/web-server","display_name":"Web server","score":0.4670623242855072},{"id":"https://openalex.org/keywords/web-search-engine","display_name":"Web search engine","score":0.41614794731140137},{"id":"https://openalex.org/keywords/database","display_name":"Database","score":0.3762730360031128},{"id":"https://openalex.org/keywords/the-internet","display_name":"The Internet","score":0.21071270108222961},{"id":"https://openalex.org/keywords/operating-system","display_name":"Operating system","score":0.1439831256866455},{"id":"https://openalex.org/keywords/web-navigation","display_name":"Web navigation","score":0.14113396406173706},{"id":"https://openalex.org/keywords/filter","display_name":"Filter (signal processing)","score":0.12402895092964172}],"concepts":[{"id":"https://openalex.org/C100368936","wikidata":"https://www.wikidata.org/wiki/Q1411725","display_name":"Crawling","level":2,"score":0.8373327255249023},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7806118726730347},{"id":"https://openalex.org/C13743948","wikidata":"https://www.wikidata.org/wiki/Q45842","display_name":"Web crawler","level":2,"score":0.7514499425888062},{"id":"https://openalex.org/C2779172887","wikidata":"https://www.wikidata.org/wiki/Q184316","display_name":"PageRank","level":2,"score":0.7387464046478271},{"id":"https://openalex.org/C75165309","wikidata":"https://www.wikidata.org/wiki/Q2258979","display_name":"Search engine indexing","level":2,"score":0.6526644229888916},{"id":"https://openalex.org/C136764020","wikidata":"https://www.wikidata.org/wiki/Q466","display_name":"World Wide Web","level":1,"score":0.5676099061965942},{"id":"https://openalex.org/C21959979","wikidata":"https://www.wikidata.org/wiki/Q36774","display_name":"Web page","level":2,"score":0.539401650428772},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.5361551642417908},{"id":"https://openalex.org/C2779960059","wikidata":"https://www.wikidata.org/wiki/Q7113681","display_name":"Overhead (engineering)","level":2,"score":0.5298687815666199},{"id":"https://openalex.org/C2780586970","wikidata":"https://www.wikidata.org/wiki/Q1357284","display_name":"Popularity","level":2,"score":0.5085400342941284},{"id":"https://openalex.org/C140779682","wikidata":"https://www.wikidata.org/wiki/Q210868","display_name":"Sampling (signal processing)","level":3,"score":0.47980859875679016},{"id":"https://openalex.org/C97854310","wikidata":"https://www.wikidata.org/wiki/Q19541","display_name":"Search engine","level":2,"score":0.47245195508003235},{"id":"https://openalex.org/C11392498","wikidata":"https://www.wikidata.org/wiki/Q11288","display_name":"Web server","level":3,"score":0.4670623242855072},{"id":"https://openalex.org/C521815418","wikidata":"https://www.wikidata.org/wiki/Q4182287","display_name":"Web search engine","level":4,"score":0.41614794731140137},{"id":"https://openalex.org/C77088390","wikidata":"https://www.wikidata.org/wiki/Q8513","display_name":"Database","level":1,"score":0.3762730360031128},{"id":"https://openalex.org/C110875604","wikidata":"https://www.wikidata.org/wiki/Q75","display_name":"The Internet","level":2,"score":0.21071270108222961},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.1439831256866455},{"id":"https://openalex.org/C61096286","wikidata":"https://www.wikidata.org/wiki/Q7978592","display_name":"Web navigation","level":3,"score":0.14113396406173706},{"id":"https://openalex.org/C106131492","wikidata":"https://www.wikidata.org/wiki/Q3072260","display_name":"Filter (signal processing)","level":2,"score":0.12402895092964172},{"id":"https://openalex.org/C77805123","wikidata":"https://www.wikidata.org/wiki/Q161272","display_name":"Social psychology","level":1,"score":0.0},{"id":"https://openalex.org/C105702510","wikidata":"https://www.wikidata.org/wiki/Q514","display_name":"Anatomy","level":1,"score":0.0},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.0},{"id":"https://openalex.org/C71924100","wikidata":"https://www.wikidata.org/wiki/Q11190","display_name":"Medicine","level":0,"score":0.0},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/1462148.1462151","is_oa":false,"landing_page_url":"https://doi.org/10.1145/1462148.1462151","pdf_url":null,"source":{"id":"https://openalex.org/S131231701","display_name":"ACM Transactions on the Web","issn_l":"1559-1131","issn":["1559-1131","1559-114X"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ACM Transactions on the Web","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":29,"referenced_works":["https://openalex.org/W272982077","https://openalex.org/W938539187","https://openalex.org/W1484413656","https://openalex.org/W1506285740","https://openalex.org/W1536700701","https://openalex.org/W1546842576","https://openalex.org/W1557168499","https://openalex.org/W1597291463","https://openalex.org/W1965657003","https://openalex.org/W1990061958","https://openalex.org/W1991548210","https://openalex.org/W1997438973","https://openalex.org/W2007842132","https://openalex.org/W2011039300","https://openalex.org/W2046166587","https://openalex.org/W2087303323","https://openalex.org/W2136774890","https://openalex.org/W2141359307","https://openalex.org/W2152505903","https://openalex.org/W2152565070","https://openalex.org/W2161532103","https://openalex.org/W2164634022","https://openalex.org/W2169189540","https://openalex.org/W2171500543","https://openalex.org/W2187227553","https://openalex.org/W2401330804","https://openalex.org/W2737168042","https://openalex.org/W3138771162","https://openalex.org/W4250366158"],"related_works":["https://openalex.org/W2099585315","https://openalex.org/W2783231802","https://openalex.org/W2898209107","https://openalex.org/W2772576376","https://openalex.org/W1598793433","https://openalex.org/W2297488618","https://openalex.org/W2490951471","https://openalex.org/W2378926581","https://openalex.org/W2556247455","https://openalex.org/W2013761541"],"abstract_inverted_index":{"We":[0,39],"consider":[1],"the":[2,31,105,114],"problem":[3],"of":[4,107,116],"DUST:":[5],"Different":[6],"URLs":[7,13],"with":[8],"Similar":[9],"Text.":[10],"Such":[11],"duplicate":[12],"are":[14,62],"prevalent":[15],"in":[16],"Web":[17,20,77,93],"sites,":[18],"as":[19,120],"server":[21,78],"software":[22],"often":[23],"uses":[24],"aliases":[25],"and":[26,28,112],"redirections,":[27],"dynamically":[29],"generates":[30],"same":[32],"page":[33,82],"from":[34,72,99],"various":[35],"different":[36],"URL":[37,58],"requests.":[38],"present":[40],"a":[41,56],"novel":[42],"algorithm,":[43],"DustBuster":[44,68],",":[45],"for":[46,51],"uncovering":[47],"DUST;":[48],"that":[49,54,61],"is,":[50],"discovering":[52],"rules":[53,86],"transform":[55],"given":[57],"to":[59,64,103],"others":[60],"likely":[63],"have":[65],"similar":[66],"content.":[67],"mines":[69],"DUST":[70,102],"effectively":[71],"previous":[73],"crawl":[74],"logs":[75],"or":[76],"logs,":[79],"without":[80],"/examining":[81],"contents.":[83],"Verifying":[84],"these":[85],"via":[87],"sampling":[88],"requires":[89],"fetching":[90],"few":[91],"actual":[92],"pages.":[94],"Search":[95],"engines":[96],"can":[97],"benefit":[98],"information":[100],"about":[101],"increase":[104],"effectiveness":[106],"crawling,":[108],"reduce":[109],"indexing":[110],"overhead,":[111],"improve":[113],"quality":[115],"popularity":[117],"statistics":[118],"such":[119],"PageRank.":[121]},"counts_by_year":[{"year":2021,"cited_by_count":2},{"year":2020,"cited_by_count":3},{"year":2019,"cited_by_count":3},{"year":2018,"cited_by_count":4},{"year":2017,"cited_by_count":4},{"year":2016,"cited_by_count":5},{"year":2015,"cited_by_count":5},{"year":2014,"cited_by_count":2},{"year":2013,"cited_by_count":2}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
