{"id":"https://openalex.org/W7106250948","doi":"https://doi.org/10.1145/3730567.3764471","title":"Scrapers Selectively Respect robots.txt Directives: Evidence From a Large-Scale Empirical Study","display_name":"Scrapers Selectively Respect robots.txt Directives: Evidence From a Large-Scale Empirical Study","publication_year":2025,"publication_date":"2025-10-28","ids":{"openalex":"https://openalex.org/W7106250948","doi":"https://doi.org/10.1145/3730567.3764471"},"language":null,"primary_location":{"id":"doi:10.1145/3730567.3764471","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3730567.3764471","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2025 ACM Internet Measurement Conference","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://doi.org/10.1145/3730567.3764471","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Taein Kim","orcid":"https://orcid.org/0009-0000-3502-3331"},"institutions":[{"id":"https://openalex.org/I170897317","display_name":"Duke University","ror":"https://ror.org/00py81415","country_code":"US","type":"education","lineage":["https://openalex.org/I170897317"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Taein Kim","raw_affiliation_strings":["Department of Electrical and Computer Engineering, Duke University, Durham, NC, USA"],"raw_orcid":"https://orcid.org/0009-0000-3502-3331","affiliations":[{"raw_affiliation_string":"Department of Electrical and Computer Engineering, Duke University, Durham, NC, USA","institution_ids":["https://openalex.org/I170897317"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Karstan Bock","orcid":"https://orcid.org/0009-0004-2613-4514"},"institutions":[{"id":"https://openalex.org/I170897317","display_name":"Duke University","ror":"https://ror.org/00py81415","country_code":"US","type":"education","lineage":["https://openalex.org/I170897317"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Karstan Bock","raw_affiliation_strings":["Department of Electrical and Computer Engineering, Duke University, Durham, NC, USA"],"raw_orcid":"https://orcid.org/0009-0004-2613-4514","affiliations":[{"raw_affiliation_string":"Department of Electrical and Computer Engineering, Duke University, Durham, NC, USA","institution_ids":["https://openalex.org/I170897317"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Claire Luo","orcid":"https://orcid.org/0009-0001-1734-8017"},"institutions":[{"id":"https://openalex.org/I170897317","display_name":"Duke University","ror":"https://ror.org/00py81415","country_code":"US","type":"education","lineage":["https://openalex.org/I170897317"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Claire Luo","raw_affiliation_strings":["Department of Electrical and Computer Engineering, Duke University, Durham, NC, USA"],"raw_orcid":"https://orcid.org/0009-0001-1734-8017","affiliations":[{"raw_affiliation_string":"Department of Electrical and Computer Engineering, Duke University, Durham, NC, USA","institution_ids":["https://openalex.org/I170897317"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Amanda Liswood","orcid":"https://orcid.org/0009-0003-8149-8962"},"institutions":[{"id":"https://openalex.org/I170897317","display_name":"Duke University","ror":"https://ror.org/00py81415","country_code":"US","type":"education","lineage":["https://openalex.org/I170897317"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Amanda Liswood","raw_affiliation_strings":["Department of Electrical and Computer Engineering, Duke University, Durham, NC, USA"],"raw_orcid":"https://orcid.org/0009-0003-8149-8962","affiliations":[{"raw_affiliation_string":"Department of Electrical and Computer Engineering, Duke University, Durham, NC, USA","institution_ids":["https://openalex.org/I170897317"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Chloe Poroslay","orcid":"https://orcid.org/0009-0006-7303-8888"},"institutions":[{"id":"https://openalex.org/I170897317","display_name":"Duke University","ror":"https://ror.org/00py81415","country_code":"US","type":"education","lineage":["https://openalex.org/I170897317"]},{"id":"https://openalex.org/I2802377633","display_name":"Office of the National Coordinator for Health Information Technology","ror":"https://ror.org/02fm3sv87","country_code":"US","type":"government","lineage":["https://openalex.org/I1299022934","https://openalex.org/I2802377633"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Chloe Poroslay","raw_affiliation_strings":["Office of Information Technology, Duke University, Durham, NC, USA"],"raw_orcid":"https://orcid.org/0009-0006-7303-8888","affiliations":[{"raw_affiliation_string":"Office of Information Technology, Duke University, Durham, NC, USA","institution_ids":["https://openalex.org/I2802377633","https://openalex.org/I170897317"]}]},{"author_position":"last","author":{"id":null,"display_name":"Emily Wenger","orcid":"https://orcid.org/0009-0006-3346-8226"},"institutions":[{"id":"https://openalex.org/I170897317","display_name":"Duke University","ror":"https://ror.org/00py81415","country_code":"US","type":"education","lineage":["https://openalex.org/I170897317"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Emily Wenger","raw_affiliation_strings":["Department of Electrical and Computer Engineering, Duke University, Durham, NC, USA"],"raw_orcid":"https://orcid.org/0009-0006-3346-8226","affiliations":[{"raw_affiliation_string":"Department of Electrical and Computer Engineering, Duke University, Durham, NC, USA","institution_ids":["https://openalex.org/I170897317"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":6,"corresponding_author_ids":[],"corresponding_institution_ids":["https://openalex.org/I170897317"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.67547805,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"541","last_page":"557"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11644","display_name":"Spam and Phishing Detection","score":0.274399995803833,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11644","display_name":"Spam and Phishing Detection","score":0.274399995803833,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12016","display_name":"Web Data Mining and Analysis","score":0.273499995470047,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12519","display_name":"Cybercrime and Law Enforcement Studies","score":0.06109999865293503,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/scraper-site","display_name":"Scraper site","score":0.9519000053405762},{"id":"https://openalex.org/keywords/empirical-research","display_name":"Empirical research","score":0.6093999743461609},{"id":"https://openalex.org/keywords/protocol","display_name":"Protocol (science)","score":0.4117000102996826},{"id":"https://openalex.org/keywords/empirical-evidence","display_name":"Empirical evidence","score":0.38519999384880066},{"id":"https://openalex.org/keywords/compliance","display_name":"Compliance (psychology)","score":0.3783999979496002},{"id":"https://openalex.org/keywords/robot","display_name":"Robot","score":0.3774999976158142}],"concepts":[{"id":"https://openalex.org/C78500136","wikidata":"https://www.wikidata.org/wiki/Q477840","display_name":"Scraper site","level":2,"score":0.9519000053405762},{"id":"https://openalex.org/C120936955","wikidata":"https://www.wikidata.org/wiki/Q2155640","display_name":"Empirical research","level":2,"score":0.6093999743461609},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5285000205039978},{"id":"https://openalex.org/C38652104","wikidata":"https://www.wikidata.org/wiki/Q3510521","display_name":"Computer security","level":1,"score":0.4875999987125397},{"id":"https://openalex.org/C2780385302","wikidata":"https://www.wikidata.org/wiki/Q367158","display_name":"Protocol (science)","level":3,"score":0.4117000102996826},{"id":"https://openalex.org/C108827166","wikidata":"https://www.wikidata.org/wiki/Q175975","display_name":"Internet privacy","level":1,"score":0.4018000066280365},{"id":"https://openalex.org/C166052673","wikidata":"https://www.wikidata.org/wiki/Q83021","display_name":"Empirical evidence","level":2,"score":0.38519999384880066},{"id":"https://openalex.org/C2781460075","wikidata":"https://www.wikidata.org/wiki/Q1399332","display_name":"Compliance (psychology)","level":2,"score":0.3783999979496002},{"id":"https://openalex.org/C90509273","wikidata":"https://www.wikidata.org/wiki/Q11012","display_name":"Robot","level":2,"score":0.3774999976158142},{"id":"https://openalex.org/C136764020","wikidata":"https://www.wikidata.org/wiki/Q466","display_name":"World Wide Web","level":1,"score":0.3411000072956085},{"id":"https://openalex.org/C551386961","wikidata":"https://www.wikidata.org/wiki/Q22666","display_name":"File sharing","level":3,"score":0.28119999170303345},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.28040000796318054},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.2799000144004822},{"id":"https://openalex.org/C144133560","wikidata":"https://www.wikidata.org/wiki/Q4830453","display_name":"Business","level":0,"score":0.2741999924182892},{"id":"https://openalex.org/C2775924081","wikidata":"https://www.wikidata.org/wiki/Q55608371","display_name":"Control (management)","level":2,"score":0.2736000120639801},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.2685999870300293},{"id":"https://openalex.org/C110875604","wikidata":"https://www.wikidata.org/wiki/Q75","display_name":"The Internet","level":2,"score":0.267300009727478}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3730567.3764471","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3730567.3764471","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2025 ACM Internet Measurement Conference","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/3730567.3764471","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3730567.3764471","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2025 ACM Internet Measurement Conference","raw_type":"proceedings-article"},"sustainable_development_goals":[{"display_name":"Reduced inequalities","score":0.8162975907325745,"id":"https://metadata.un.org/sdg/10"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":1,"referenced_works":["https://openalex.org/W2463059953"],"related_works":[],"abstract_inverted_index":{"Online":[0],"data":[1],"scraping":[2,166],"has":[3],"taken":[4],"on":[5,161],"new":[6,18],"dimensions":[7],"in":[8],"recent":[9],"years,":[10],"as":[11],"traditional":[12],"scrapers":[13],"have":[14],"been":[15],"joined":[16],"by":[17],"AI-specific":[19],"bots.":[20],"To":[21,77],"counteract":[22],"unwanted":[23,165],"scraping,":[24],"many":[25,115],"sites":[26],"use":[27],"tools":[28],"like":[29],"the":[30,41,49,52,79,84,88,108,171],"Robots":[31],"Exclusion":[32],"Protocol":[33],"(REP),":[34],"which":[35],"places":[36],"a":[37,122],"robots.txt":[38,97,126,139,153,162],"file":[39],"at":[40,154],"site":[42],"root":[43],"to":[44,71,135,163],"dictate":[45],"scraper":[46,94],"behavior.":[47],"Yet,":[48],"efficacy":[50],"of":[51,83,92,110,124,145],"REP":[53],"is":[54,167],"not":[55],"well-understood.":[56],"Anecdotal":[57],"evidence":[58],"suggests":[59],"some":[60],"bots":[61,113,131],"comply":[62,136],"poorly":[63],"with":[64,96,137],"it,":[65],"but":[66],"no":[67],"rigorous":[68],"study":[69,91],"exists":[70],"support":[72],"(or":[73],"refute)":[74],"this":[75],"claim.":[76],"understand":[78],"merits":[80],"and":[81,141,169],"limits":[82],"REP,":[85],"we":[86],"conduct":[87],"first":[89],"large-scale":[90],"web":[93,101],"compliance":[95],"directives":[98],"using":[99,121],"anonymized":[100],"logs":[102],"from":[103],"our":[104],"institution.":[105],"We":[106,128],"analyze":[107],"behavior":[109],"130":[111],"self-declared":[112],"(and":[114],"anonymous":[116],"ones)":[117],"over":[118],"40":[119],"days,":[120],"series":[123],"controlled":[125],"experiments.":[127],"find":[129],"that":[130,142,159],"are":[132],"less":[133],"likely":[134],"stricter":[138],"directives,":[140],"certain":[143],"categories":[144],"bots,":[146],"including":[147],"AI":[148],"search":[149],"crawlers,":[150],"rarely":[151],"check":[152],"all.":[155],"Our":[156],"findings":[157],"suggest":[158],"relying":[160],"prevent":[164],"risky":[168],"highlight":[170],"need":[172],"for":[173],"alternatives.":[174]},"counts_by_year":[],"updated_date":"2025-11-23T05:13:22.807545","created_date":"2025-11-23T00:00:00"}
