{"id":"https://openalex.org/W1985854646","doi":"https://doi.org/10.1145/1183550.1183560","title":"Efficient, automatic web resource harvesting","display_name":"Efficient, automatic web resource harvesting","publication_year":2006,"publication_date":"2006-11-10","ids":{"openalex":"https://openalex.org/W1985854646","doi":"https://doi.org/10.1145/1183550.1183560","mag":"1985854646"},"language":"en","primary_location":{"id":"doi:10.1145/1183550.1183560","is_oa":false,"landing_page_url":"https://doi.org/10.1145/1183550.1183560","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 8th annual ACM international workshop on Web information and data management","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5081811192","display_name":"Michael L. Nelson","orcid":"https://orcid.org/0000-0003-3749-8116"},"institutions":[{"id":"https://openalex.org/I81365321","display_name":"Old Dominion University","ror":"https://ror.org/04zjtrb98","country_code":"US","type":"education","lineage":["https://openalex.org/I81365321"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Michael L. Nelson","raw_affiliation_strings":["Old Dominion University, Norfolk VA","|| Old Dominion University, Norfolk, VA"],"affiliations":[{"raw_affiliation_string":"Old Dominion University, Norfolk VA","institution_ids":["https://openalex.org/I81365321"]},{"raw_affiliation_string":"|| Old Dominion University, Norfolk, VA","institution_ids":["https://openalex.org/I81365321"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5113726511","display_name":"Joan A. Smith","orcid":null},"institutions":[{"id":"https://openalex.org/I81365321","display_name":"Old Dominion University","ror":"https://ror.org/04zjtrb98","country_code":"US","type":"education","lineage":["https://openalex.org/I81365321"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Joan A. Smith","raw_affiliation_strings":["Old Dominion University, Norfolk VA","|| Old Dominion University, Norfolk, VA"],"affiliations":[{"raw_affiliation_string":"Old Dominion University, Norfolk VA","institution_ids":["https://openalex.org/I81365321"]},{"raw_affiliation_string":"|| Old Dominion University, Norfolk, VA","institution_ids":["https://openalex.org/I81365321"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5049555048","display_name":"Ignacio Garcia del Campo","orcid":null},"institutions":[{"id":"https://openalex.org/I81365321","display_name":"Old Dominion University","ror":"https://ror.org/04zjtrb98","country_code":"US","type":"education","lineage":["https://openalex.org/I81365321"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Ignacio Garcia del Campo","raw_affiliation_strings":["Old Dominion University, Norfolk VA","|| Old Dominion University, Norfolk, VA"],"affiliations":[{"raw_affiliation_string":"Old Dominion University, Norfolk VA","institution_ids":["https://openalex.org/I81365321"]},{"raw_affiliation_string":"|| Old Dominion University, Norfolk, VA","institution_ids":["https://openalex.org/I81365321"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5081811192"],"corresponding_institution_ids":["https://openalex.org/I81365321"],"apc_list":null,"apc_paid":null,"fwci":10.3277,"has_fulltext":false,"cited_by_count":21,"citation_normalized_percentile":{"value":0.97568294,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":89,"max":96},"biblio":{"volume":"11","issue":null,"first_page":"43","last_page":"50"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12016","display_name":"Web Data Mining and Analysis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12016","display_name":"Web Data Mining and Analysis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11241","display_name":"Advanced Malware Detection Techniques","score":0.991599977016449,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11644","display_name":"Spam and Phishing Detection","score":0.9704999923706055,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8863452672958374},{"id":"https://openalex.org/keywords/web-crawler","display_name":"Web crawler","score":0.7717329263687134},{"id":"https://openalex.org/keywords/metadata","display_name":"Metadata","score":0.702668309211731},{"id":"https://openalex.org/keywords/world-wide-web","display_name":"World Wide Web","score":0.6839581727981567},{"id":"https://openalex.org/keywords/crawling","display_name":"Crawling","score":0.6763569116592407},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.6104149222373962},{"id":"https://openalex.org/keywords/listing","display_name":"Listing (finance)","score":0.5199698209762573},{"id":"https://openalex.org/keywords/web-resource","display_name":"Web resource","score":0.4527641832828522},{"id":"https://openalex.org/keywords/database","display_name":"Database","score":0.35461530089378357}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8863452672958374},{"id":"https://openalex.org/C13743948","wikidata":"https://www.wikidata.org/wiki/Q45842","display_name":"Web crawler","level":2,"score":0.7717329263687134},{"id":"https://openalex.org/C93518851","wikidata":"https://www.wikidata.org/wiki/Q180160","display_name":"Metadata","level":2,"score":0.702668309211731},{"id":"https://openalex.org/C136764020","wikidata":"https://www.wikidata.org/wiki/Q466","display_name":"World Wide Web","level":1,"score":0.6839581727981567},{"id":"https://openalex.org/C100368936","wikidata":"https://www.wikidata.org/wiki/Q1411725","display_name":"Crawling","level":2,"score":0.6763569116592407},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.6104149222373962},{"id":"https://openalex.org/C2779820595","wikidata":"https://www.wikidata.org/wiki/Q798505","display_name":"Listing (finance)","level":2,"score":0.5199698209762573},{"id":"https://openalex.org/C65603577","wikidata":"https://www.wikidata.org/wiki/Q3427877","display_name":"Web resource","level":2,"score":0.4527641832828522},{"id":"https://openalex.org/C77088390","wikidata":"https://www.wikidata.org/wiki/Q8513","display_name":"Database","level":1,"score":0.35461530089378357},{"id":"https://openalex.org/C10138342","wikidata":"https://www.wikidata.org/wiki/Q43015","display_name":"Finance","level":1,"score":0.0},{"id":"https://openalex.org/C162324750","wikidata":"https://www.wikidata.org/wiki/Q8134","display_name":"Economics","level":0,"score":0.0},{"id":"https://openalex.org/C105702510","wikidata":"https://www.wikidata.org/wiki/Q514","display_name":"Anatomy","level":1,"score":0.0},{"id":"https://openalex.org/C71924100","wikidata":"https://www.wikidata.org/wiki/Q11190","display_name":"Medicine","level":0,"score":0.0}],"mesh":[],"locations_count":3,"locations":[{"id":"doi:10.1145/1183550.1183560","is_oa":false,"landing_page_url":"https://doi.org/10.1145/1183550.1183560","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 8th annual ACM international workshop on Web information and data management","raw_type":"proceedings-article"},{"id":"pmh:oai:CiteSeerX.psu:10.1.1.102.4388","is_oa":false,"landing_page_url":"http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.102.4388","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"http://public.lanl.gov/herbertv/papers/f140-nelson.pdf","raw_type":"text"},{"id":"pmh:oai:CiteSeerX.psu:10.1.1.76.3065","is_oa":false,"landing_page_url":"http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.76.3065","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"http://www.cs.odu.edu/~mln/pubs/widm-2006/modoai-widm06.pdf","raw_type":"text"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[{"id":"https://openalex.org/F4320306146","display_name":"Andrew W. Mellon Foundation","ror":"https://ror.org/04jsh2530"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":37,"referenced_works":["https://openalex.org/W1494059689","https://openalex.org/W1651432567","https://openalex.org/W1780091961","https://openalex.org/W1782966390","https://openalex.org/W1958473611","https://openalex.org/W1976624301","https://openalex.org/W1981333922","https://openalex.org/W2001327846","https://openalex.org/W2029341294","https://openalex.org/W2033015026","https://openalex.org/W2038378248","https://openalex.org/W2039324760","https://openalex.org/W2039884530","https://openalex.org/W2044946777","https://openalex.org/W2055883318","https://openalex.org/W2059784683","https://openalex.org/W2070573790","https://openalex.org/W2080676333","https://openalex.org/W2080963341","https://openalex.org/W2081948558","https://openalex.org/W2082589987","https://openalex.org/W2094930182","https://openalex.org/W2097359597","https://openalex.org/W2101227978","https://openalex.org/W2105533042","https://openalex.org/W2114849539","https://openalex.org/W2119088569","https://openalex.org/W2125969310","https://openalex.org/W2130610812","https://openalex.org/W2147533843","https://openalex.org/W2148177897","https://openalex.org/W2157748587","https://openalex.org/W2170188121","https://openalex.org/W2221553715","https://openalex.org/W2913068797","https://openalex.org/W4361024420","https://openalex.org/W6665504316"],"related_works":["https://openalex.org/W3119324922","https://openalex.org/W2352686120","https://openalex.org/W2372594123","https://openalex.org/W2358310581","https://openalex.org/W2964752624","https://openalex.org/W2026132847","https://openalex.org/W4385695127","https://openalex.org/W2137810919","https://openalex.org/W2089702591","https://openalex.org/W4255854114"],"abstract_inverted_index":{"There":[0],"are":[1,37],"two":[2,54],"problems":[3,55],"associated":[4],"with":[5,147],"conventional":[6],"web":[7,20,79,104,144],"crawling":[8],"techniques:":[9],"a":[10,18,103],"crawler":[11],"cannot":[12],"know":[13],"if":[14],"all":[15,99],"resources":[16,36,145],"at":[17,102],"non-trivial":[19],"site":[21],"have":[22],"been":[23],"discovered":[24],"and":[25,30,70,106,110,123],"crawled":[26],"(\"the":[27,44],"counting":[28,95],"problem\")":[29],"the":[31,35,61,78,84,94,135],"human-readable":[32],"format":[33],"of":[34,143],"not":[38],"always":[39],"suitable":[40],"for":[41,59,66,120],"machine":[42],"processing":[43],"representation":[45,136],"problem\").":[46],"We":[47,82],"introduce":[48],"an":[49],"approach":[50],"that":[51],"solves":[52],"these":[53],"by":[56,97,138],"implementing":[57],"support":[58],"both":[60],"Open":[62],"Archives":[63],"Initiative":[64],"Protocol":[65],"Metadata":[67],"Harvesting":[68],"(OAI-PMH)":[69],"MPEG-21":[71,153],"Digital":[72],"Item":[73],"Declaration":[74],"Language":[75],"(DIDL)":[76],"into":[77],"server":[80,105],"itself.":[81],"present":[83],"Apache":[85],"module":[86],"\"mod_oai\",":[87],"which":[88],"can":[89],"be":[90,131],"used":[91,132],"to":[92,133],"address":[93,134],"problem":[96,137],"listing":[98],"valid":[100],"URIs":[101],"efficiently":[107],"discovering":[108],"updates":[109],"additions":[111],"on":[112],"subsequent":[113],"crawls.":[114],"Our":[115],"experiments":[116],"indicated":[117],"comparable":[118],"performance":[119],"initial":[121],"crawls,":[122],"dramatic":[124],"increases":[125],"in":[126,152],"update":[127],"speed":[128],"mod_oaican":[129],"also":[130],"providing":[139],"\"preservation":[140],"ready\"":[141],"versions":[142],"aggregated":[146],"their":[148],"respective":[149],"forensic":[150],"metadata":[151],"DIDL":[154],"format.":[155]},"counts_by_year":[{"year":2021,"cited_by_count":1},{"year":2020,"cited_by_count":1},{"year":2015,"cited_by_count":1},{"year":2012,"cited_by_count":2}],"updated_date":"2026-04-04T16:13:02.066488","created_date":"2025-10-10T00:00:00"}
