{"id":"https://openalex.org/W4237528028","doi":"https://doi.org/10.1109/jcdl.2014.6970188","title":"Finding pages on the unarchived Web","display_name":"Finding pages on the unarchived Web","publication_year":2014,"publication_date":"2014-09-01","ids":{"openalex":"https://openalex.org/W4237528028","doi":"https://doi.org/10.1109/jcdl.2014.6970188"},"language":"en","primary_location":{"id":"doi:10.1109/jcdl.2014.6970188","is_oa":false,"landing_page_url":"https://doi.org/10.1109/jcdl.2014.6970188","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE/ACM Joint Conference on Digital Libraries","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://ir.cwi.nl/pub/23336/23336A.pdf","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5054097060","display_name":"Hugo Huurdeman","orcid":"https://orcid.org/0000-0002-3027-9597"},"institutions":[{"id":"https://openalex.org/I887064364","display_name":"University of Amsterdam","ror":"https://ror.org/04dkp9463","country_code":"NL","type":"education","lineage":["https://openalex.org/I887064364"]},{"id":"https://openalex.org/I865915315","display_name":"Vrije Universiteit Amsterdam","ror":"https://ror.org/008xxew50","country_code":"NL","type":"education","lineage":["https://openalex.org/I865915315"]}],"countries":["NL"],"is_corresponding":true,"raw_author_name":"Hugo C. Huurdeman","raw_affiliation_strings":["Universiteit van Amsterdam, Amsterdam, Noord-Holland, NL"],"affiliations":[{"raw_affiliation_string":"Universiteit van Amsterdam, Amsterdam, Noord-Holland, NL","institution_ids":["https://openalex.org/I865915315","https://openalex.org/I887064364"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5058124570","display_name":"Anat Ben-David","orcid":"https://orcid.org/0000-0003-4510-5634"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Anat Ben-David","raw_affiliation_strings":["Universlty of Amsterdam, Amsterdam, The Netherlands"],"affiliations":[{"raw_affiliation_string":"Universlty of Amsterdam, Amsterdam, The Netherlands","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5044511901","display_name":"Jaap Kamps","orcid":"https://orcid.org/0000-0002-6614-0087"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jaap Kamps","raw_affiliation_strings":["Universlty of Amsterdam, Amsterdam, The Netherlands"],"affiliations":[{"raw_affiliation_string":"Universlty of Amsterdam, Amsterdam, The Netherlands","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5046069889","display_name":"Thaer Samar","orcid":"https://orcid.org/0000-0002-9872-5258"},"institutions":[{"id":"https://openalex.org/I1341640284","display_name":"Centrum Wiskunde & Informatica","ror":"https://ror.org/00x7ekv49","country_code":"NL","type":"facility","lineage":["https://openalex.org/I1341640284","https://openalex.org/I2800991832"]}],"countries":["NL"],"is_corresponding":false,"raw_author_name":"Thaer Samar","raw_affiliation_strings":["Centrum wiskunde & Informatica, Amsterdam, The Netherlands"],"affiliations":[{"raw_affiliation_string":"Centrum wiskunde & Informatica, Amsterdam, The Netherlands","institution_ids":["https://openalex.org/I1341640284"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5060259737","display_name":"Arjen P. de Vries","orcid":"https://orcid.org/0000-0002-2888-4202"},"institutions":[{"id":"https://openalex.org/I1341640284","display_name":"Centrum Wiskunde & Informatica","ror":"https://ror.org/00x7ekv49","country_code":"NL","type":"facility","lineage":["https://openalex.org/I1341640284","https://openalex.org/I2800991832"]}],"countries":["NL"],"is_corresponding":false,"raw_author_name":"Arjen P. de Vries","raw_affiliation_strings":["Centrum wiskunde & Informatica, Amsterdam, The Netherlands"],"affiliations":[{"raw_affiliation_string":"Centrum wiskunde & Informatica, Amsterdam, The Netherlands","institution_ids":["https://openalex.org/I1341640284"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5054097060"],"corresponding_institution_ids":["https://openalex.org/I865915315","https://openalex.org/I887064364"],"apc_list":null,"apc_paid":null,"fwci":1.6361,"has_fulltext":true,"cited_by_count":5,"citation_normalized_percentile":{"value":0.90278915,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":90,"max":96},"biblio":{"volume":"9","issue":null,"first_page":"331","last_page":"340"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12016","display_name":"Web Data Mining and Analysis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12016","display_name":"Web Data Mining and Analysis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10286","display_name":"Information Retrieval and Search Behavior","score":0.9882000088691711,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11478","display_name":"Caching and Content Delivery","score":0.9847999811172485,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/web-page","display_name":"Web page","score":0.8039290904998779},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7578134536743164},{"id":"https://openalex.org/keywords/crawling","display_name":"Crawling","score":0.7407727241516113},{"id":"https://openalex.org/keywords/world-wide-web","display_name":"World Wide Web","score":0.7175443172454834},{"id":"https://openalex.org/keywords/web-crawler","display_name":"Web crawler","score":0.7081645131111145},{"id":"https://openalex.org/keywords/hits-algorithm","display_name":"HITS algorithm","score":0.6198282241821289},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.5938400030136108},{"id":"https://openalex.org/keywords/web-search-engine","display_name":"Web search engine","score":0.5134771466255188},{"id":"https://openalex.org/keywords/selection","display_name":"Selection (genetic algorithm)","score":0.49522653222084045},{"id":"https://openalex.org/keywords/static-web-page","display_name":"Static web page","score":0.49449774622917175},{"id":"https://openalex.org/keywords/set","display_name":"Set (abstract data type)","score":0.4646109640598297},{"id":"https://openalex.org/keywords/site-map","display_name":"Site map","score":0.41144201159477234},{"id":"https://openalex.org/keywords/web-development","display_name":"Web development","score":0.2712366282939911},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.1327294409275055}],"concepts":[{"id":"https://openalex.org/C21959979","wikidata":"https://www.wikidata.org/wiki/Q36774","display_name":"Web page","level":2,"score":0.8039290904998779},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7578134536743164},{"id":"https://openalex.org/C100368936","wikidata":"https://www.wikidata.org/wiki/Q1411725","display_name":"Crawling","level":2,"score":0.7407727241516113},{"id":"https://openalex.org/C136764020","wikidata":"https://www.wikidata.org/wiki/Q466","display_name":"World Wide Web","level":1,"score":0.7175443172454834},{"id":"https://openalex.org/C13743948","wikidata":"https://www.wikidata.org/wiki/Q45842","display_name":"Web crawler","level":2,"score":0.7081645131111145},{"id":"https://openalex.org/C195409031","wikidata":"https://www.wikidata.org/wiki/Q1031957","display_name":"HITS algorithm","level":5,"score":0.6198282241821289},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.5938400030136108},{"id":"https://openalex.org/C521815418","wikidata":"https://www.wikidata.org/wiki/Q4182287","display_name":"Web search engine","level":4,"score":0.5134771466255188},{"id":"https://openalex.org/C81917197","wikidata":"https://www.wikidata.org/wiki/Q628760","display_name":"Selection (genetic algorithm)","level":2,"score":0.49522653222084045},{"id":"https://openalex.org/C173576120","wikidata":"https://www.wikidata.org/wiki/Q2641220","display_name":"Static web page","level":4,"score":0.49449774622917175},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.4646109640598297},{"id":"https://openalex.org/C67617509","wikidata":"https://www.wikidata.org/wiki/Q1503327","display_name":"Site map","level":5,"score":0.41144201159477234},{"id":"https://openalex.org/C79373723","wikidata":"https://www.wikidata.org/wiki/Q386275","display_name":"Web development","level":3,"score":0.2712366282939911},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.1327294409275055},{"id":"https://openalex.org/C105702510","wikidata":"https://www.wikidata.org/wiki/Q514","display_name":"Anatomy","level":1,"score":0.0},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.0},{"id":"https://openalex.org/C71924100","wikidata":"https://www.wikidata.org/wiki/Q11190","display_name":"Medicine","level":0,"score":0.0}],"mesh":[],"locations_count":4,"locations":[{"id":"doi:10.1109/jcdl.2014.6970188","is_oa":false,"landing_page_url":"https://doi.org/10.1109/jcdl.2014.6970188","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE/ACM Joint Conference on Digital Libraries","raw_type":"proceedings-article"},{"id":"pmh:oai:dare.uva.nl:openaire_cris_publications/98abd85e-42f5-4b39-a967-dcdf9432d641","is_oa":false,"landing_page_url":"https://dare.uva.nl/personal/pure/en/publications/finding-pages-on-the-unarchived-web(98abd85e-42f5-4b39-a967-dcdf9432d641).html","pdf_url":null,"source":{"id":"https://openalex.org/S4306400088","display_name":"UvA-DARE (University of Amsterdam)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I887064364","host_organization_name":"University of Amsterdam","host_organization_lineage":["https://openalex.org/I887064364"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Huurdeman, H C, Ben-David, A, Kamps, J, Samar, T & de Vries, A P 2014, Finding Pages on the Unarchived Web. in 2014 IEEE/ACM Joint Conference on Digital Libraries (JCDL) : 8th-12th September 2014, City University London, London, United Kingdom. [Piscataway, NJ], pp. 331-340, IEEE/ACM Joint Conference on Digital Libraries 2014 (JCDL), 8/09/14. https://doi.org/10.1109/JCDL.2014.6970188","raw_type":"info:eu-repo/semantics/publishedVersion"},{"id":"pmh:oai:cwi.nl:23336","is_oa":true,"landing_page_url":"https://ir.cwi.nl/pub/23336","pdf_url":"https://ir.cwi.nl/pub/23336/23336A.pdf","source":{"id":"https://openalex.org/S7407055335","display_name":"Centrum Wiskunde & Informatica (CWI), the national research institute for mathematics and computer science in the Netherlands","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"info:eu-repo/semantics/conferenceObject"},{"id":"pmh:oai:dare.uva.nl:publications/98abd85e-42f5-4b39-a967-dcdf9432d641","is_oa":false,"landing_page_url":"https://handle.uba.uva.nl/personal/pure/en/publications/finding-pages-on-the-unarchived-web(98abd85e-42f5-4b39-a967-dcdf9432d641).html","pdf_url":null,"source":{"id":"https://openalex.org/S4306400088","display_name":"UvA-DARE (University of Amsterdam)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I887064364","host_organization_name":"University of Amsterdam","host_organization_lineage":["https://openalex.org/I887064364"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"Huurdeman, H C, Ben-David, A, Kamps, J, Samar, T & de Vries, A P 2014, Finding Pages on the Unarchived Web. in 2014 IEEE/ACM Joint Conference on Digital Libraries (JCDL) : 8th-12th September 2014, City University London, London, United Kingdom. [Piscataway, NJ], pp. 331-340, IEEE/ACM Joint Conference on Digital Libraries 2014 (JCDL), 8/09/14. https://doi.org/10.1109/JCDL.2014.6970188","raw_type":"info:eu-repo/semantics/publishedVersion"}],"best_oa_location":{"id":"pmh:oai:cwi.nl:23336","is_oa":true,"landing_page_url":"https://ir.cwi.nl/pub/23336","pdf_url":"https://ir.cwi.nl/pub/23336/23336A.pdf","source":{"id":"https://openalex.org/S7407055335","display_name":"Centrum Wiskunde & Informatica (CWI), the national research institute for mathematics and computer science in the Netherlands","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"info:eu-repo/semantics/conferenceObject"},"sustainable_development_goals":[{"display_name":"Reduced inequalities","score":0.46000000834465027,"id":"https://metadata.un.org/sdg/10"}],"awards":[{"id":"https://openalex.org/G2544847368","display_name":null,"funder_award_id":"640.005.001","funder_id":"https://openalex.org/F4320321800","funder_display_name":"Nederlandse Organisatie voor Wetenschappelijk Onderzoek"}],"funders":[{"id":"https://openalex.org/F4320321800","display_name":"Nederlandse Organisatie voor Wetenschappelijk Onderzoek","ror":"https://ror.org/04jsz6e67"}],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4237528028.pdf","grobid_xml":"https://content.openalex.org/works/W4237528028.grobid-xml"},"referenced_works_count":23,"referenced_works":["https://openalex.org/W751588","https://openalex.org/W1489893579","https://openalex.org/W1564027019","https://openalex.org/W1587224754","https://openalex.org/W1854214752","https://openalex.org/W1971772794","https://openalex.org/W2007571138","https://openalex.org/W2009140921","https://openalex.org/W2019194162","https://openalex.org/W2049718889","https://openalex.org/W2060370185","https://openalex.org/W2076086413","https://openalex.org/W2138621811","https://openalex.org/W2147057843","https://openalex.org/W2147872511","https://openalex.org/W2164052363","https://openalex.org/W2171710828","https://openalex.org/W2969798785","https://openalex.org/W4235567453","https://openalex.org/W4237417907","https://openalex.org/W4249960090","https://openalex.org/W6600029817","https://openalex.org/W6633977470"],"related_works":["https://openalex.org/W1527023341","https://openalex.org/W1486482441","https://openalex.org/W2008577673","https://openalex.org/W1536079598","https://openalex.org/W2371838993","https://openalex.org/W2088345472","https://openalex.org/W4385753583","https://openalex.org/W2172076436","https://openalex.org/W2997495867","https://openalex.org/W2377634017"],"abstract_inverted_index":{"Web":[0,25,82],"archives":[1],"preserve":[2],"the":[3,24,45,60,71,80,97,103,123,129,141,155],"fast":[4],"changing":[5],"Web,":[6,47],"yet":[7],"are":[8,77],"highly":[9,110],"incomplete":[10],"due":[11],"to":[12,31,40,136],"crawling":[13,15],"restrictions,":[14],"depth":[16],"and":[17,28,57,65,92,105],"frequency,":[18],"or":[19],"restrictive":[20],"selection":[21],"policies\u2014most":[22],"of":[23,44,51,62,85,89,99],"is":[26,132],"unarchived":[27,46,90,142],"therefore":[29],"lost":[30],"posterity.":[32],"In":[33],"this":[34,68],"paper,":[35],"we":[36,149],"propose":[37],"an":[38],"approach":[39,69],"recover":[41],"significant":[42],"parts":[43],"by":[48],"reconstructing":[49],"descriptions":[50,107],"these":[52,152],"pages":[53,91,114,118,139,153],"based":[54],"on":[55,70,140,158],"links":[56],"anchors":[58],"in":[59,144],"set":[61],"crawled":[63,81],"pages,":[64],"experiment":[66],"with":[67],"DutchWeb":[72],"archive.":[73,101],"Our":[74],"main":[75],"findings":[76],"threefold.":[78],"First,":[79],"contains":[83],"evidence":[84],"a":[86,109,145],"remarkable":[87],"number":[88],"websites,":[93],"potentially":[94],"dramatically":[95],"increasing":[96],"coverage":[98],"theWeb":[100],"Second,":[102],"link":[104],"anchor":[106],"have":[108,119],"skewed":[111],"distribution:":[112],"popular":[113],"such":[115],"as":[116],"home":[117],"more":[120],"terms,":[121],"but":[122],"richness":[124],"tapers":[125],"off":[126],"quickly.":[127],"Third,":[128],"succinct":[130],"representation":[131],"generally":[133],"rich":[134],"enough":[135],"uniquely":[137],"identify":[138],"Web:":[143],"known-item":[146],"search":[147],"setting":[148],"can":[150],"retrieve":[151],"within":[154],"first":[156],"ranks":[157],"average.":[159]},"counts_by_year":[{"year":2019,"cited_by_count":2},{"year":2018,"cited_by_count":1},{"year":2015,"cited_by_count":1},{"year":2014,"cited_by_count":1}],"updated_date":"2026-04-10T15:06:20.359241","created_date":"2025-10-10T00:00:00"}
