{"id":"https://openalex.org/W1997597246","doi":"https://doi.org/10.1145/1871437.1871773","title":"Crawling the web for structured documents","display_name":"Crawling the web for structured documents","publication_year":2010,"publication_date":"2010-10-26","ids":{"openalex":"https://openalex.org/W1997597246","doi":"https://doi.org/10.1145/1871437.1871773","mag":"1997597246"},"language":"en","primary_location":{"id":"doi:10.1145/1871437.1871773","is_oa":false,"landing_page_url":"https://doi.org/10.1145/1871437.1871773","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 19th ACM international conference on Information and knowledge management","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5048779508","display_name":"Juli\u00e1n Urbano","orcid":"https://orcid.org/0000-0003-2933-1949"},"institutions":[{"id":"https://openalex.org/I50357001","display_name":"Universidad Carlos III de Madrid","ror":"https://ror.org/03ths8210","country_code":"ES","type":"education","lineage":["https://openalex.org/I50357001"]}],"countries":["ES"],"is_corresponding":true,"raw_author_name":"Juli\u00e1n Urbano","raw_affiliation_strings":["University Carlos III of Madrid, Legan\u00e9s, Spain"],"affiliations":[{"raw_affiliation_string":"University Carlos III of Madrid, Legan\u00e9s, Spain","institution_ids":["https://openalex.org/I50357001"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5109108184","display_name":"Juan Lor\u00e9ns","orcid":null},"institutions":[{"id":"https://openalex.org/I50357001","display_name":"Universidad Carlos III de Madrid","ror":"https://ror.org/03ths8210","country_code":"ES","type":"education","lineage":["https://openalex.org/I50357001"]}],"countries":["ES"],"is_corresponding":false,"raw_author_name":"Juan Lor\u00e9ns","raw_affiliation_strings":["University Carlos III of Madrid, Legan\u00e9s, Spain"],"affiliations":[{"raw_affiliation_string":"University Carlos III of Madrid, Legan\u00e9s, Spain","institution_ids":["https://openalex.org/I50357001"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5075148792","display_name":"Yorgos Andreadakis","orcid":null},"institutions":[{"id":"https://openalex.org/I50357001","display_name":"Universidad Carlos III de Madrid","ror":"https://ror.org/03ths8210","country_code":"ES","type":"education","lineage":["https://openalex.org/I50357001"]}],"countries":["ES"],"is_corresponding":false,"raw_author_name":"Yorgos Andreadakis","raw_affiliation_strings":["University Carlos III of Madrid, Legan\u00e9s, Spain"],"affiliations":[{"raw_affiliation_string":"University Carlos III of Madrid, Legan\u00e9s, Spain","institution_ids":["https://openalex.org/I50357001"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5047898572","display_name":"M\u00f3nica Marrero","orcid":"https://orcid.org/0000-0002-2359-6340"},"institutions":[{"id":"https://openalex.org/I50357001","display_name":"Universidad Carlos III de Madrid","ror":"https://ror.org/03ths8210","country_code":"ES","type":"education","lineage":["https://openalex.org/I50357001"]}],"countries":["ES"],"is_corresponding":false,"raw_author_name":"M\u00f3nica Marrero","raw_affiliation_strings":["University Carlos III of Madrid, Legan\u00e9s, Spain"],"affiliations":[{"raw_affiliation_string":"University Carlos III of Madrid, Legan\u00e9s, Spain","institution_ids":["https://openalex.org/I50357001"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5048779508"],"corresponding_institution_ids":["https://openalex.org/I50357001"],"apc_list":null,"apc_paid":null,"fwci":2.1445,"has_fulltext":false,"cited_by_count":5,"citation_normalized_percentile":{"value":0.90882121,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":89,"max":94},"biblio":{"volume":null,"issue":null,"first_page":"1939","last_page":"1940"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12016","display_name":"Web Data Mining and Analysis","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12016","display_name":"Web Data Mining and Analysis","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10317","display_name":"Advanced Database Systems and Queries","score":0.9951000213623047,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10286","display_name":"Information Retrieval and Search Behavior","score":0.9941999912261963,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8167037963867188},{"id":"https://openalex.org/keywords/world-wide-web","display_name":"World Wide Web","score":0.7669733762741089},{"id":"https://openalex.org/keywords/web-crawler","display_name":"Web crawler","score":0.6178495287895203},{"id":"https://openalex.org/keywords/xml","display_name":"XML","score":0.5346651673316956},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.5145549178123474},{"id":"https://openalex.org/keywords/semantic-web","display_name":"Semantic Web","score":0.4531833827495575},{"id":"https://openalex.org/keywords/rdf","display_name":"RDF","score":0.4287964701652527},{"id":"https://openalex.org/keywords/web-standards","display_name":"Web standards","score":0.41120463609695435},{"id":"https://openalex.org/keywords/web-page","display_name":"Web page","score":0.3909541964530945}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8167037963867188},{"id":"https://openalex.org/C136764020","wikidata":"https://www.wikidata.org/wiki/Q466","display_name":"World Wide Web","level":1,"score":0.7669733762741089},{"id":"https://openalex.org/C13743948","wikidata":"https://www.wikidata.org/wiki/Q45842","display_name":"Web crawler","level":2,"score":0.6178495287895203},{"id":"https://openalex.org/C8797682","wikidata":"https://www.wikidata.org/wiki/Q2115","display_name":"XML","level":2,"score":0.5346651673316956},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.5145549178123474},{"id":"https://openalex.org/C2129575","wikidata":"https://www.wikidata.org/wiki/Q54837","display_name":"Semantic Web","level":2,"score":0.4531833827495575},{"id":"https://openalex.org/C147497476","wikidata":"https://www.wikidata.org/wiki/Q54872","display_name":"RDF","level":3,"score":0.4287964701652527},{"id":"https://openalex.org/C182321512","wikidata":"https://www.wikidata.org/wiki/Q1153289","display_name":"Web standards","level":3,"score":0.41120463609695435},{"id":"https://openalex.org/C21959979","wikidata":"https://www.wikidata.org/wiki/Q36774","display_name":"Web page","level":2,"score":0.3909541964530945}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1145/1871437.1871773","is_oa":false,"landing_page_url":"https://doi.org/10.1145/1871437.1871773","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 19th ACM international conference on Information and knowledge management","raw_type":"proceedings-article"},{"id":"pmh:oai:CiteSeerX.psu:10.1.1.714.6105","is_oa":false,"landing_page_url":"http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.714.6105","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"http://monica-marrero.com/files/paperCIKMcrawling.pdf","raw_type":"text"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":5,"referenced_works":["https://openalex.org/W238053929","https://openalex.org/W1519066168","https://openalex.org/W2008427173","https://openalex.org/W2054173953","https://openalex.org/W2147519594"],"related_works":["https://openalex.org/W2389761961","https://openalex.org/W2963706618","https://openalex.org/W2113184419","https://openalex.org/W1999548128","https://openalex.org/W4254300012","https://openalex.org/W2102475112","https://openalex.org/W2776293731","https://openalex.org/W2610919777","https://openalex.org/W2965230088","https://openalex.org/W1534806717"],"abstract_inverted_index":{"Structured":[0],"Information":[1],"Retrieval":[2],"is":[3,17,59,65],"gaining":[4],"a":[5,109],"lot":[6],"of":[7,15,30,56,71,78,118,135,144,152,155],"interest":[8],"in":[9,101],"recent":[10],"years,":[11],"as":[12,26,158],"this":[13],"kind":[14,117,143],"information":[16],"becoming":[18],"an":[19],"invaluable":[20],"asset":[21],"for":[22,76,115,162,168],"professional":[23],"communities":[24],"such":[25,157],"Software":[27],"Engineering.":[28],"Most":[29],"the":[31,54,60,63,140],"research":[32],"has":[33],"focused":[34,49,112],"on":[35,50],"XML":[36,57],"documents,":[37,120,156],"with":[38,67,124],"initiatives":[39],"like":[40,92,104],"INEX":[41],"to":[42,127,131],"bring":[43],"together":[44],"and":[45,94,111,121],"evaluate":[46],"new":[47],"techniques":[48],"structured":[51,72,119,137],"information.":[52],"Despite":[53],"use":[55],"documents":[58,82,138],"immediate":[61],"choice,":[62],"Web":[64,89],"filled":[66],"several":[68],"other":[69,79,153],"types":[70,154],"information,":[73],"which":[74],"account":[75],"millions":[77],"documents.":[80],"These":[81],"may":[83],"be":[84],"collected":[85],"directly":[86],"using":[87],"standard":[88],"search":[90,99,164],"engines":[91,165],"Google":[93],"Yahoo,":[95],"or":[96,166],"following":[97],"specific":[98],"patterns":[100],"online":[102],"repositories":[103],"SourceForge.":[105],"This":[106,142],"demo":[107],"describes":[108],"distributed":[110],"web":[113],"crawler":[114],"any":[116],"we":[122],"show":[123],"it":[125],"how":[126],"exploit":[128],"general-purpose":[129],"resources":[130],"gather":[132],"large":[133,149],"amounts":[134],"real-world":[136],"off":[139],"Web.":[141],"tool":[145],"could":[146],"help":[147],"building":[148],"test":[150],"collections":[151],"Java":[159],"source":[160],"code":[161],"software-oriented":[163],"RDF":[167],"semantic":[169],"searching.":[170]},"counts_by_year":[{"year":2019,"cited_by_count":1},{"year":2013,"cited_by_count":1}],"updated_date":"2026-04-04T16:13:02.066488","created_date":"2025-10-10T00:00:00"}
