{"id":"https://openalex.org/W2326804586","doi":"https://doi.org/10.1109/tsc.2015.2414931","title":"SmartCrawler: A Two-Stage Crawler for Efficiently Harvesting Deep-Web Interfaces","display_name":"SmartCrawler: A Two-Stage Crawler for Efficiently Harvesting Deep-Web Interfaces","publication_year":2015,"publication_date":"2015-03-20","ids":{"openalex":"https://openalex.org/W2326804586","doi":"https://doi.org/10.1109/tsc.2015.2414931","mag":"2326804586"},"language":"en","primary_location":{"id":"doi:10.1109/tsc.2015.2414931","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tsc.2015.2414931","pdf_url":null,"source":{"id":"https://openalex.org/S204223317","display_name":"IEEE Transactions on Services Computing","issn_l":"1939-1374","issn":["1939-1374","2372-0204"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Services Computing","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5073659320","display_name":"Feng Zhao","orcid":"https://orcid.org/0000-0001-7205-3302"},"institutions":[{"id":"https://openalex.org/I47720641","display_name":"Huazhong University of Science and Technology","ror":"https://ror.org/00p991c53","country_code":"CN","type":"education","lineage":["https://openalex.org/I47720641"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Feng Zhao","raw_affiliation_strings":["Services Computing Technology and System Lab & Cluster and Grid Computing Lab, School of Computer Science and Technology, Huazhong University of Science and Technology, Wuhan, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Services Computing Technology and System Lab & Cluster and Grid Computing Lab, School of Computer Science and Technology, Huazhong University of Science and Technology, Wuhan, China","institution_ids":["https://openalex.org/I47720641"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101605477","display_name":"Jingyu Zhou","orcid":"https://orcid.org/0000-0003-1258-7243"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jingyu Zhou","raw_affiliation_strings":["Department of Computer Science and Engineering, Shanghai Jiao Tong University, Shanghai, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Department of Computer Science and Engineering, Shanghai Jiao Tong University, Shanghai, China","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5080056008","display_name":"Chang Nie","orcid":null},"institutions":[{"id":"https://openalex.org/I47720641","display_name":"Huazhong University of Science and Technology","ror":"https://ror.org/00p991c53","country_code":"CN","type":"education","lineage":["https://openalex.org/I47720641"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Chang Nie","raw_affiliation_strings":["Services Computing Technology and System Lab & Cluster and Grid Computing Lab, School of Computer Science and Technology, Huazhong University of Science and Technology, Wuhan, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Services Computing Technology and System Lab & Cluster and Grid Computing Lab, School of Computer Science and Technology, Huazhong University of Science and Technology, Wuhan, China","institution_ids":["https://openalex.org/I47720641"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101650393","display_name":"Heqing Huang","orcid":"https://orcid.org/0000-0001-8113-3531"},"institutions":[{"id":"https://openalex.org/I130769515","display_name":"Pennsylvania State University","ror":"https://ror.org/04p491231","country_code":"US","type":"education","lineage":["https://openalex.org/I130769515"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Heqing Huang","raw_affiliation_strings":["Department of Computer Science and Engineering, The Pennsylvania State University, University Park, PA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Department of Computer Science and Engineering, The Pennsylvania State University, University Park, PA","institution_ids":["https://openalex.org/I130769515"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5022262922","display_name":"Hai Jin","orcid":"https://orcid.org/0000-0002-3934-7605"},"institutions":[{"id":"https://openalex.org/I47720641","display_name":"Huazhong University of Science and Technology","ror":"https://ror.org/00p991c53","country_code":"CN","type":"education","lineage":["https://openalex.org/I47720641"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Hai Jin","raw_affiliation_strings":["Services Computing Technology and System Lab & Cluster and Grid Computing Lab, School of Computer Science and Technology, Huazhong University of Science and Technology, Wuhan, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Services Computing Technology and System Lab & Cluster and Grid Computing Lab, School of Computer Science and Technology, Huazhong University of Science and Technology, Wuhan, China","institution_ids":["https://openalex.org/I47720641"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":5,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":25.5617,"has_fulltext":false,"cited_by_count":68,"citation_normalized_percentile":{"value":0.99422954,"is_in_top_1_percent":true,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":90,"max":99},"biblio":{"volume":"9","issue":"4","first_page":"608","last_page":"620"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12016","display_name":"Web Data Mining and Analysis","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12016","display_name":"Web Data Mining and Analysis","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11478","display_name":"Caching and Content Delivery","score":0.9904000163078308,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11644","display_name":"Spam and Phishing Detection","score":0.9864000082015991,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/web-crawler","display_name":"Web crawler","score":0.8953890800476074},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8672234416007996},{"id":"https://openalex.org/keywords/ranking","display_name":"Ranking (information retrieval)","score":0.5584965348243713},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.5386497974395752},{"id":"https://openalex.org/keywords/web-page","display_name":"Web page","score":0.5125848650932312},{"id":"https://openalex.org/keywords/deep-web","display_name":"Deep Web","score":0.48618215322494507},{"id":"https://openalex.org/keywords/world-wide-web","display_name":"World Wide Web","score":0.47768837213516235},{"id":"https://openalex.org/keywords/pace","display_name":"Pace","score":0.4466714859008789},{"id":"https://openalex.org/keywords/result-set","display_name":"Result set","score":0.43669313192367554},{"id":"https://openalex.org/keywords/web-resource","display_name":"Web resource","score":0.43112125992774963},{"id":"https://openalex.org/keywords/focused-crawler","display_name":"Focused crawler","score":0.4287911057472229},{"id":"https://openalex.org/keywords/data-mining","display_name":"Data mining","score":0.4212648868560791},{"id":"https://openalex.org/keywords/set","display_name":"Set (abstract data type)","score":0.40896928310394287},{"id":"https://openalex.org/keywords/web-navigation","display_name":"Web navigation","score":0.33900225162506104},{"id":"https://openalex.org/keywords/the-internet","display_name":"The Internet","score":0.26957249641418457},{"id":"https://openalex.org/keywords/static-web-page","display_name":"Static web page","score":0.22135281562805176}],"concepts":[{"id":"https://openalex.org/C13743948","wikidata":"https://www.wikidata.org/wiki/Q45842","display_name":"Web crawler","level":2,"score":0.8953890800476074},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8672234416007996},{"id":"https://openalex.org/C189430467","wikidata":"https://www.wikidata.org/wiki/Q7293293","display_name":"Ranking (information retrieval)","level":2,"score":0.5584965348243713},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.5386497974395752},{"id":"https://openalex.org/C21959979","wikidata":"https://www.wikidata.org/wiki/Q36774","display_name":"Web page","level":2,"score":0.5125848650932312},{"id":"https://openalex.org/C46721378","wikidata":"https://www.wikidata.org/wiki/Q221989","display_name":"Deep Web","level":3,"score":0.48618215322494507},{"id":"https://openalex.org/C136764020","wikidata":"https://www.wikidata.org/wiki/Q466","display_name":"World Wide Web","level":1,"score":0.47768837213516235},{"id":"https://openalex.org/C2777526511","wikidata":"https://www.wikidata.org/wiki/Q691543","display_name":"Pace","level":2,"score":0.4466714859008789},{"id":"https://openalex.org/C4969071","wikidata":"https://www.wikidata.org/wiki/Q7316353","display_name":"Result set","level":3,"score":0.43669313192367554},{"id":"https://openalex.org/C65603577","wikidata":"https://www.wikidata.org/wiki/Q3427877","display_name":"Web resource","level":2,"score":0.43112125992774963},{"id":"https://openalex.org/C73340581","wikidata":"https://www.wikidata.org/wiki/Q5463958","display_name":"Focused crawler","level":5,"score":0.4287911057472229},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.4212648868560791},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.40896928310394287},{"id":"https://openalex.org/C61096286","wikidata":"https://www.wikidata.org/wiki/Q7978592","display_name":"Web navigation","level":3,"score":0.33900225162506104},{"id":"https://openalex.org/C110875604","wikidata":"https://www.wikidata.org/wiki/Q75","display_name":"The Internet","level":2,"score":0.26957249641418457},{"id":"https://openalex.org/C173576120","wikidata":"https://www.wikidata.org/wiki/Q2641220","display_name":"Static web page","level":4,"score":0.22135281562805176},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.0},{"id":"https://openalex.org/C205649164","wikidata":"https://www.wikidata.org/wiki/Q1071","display_name":"Geography","level":0,"score":0.0},{"id":"https://openalex.org/C13280743","wikidata":"https://www.wikidata.org/wiki/Q131089","display_name":"Geodesy","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/tsc.2015.2414931","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tsc.2015.2414931","pdf_url":null,"source":{"id":"https://openalex.org/S204223317","display_name":"IEEE Transactions on Services Computing","issn_l":"1939-1374","issn":["1939-1374","2372-0204"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Services Computing","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/9","display_name":"Industry, innovation and infrastructure","score":0.44999998807907104}],"awards":[{"id":"https://openalex.org/G7168593232","display_name":"\u57fa\u4e8e\u8bed\u4e49\u8ba1\u7b97\u7684\u6d77\u91cfDeep Web\u77e5\u8bc6\u63a2\u7d22\u673a\u5236\u7814\u7a76","funder_award_id":"61272411","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"}],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":42,"referenced_works":["https://openalex.org/W291262356","https://openalex.org/W343945789","https://openalex.org/W1489992655","https://openalex.org/W1498286998","https://openalex.org/W1570649210","https://openalex.org/W1969831559","https://openalex.org/W1976974489","https://openalex.org/W1982769469","https://openalex.org/W1987746047","https://openalex.org/W2006784356","https://openalex.org/W2017726337","https://openalex.org/W2019106661","https://openalex.org/W2019560524","https://openalex.org/W2040468900","https://openalex.org/W2057556626","https://openalex.org/W2063862666","https://openalex.org/W2094930182","https://openalex.org/W2110034127","https://openalex.org/W2112923539","https://openalex.org/W2115457429","https://openalex.org/W2117058208","https://openalex.org/W2130760429","https://openalex.org/W2131006463","https://openalex.org/W2133990480","https://openalex.org/W2139259611","https://openalex.org/W2145102654","https://openalex.org/W2148738951","https://openalex.org/W2151306141","https://openalex.org/W2164896748","https://openalex.org/W2166763895","https://openalex.org/W2168826931","https://openalex.org/W2170188121","https://openalex.org/W2170971772","https://openalex.org/W2911388033","https://openalex.org/W2963639300","https://openalex.org/W6610375964","https://openalex.org/W6611560470","https://openalex.org/W6644511443","https://openalex.org/W6676595339","https://openalex.org/W6681291871","https://openalex.org/W6682597120","https://openalex.org/W6685116542"],"related_works":["https://openalex.org/W2015359389","https://openalex.org/W2390587322","https://openalex.org/W4233191086","https://openalex.org/W4243632275","https://openalex.org/W3108186202","https://openalex.org/W2183593492","https://openalex.org/W2214737927","https://openalex.org/W2367709753","https://openalex.org/W2384770049","https://openalex.org/W2352656474"],"abstract_inverted_index":{"As":[0],"deep":[1,36,58],"web":[2,29,59,135],"grows":[3],"at":[4],"a":[5,45,50,80,91,103,139,149,155],"very":[6],"fast":[7,112],"pace,":[8],"there":[9],"has":[10],"been":[11],"increased":[12],"interest":[13],"in":[14,133],"techniques":[15],"that":[16],"help":[17,74],"efficiently":[18,171],"locate":[19],"deep-web":[20,173],"interfaces.":[21,60],"However,":[22],"due":[23],"to":[24,97,144],"the":[25,32,62,73,107,161],"large":[26,81],"volume":[27],"of":[28,35,75,83,157,165],"resources":[30],"and":[31,41,163,178],"dynamic":[33],"nature":[34],"web,":[37],"achieving":[38],"wide":[39],"coverage":[40,147],"high":[42],"efficiency":[43],"is":[44],"challenging":[46],"issue.":[47],"We":[48],"propose":[49],"two-stage":[51],"framework,":[52,169],"namely":[53],"SmartCrawler,":[54],"for":[55,69,90,102,148],"efficient":[56],"harvesting":[57],"In":[61,106],"first":[63],"stage,":[64,109],"SmartCrawler":[65,94,110],"performs":[66],"site-based":[67],"searching":[68,114],"center":[70],"pages":[71],"with":[72,120],"search":[76],"engines,":[77],"avoiding":[78],"visiting":[79,128],"number":[82],"pages.":[84],"To":[85,124],"achieve":[86,145],"more":[87],"accurate":[88],"results":[89,153],"focused":[92],"crawl,":[93],"ranks":[95],"websites":[96],"prioritize":[98],"highly":[99,130],"relevant":[100,118,131],"ones":[101],"given":[104],"topic.":[105],"second":[108],"achieves":[111,179],"in-site":[113],"by":[115],"excavating":[116],"most":[117],"links":[119,132],"an":[121],"adaptive":[122],"link-ranking.":[123],"eliminate":[125],"bias":[126],"on":[127,154],"some":[129],"hidden":[134],"directories,":[136],"we":[137],"design":[138],"link":[140],"tree":[141],"data":[142],"structure":[143],"wider":[146],"website.":[150],"Our":[151],"experimental":[152],"set":[156],"representative":[158],"domains":[159],"show":[160],"agility":[162],"accuracy":[164],"our":[166],"proposed":[167],"crawler":[168],"which":[170],"retrieves":[172],"interfaces":[174],"from":[175],"large-scale":[176],"sites":[177],"higher":[180],"harvest":[181],"rates":[182],"than":[183],"other":[184],"crawlers.":[185]},"counts_by_year":[{"year":2025,"cited_by_count":1},{"year":2024,"cited_by_count":2},{"year":2023,"cited_by_count":4},{"year":2022,"cited_by_count":4},{"year":2021,"cited_by_count":14},{"year":2020,"cited_by_count":5},{"year":2019,"cited_by_count":7},{"year":2018,"cited_by_count":11},{"year":2017,"cited_by_count":11},{"year":2016,"cited_by_count":8},{"year":2015,"cited_by_count":1}],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-10T00:00:00"}
