{"id":"https://openalex.org/W2060851473","doi":"https://doi.org/10.1109/coginf.2010.5599733","title":"A cognitive crawler using structure pattern for incremental crawling and content extraction","display_name":"A cognitive crawler using structure pattern for incremental crawling and content extraction","publication_year":2010,"publication_date":"2010-07-01","ids":{"openalex":"https://openalex.org/W2060851473","doi":"https://doi.org/10.1109/coginf.2010.5599733","mag":"2060851473"},"language":"en","primary_location":{"id":"doi:10.1109/coginf.2010.5599733","is_oa":false,"landing_page_url":"https://doi.org/10.1109/coginf.2010.5599733","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"9th IEEE International Conference on Cognitive Informatics (ICCI'10)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5027299010","display_name":"Shijia Xi","orcid":null},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Shijia Xi","raw_affiliation_strings":["Tsinghua University, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Tsinghua University, Beijing, China","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5055546056","display_name":"Fuchun Sun","orcid":"https://orcid.org/0000-0003-3546-6305"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Fuchun Sun","raw_affiliation_strings":["Tsinghua University, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Tsinghua University, Beijing, China","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100373517","display_name":"Jianmin Wang","orcid":"https://orcid.org/0000-0001-6841-7943"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jianmin Wang","raw_affiliation_strings":["Tsinghua University, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Tsinghua University, Beijing, China","institution_ids":["https://openalex.org/I99065089"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5027299010"],"corresponding_institution_ids":["https://openalex.org/I99065089"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":2,"citation_normalized_percentile":{"value":0.15041619,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":90,"max":94},"biblio":{"volume":"8","issue":null,"first_page":"238","last_page":"244"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12016","display_name":"Web Data Mining and Analysis","score":0.9988999962806702,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12016","display_name":"Web Data Mining and Analysis","score":0.9988999962806702,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13976","display_name":"Web visibility and informetrics","score":0.9905999898910522,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11478","display_name":"Caching and Content Delivery","score":0.9233999848365784,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/crawling","display_name":"Crawling","score":0.9270063638687134},{"id":"https://openalex.org/keywords/web-crawler","display_name":"Web crawler","score":0.9261448979377747},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7953745126724243},{"id":"https://openalex.org/keywords/web-page","display_name":"Web page","score":0.6616818904876709},{"id":"https://openalex.org/keywords/cluster-analysis","display_name":"Cluster analysis","score":0.6053958535194397},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.5379875302314758},{"id":"https://openalex.org/keywords/data-mining","display_name":"Data mining","score":0.5060972571372986},{"id":"https://openalex.org/keywords/hierarchical-clustering","display_name":"Hierarchical clustering","score":0.44233113527297974},{"id":"https://openalex.org/keywords/focused-crawler","display_name":"Focused crawler","score":0.42465198040008545},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.311831533908844},{"id":"https://openalex.org/keywords/world-wide-web","display_name":"World Wide Web","score":0.21121180057525635},{"id":"https://openalex.org/keywords/static-web-page","display_name":"Static web page","score":0.15572622418403625},{"id":"https://openalex.org/keywords/web-navigation","display_name":"Web navigation","score":0.0964823067188263}],"concepts":[{"id":"https://openalex.org/C100368936","wikidata":"https://www.wikidata.org/wiki/Q1411725","display_name":"Crawling","level":2,"score":0.9270063638687134},{"id":"https://openalex.org/C13743948","wikidata":"https://www.wikidata.org/wiki/Q45842","display_name":"Web crawler","level":2,"score":0.9261448979377747},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7953745126724243},{"id":"https://openalex.org/C21959979","wikidata":"https://www.wikidata.org/wiki/Q36774","display_name":"Web page","level":2,"score":0.6616818904876709},{"id":"https://openalex.org/C73555534","wikidata":"https://www.wikidata.org/wiki/Q622825","display_name":"Cluster analysis","level":2,"score":0.6053958535194397},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.5379875302314758},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.5060972571372986},{"id":"https://openalex.org/C92835128","wikidata":"https://www.wikidata.org/wiki/Q1277447","display_name":"Hierarchical clustering","level":3,"score":0.44233113527297974},{"id":"https://openalex.org/C73340581","wikidata":"https://www.wikidata.org/wiki/Q5463958","display_name":"Focused crawler","level":5,"score":0.42465198040008545},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.311831533908844},{"id":"https://openalex.org/C136764020","wikidata":"https://www.wikidata.org/wiki/Q466","display_name":"World Wide Web","level":1,"score":0.21121180057525635},{"id":"https://openalex.org/C173576120","wikidata":"https://www.wikidata.org/wiki/Q2641220","display_name":"Static web page","level":4,"score":0.15572622418403625},{"id":"https://openalex.org/C61096286","wikidata":"https://www.wikidata.org/wiki/Q7978592","display_name":"Web navigation","level":3,"score":0.0964823067188263},{"id":"https://openalex.org/C71924100","wikidata":"https://www.wikidata.org/wiki/Q11190","display_name":"Medicine","level":0,"score":0.0},{"id":"https://openalex.org/C105702510","wikidata":"https://www.wikidata.org/wiki/Q514","display_name":"Anatomy","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/coginf.2010.5599733","is_oa":false,"landing_page_url":"https://doi.org/10.1109/coginf.2010.5599733","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"9th IEEE International Conference on Cognitive Informatics (ICCI'10)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":14,"referenced_works":["https://openalex.org/W1546842576","https://openalex.org/W1566984846","https://openalex.org/W1803802947","https://openalex.org/W1999548128","https://openalex.org/W2054729486","https://openalex.org/W2055950890","https://openalex.org/W2097704339","https://openalex.org/W2099126271","https://openalex.org/W2358428519","https://openalex.org/W4254300012","https://openalex.org/W6632830464","https://openalex.org/W6634031162","https://openalex.org/W6650306682","https://openalex.org/W6674623803"],"related_works":["https://openalex.org/W4385695127","https://openalex.org/W1506122440","https://openalex.org/W2026132847","https://openalex.org/W3160284424","https://openalex.org/W2783570127","https://openalex.org/W3179756121","https://openalex.org/W2772576376","https://openalex.org/W2783231802","https://openalex.org/W2051135816","https://openalex.org/W2899211245"],"abstract_inverted_index":{"In":[0,109],"this":[1,217],"paper,":[2],"we":[3,90,112,138,185,194],"design":[4],"a":[5,58,68,73,93,114],"cognitive":[6,218],"crawler":[7,54,149,219],"to":[8,35,55,75,100,125,143,167],"dramatically":[9],"reduce":[10],"the":[11,31,103,110,129,178,191,214],"website":[12],"crawling":[13,32,173,183],"cost":[14,33],"and":[15,41,81,95,121,160,172,213,223,231],"extract":[16,151,196],"useful":[17,152,212],"content":[18,197],"from":[19],"web":[20,155],"pages":[21,40,44,80,84,156,199],"in":[22,47,132],"an":[23,140,181],"unsupervised":[24,141],"procedure.":[25],"The":[26,203],"main":[27],"idea":[28],"of":[29,106,154,180,198,216,229,237],"reducing":[30],"is":[34,50,210,220,232],"retrieving":[36],"those":[37,77],"lately":[38,78],"modified":[39,62,79,104,170],"newly":[42,64,82],"added":[43,65,83],"only.":[45],"However,":[46],"reality,":[48],"it":[49,224],"impossible":[51],"for":[52,234],"traditional":[53],"judge":[56],"whether":[57],"page":[59],"has":[60],"been":[61],"or":[63],"without":[66,85],"doing":[67],"whole":[69],"crawling.":[70],"We":[71,162],"propose":[72],"method":[74,117],"predict":[76,168],"do":[86],"any":[87],"actual":[88],"crawling;":[89],"also":[91,163],"find":[92,127],"feasible":[94,165],"stable":[96],"feature":[97],"\"structure":[98],"pattern\"":[99],"better":[101],"indicates":[102],"probability":[105],"certain":[107,133],"page.":[108],"meanwhile,":[111],"develop":[113],"hybrid":[115],"clustering":[116,124],"combined":[118],"with":[119,200],"K-means":[120],"agglomerative":[122],"hierarchical":[123],"automatically":[126],"all":[128],"structure":[130,136,208],"patterns":[131],"website.":[134],"Using":[135,190],"pattern,":[137],"developed":[139],"algorithm":[142,192],"generate":[144],"website's":[145],"templates;":[146],"using":[147],"templates,":[148],"can":[150,225],"information":[153],"much":[157],"more":[158],"easily":[159],"precisely.":[161],"introduce":[164],"formulas":[166],"pages'":[169],"probabilities":[171],"time":[174],"intervals.":[175],"To":[176],"evaluate":[177],"performance":[179,215],"incremental":[182],"algorithm,":[184],"proposed":[186],"three":[187],"new":[188],"indicators.":[189],"proposed,":[193],"could":[195],"high":[201],"performance.":[202],"experimental":[204],"results":[205],"illustrate":[206],"that":[207],"pattern":[209],"very":[211],"quite":[221],"promising":[222],"save":[226],"huge":[227],"amount":[228],"bandwidth":[230],"qualified":[233],"different":[235],"websites":[236],"various":[238],"scales.":[239]},"counts_by_year":[{"year":2018,"cited_by_count":1},{"year":2017,"cited_by_count":1}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
