{"id":"https://openalex.org/W165838864","doi":"https://doi.org/10.3233/ica-2008-15403","title":"Detecting data records in semi-structured web sites based on text token clustering","display_name":"Detecting data records in semi-structured web sites based on text token clustering","publication_year":2008,"publication_date":"2008-07-31","ids":{"openalex":"https://openalex.org/W165838864","doi":"https://doi.org/10.3233/ica-2008-15403","mag":"165838864"},"language":"en","primary_location":{"id":"doi:10.3233/ica-2008-15403","is_oa":false,"landing_page_url":"https://doi.org/10.3233/ica-2008-15403","pdf_url":null,"source":{"id":"https://openalex.org/S107631664","display_name":"Integrated Computer-Aided Engineering","issn_l":"1069-2509","issn":["1069-2509","1875-8835"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310318577","host_organization_name":"IOS Press","host_organization_lineage":["https://openalex.org/P4310318577"],"host_organization_lineage_names":["IOS Press"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Integrated Computer-Aided Engineering","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5068382420","display_name":"Xiaoying Gao","orcid":"https://orcid.org/0000-0002-6326-7947"},"institutions":[{"id":"https://openalex.org/I41156924","display_name":"Victoria University of Wellington","ror":"https://ror.org/0040r6f76","country_code":"NZ","type":"education","lineage":["https://openalex.org/I41156924"]}],"countries":["NZ"],"is_corresponding":true,"raw_author_name":"Xiaoying Gao","raw_affiliation_strings":["School of Mathematics, Statistics and Computer Science, Victoria University of Wellington, PO Box 600, Wellington, New Zealand","School of Mathematics, Statistics, and Computer Science, Victoria University of Wellington, PO Box 600, Wellington, New Zealand"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"School of Mathematics, Statistics and Computer Science, Victoria University of Wellington, PO Box 600, Wellington, New Zealand","institution_ids":["https://openalex.org/I41156924"]},{"raw_affiliation_string":"School of Mathematics, Statistics, and Computer Science, Victoria University of Wellington, PO Box 600, Wellington, New Zealand","institution_ids":["https://openalex.org/I41156924"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5019454273","display_name":"Le Phong Bao Vuong","orcid":null},"institutions":[{"id":"https://openalex.org/I41156924","display_name":"Victoria University of Wellington","ror":"https://ror.org/0040r6f76","country_code":"NZ","type":"education","lineage":["https://openalex.org/I41156924"]}],"countries":["NZ"],"is_corresponding":false,"raw_author_name":"Le Phong Bao Vuong","raw_affiliation_strings":["School of Mathematics, Statistics and Computer Science, Victoria University of Wellington, PO Box 600, Wellington, New Zealand","School of Mathematics, Statistics, and Computer Science, Victoria University of Wellington, PO Box 600, Wellington, New Zealand"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"School of Mathematics, Statistics and Computer Science, Victoria University of Wellington, PO Box 600, Wellington, New Zealand","institution_ids":["https://openalex.org/I41156924"]},{"raw_affiliation_string":"School of Mathematics, Statistics, and Computer Science, Victoria University of Wellington, PO Box 600, Wellington, New Zealand","institution_ids":["https://openalex.org/I41156924"]}]},{"author_position":"last","author":{"id":null,"display_name":"Mengjie Zhang","orcid":null},"institutions":[{"id":"https://openalex.org/I41156924","display_name":"Victoria University of Wellington","ror":"https://ror.org/0040r6f76","country_code":"NZ","type":"education","lineage":["https://openalex.org/I41156924"]}],"countries":["NZ"],"is_corresponding":false,"raw_author_name":"Mengjie Zhang","raw_affiliation_strings":["School of Mathematics, Statistics and Computer Science, Victoria University of Wellington, PO Box 600, Wellington, New Zealand","(Correspd. Tel.: +64 4 4635654; Fax: +64 4 4635045; E-mail: mengjie.zhang@mcs.vuw.ac.nz) School of Mathematics, Statistics and Computer Science, Victoria University of Wellington, PO Box 600, Well ...#TAB#"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"School of Mathematics, Statistics and Computer Science, Victoria University of Wellington, PO Box 600, Wellington, New Zealand","institution_ids":["https://openalex.org/I41156924"]},{"raw_affiliation_string":"(Correspd. Tel.: +64 4 4635654; Fax: +64 4 4635045; E-mail: mengjie.zhang@mcs.vuw.ac.nz) School of Mathematics, Statistics and Computer Science, Victoria University of Wellington, PO Box 600, Well ...#TAB#","institution_ids":["https://openalex.org/I41156924"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5068382420"],"corresponding_institution_ids":["https://openalex.org/I41156924"],"apc_list":null,"apc_paid":null,"fwci":7.0012,"has_fulltext":false,"cited_by_count":14,"citation_normalized_percentile":{"value":0.96397611,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":90,"max":96},"biblio":{"volume":"15","issue":"4","first_page":"297","last_page":"311"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12016","display_name":"Web Data Mining and Analysis","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12016","display_name":"Web Data Mining and Analysis","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11478","display_name":"Caching and Content Delivery","score":0.9886999726295471,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11644","display_name":"Spam and Phishing Detection","score":0.9853000044822693,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.810123860836029},{"id":"https://openalex.org/keywords/cluster-analysis","display_name":"Cluster analysis","score":0.6958938837051392},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.5690780282020569},{"id":"https://openalex.org/keywords/xml","display_name":"XML","score":0.5412774085998535},{"id":"https://openalex.org/keywords/web-page","display_name":"Web page","score":0.5343676209449768},{"id":"https://openalex.org/keywords/semi-structured-data","display_name":"Semi-structured data","score":0.5293577313423157},{"id":"https://openalex.org/keywords/string","display_name":"String (physics)","score":0.5240935683250427},{"id":"https://openalex.org/keywords/security-token","display_name":"Security token","score":0.5112142562866211},{"id":"https://openalex.org/keywords/data-mining","display_name":"Data mining","score":0.5087299942970276},{"id":"https://openalex.org/keywords/hierarchical-clustering","display_name":"Hierarchical clustering","score":0.4684242010116577},{"id":"https://openalex.org/keywords/process","display_name":"Process (computing)","score":0.4368148744106293},{"id":"https://openalex.org/keywords/html-element","display_name":"HTML element","score":0.42959344387054443},{"id":"https://openalex.org/keywords/document-clustering","display_name":"Document clustering","score":0.42534390091896057},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.2508828639984131},{"id":"https://openalex.org/keywords/relational-database","display_name":"Relational database","score":0.22186842560768127},{"id":"https://openalex.org/keywords/world-wide-web","display_name":"World Wide Web","score":0.17602497339248657}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.810123860836029},{"id":"https://openalex.org/C73555534","wikidata":"https://www.wikidata.org/wiki/Q622825","display_name":"Cluster analysis","level":2,"score":0.6958938837051392},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.5690780282020569},{"id":"https://openalex.org/C8797682","wikidata":"https://www.wikidata.org/wiki/Q2115","display_name":"XML","level":2,"score":0.5412774085998535},{"id":"https://openalex.org/C21959979","wikidata":"https://www.wikidata.org/wiki/Q36774","display_name":"Web page","level":2,"score":0.5343676209449768},{"id":"https://openalex.org/C40077939","wikidata":"https://www.wikidata.org/wiki/Q2336004","display_name":"Semi-structured data","level":3,"score":0.5293577313423157},{"id":"https://openalex.org/C157486923","wikidata":"https://www.wikidata.org/wiki/Q1376436","display_name":"String (physics)","level":2,"score":0.5240935683250427},{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.5112142562866211},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.5087299942970276},{"id":"https://openalex.org/C92835128","wikidata":"https://www.wikidata.org/wiki/Q1277447","display_name":"Hierarchical clustering","level":3,"score":0.4684242010116577},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.4368148744106293},{"id":"https://openalex.org/C81639021","wikidata":"https://www.wikidata.org/wiki/Q179551","display_name":"HTML element","level":3,"score":0.42959344387054443},{"id":"https://openalex.org/C177937566","wikidata":"https://www.wikidata.org/wiki/Q4223102","display_name":"Document clustering","level":3,"score":0.42534390091896057},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.2508828639984131},{"id":"https://openalex.org/C5655090","wikidata":"https://www.wikidata.org/wiki/Q192588","display_name":"Relational database","level":2,"score":0.22186842560768127},{"id":"https://openalex.org/C136764020","wikidata":"https://www.wikidata.org/wiki/Q466","display_name":"World Wide Web","level":1,"score":0.17602497339248657},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0},{"id":"https://openalex.org/C38652104","wikidata":"https://www.wikidata.org/wiki/Q3510521","display_name":"Computer security","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.3233/ica-2008-15403","is_oa":false,"landing_page_url":"https://doi.org/10.3233/ica-2008-15403","pdf_url":null,"source":{"id":"https://openalex.org/S107631664","display_name":"Integrated Computer-Aided Engineering","issn_l":"1069-2509","issn":["1069-2509","1875-8835"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310318577","host_organization_name":"IOS Press","host_organization_lineage":["https://openalex.org/P4310318577"],"host_organization_lineage_names":["IOS Press"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Integrated Computer-Aided Engineering","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":29,"referenced_works":["https://openalex.org/W179050280","https://openalex.org/W1487074248","https://openalex.org/W1524704912","https://openalex.org/W1531594516","https://openalex.org/W1533793522","https://openalex.org/W1553019137","https://openalex.org/W1570448133","https://openalex.org/W1691409177","https://openalex.org/W1723485573","https://openalex.org/W1956559956","https://openalex.org/W1992419399","https://openalex.org/W2005646337","https://openalex.org/W2015551056","https://openalex.org/W2022158118","https://openalex.org/W2087064593","https://openalex.org/W2095185981","https://openalex.org/W2104086170","https://openalex.org/W2119088569","https://openalex.org/W2129595335","https://openalex.org/W2134150392","https://openalex.org/W2143309843","https://openalex.org/W2153072229","https://openalex.org/W2153752143","https://openalex.org/W2155879015","https://openalex.org/W2162290558","https://openalex.org/W2165612380","https://openalex.org/W2421105961","https://openalex.org/W2966207845","https://openalex.org/W2999729612"],"related_works":["https://openalex.org/W4237492828","https://openalex.org/W78181647","https://openalex.org/W2130194910","https://openalex.org/W2189374779","https://openalex.org/W2605148547","https://openalex.org/W2016788389","https://openalex.org/W4241277841","https://openalex.org/W89027898","https://openalex.org/W2114818997","https://openalex.org/W165838864"],"abstract_inverted_index":{"This":[0],"paper":[1],"describes":[2],"a":[3,55,93],"new":[4,94,119,144,171],"approach":[5,38,120,145,172],"to":[6,32,83,96,100,116],"the":[7,40,68,71,77,102,143,167,170,180,187],"use":[8],"of":[9,70,105,136],"clustering":[10,47],"for":[11,111,148],"automatic":[12],"data":[13,72,78,149,189],"detection":[14,151,190],"in":[15],"semi-structured":[16],"web":[17,22,35,137,163],"pages.":[18,51,138],"Unlike":[19],"most":[20],"exiting":[21],"information":[23],"extraction":[24],"approaches":[25,159],"that":[26,142,153],"usually":[27],"apply":[28],"wrapper":[29,129],"induction":[30,130,183],"techniques":[31,48],"manually":[33],"labelled":[34,113],"pages,":[36],"this":[37,53],"avoids":[39,179],"pattern":[41,182],"inductio":[42],"n":[43],"process":[44,191],"by":[45],"using":[46],"on":[49,132,161],"unlabelled":[50],"In":[52],"approach,":[54],"variant":[56],"Hierarchical":[57],"Agglomerative":[58],"Clustering":[59],"(HAC)":[60],"algorithm":[61,110],"called":[62],"K-neighbours-HAC":[63],"is":[64,121,146,192],"developed":[65],"which":[66],"uses":[67],"similarities":[69],"format":[73],"(HTML":[74],"tags)":[75],"and":[76,108,123,152,177,185],"content":[79],"(text":[80],"string":[81],"values)":[82],"group":[84],"similar":[85],"text":[86,98,114],"tokens":[87,99,115],"into":[88],"clusters.":[89],"We":[90],"also":[91],"develop":[92],"method":[95],"label":[97],"capture":[101],"hierarchical":[103],"structure":[104],"HTML":[106],"pages":[107],"an":[109],"mapping":[112],"XML.":[117],"The":[118,139],"tested":[122],"compared":[124],"with":[125,166],"several":[126],"common":[127,157],"existing":[128,158,168],"systems":[131],"three":[133],"different":[134],"sets":[135],"results":[140],"suggest":[141],"effective":[147],"record":[150],"it":[154],"outperforms":[155],"these":[156,162],"examined":[160],"sites.":[164],"Compared":[165],"approaches,":[169],"does":[173],"not":[174],"require":[175],"training":[176],"successfully":[178],"explicit":[181],"process,":[184],"accordingly":[186],"entire":[188],"simpler.":[193]},"counts_by_year":[{"year":2015,"cited_by_count":2},{"year":2014,"cited_by_count":1},{"year":2013,"cited_by_count":1},{"year":2012,"cited_by_count":2}],"updated_date":"2026-05-03T06:03:33.228499","created_date":"2025-10-10T00:00:00"}
