{"id":"https://openalex.org/W2167138081","doi":"https://doi.org/10.1145/1281192.1281288","title":"Webpage understanding","display_name":"Webpage understanding","publication_year":2007,"publication_date":"2007-08-12","ids":{"openalex":"https://openalex.org/W2167138081","doi":"https://doi.org/10.1145/1281192.1281288","mag":"2167138081"},"language":"en","primary_location":{"id":"doi:10.1145/1281192.1281288","is_oa":false,"landing_page_url":"https://doi.org/10.1145/1281192.1281288","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 13th ACM SIGKDD international conference on Knowledge discovery and data mining","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100606995","display_name":"Jun Zhu","orcid":"https://orcid.org/0000-0002-6254-2388"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Jun Zhu","raw_affiliation_strings":["Tsinghua University"],"affiliations":[{"raw_affiliation_string":"Tsinghua University","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100335187","display_name":"Bo Zhang","orcid":"https://orcid.org/0000-0002-9958-6181"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Bo Zhang","raw_affiliation_strings":["Tsinghua University"],"affiliations":[{"raw_affiliation_string":"Tsinghua University","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5047496977","display_name":"Zaiqing Nie","orcid":"https://orcid.org/0000-0002-1134-2343"},"institutions":[{"id":"https://openalex.org/I4210113369","display_name":"Microsoft Research Asia (China)","ror":"https://ror.org/0300m5276","country_code":"CN","type":"company","lineage":["https://openalex.org/I1290206253","https://openalex.org/I4210113369"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zaiqing Nie","raw_affiliation_strings":["Microsoft Research Asia"],"affiliations":[{"raw_affiliation_string":"Microsoft Research Asia","institution_ids":["https://openalex.org/I4210113369"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5025631695","display_name":"Ji-Rong Wen","orcid":"https://orcid.org/0000-0002-9777-9676"},"institutions":[{"id":"https://openalex.org/I4210113369","display_name":"Microsoft Research Asia (China)","ror":"https://ror.org/0300m5276","country_code":"CN","type":"company","lineage":["https://openalex.org/I1290206253","https://openalex.org/I4210113369"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Ji-Rong Wen","raw_affiliation_strings":["Microsoft Research Asia"],"affiliations":[{"raw_affiliation_string":"Microsoft Research Asia","institution_ids":["https://openalex.org/I4210113369"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5109215896","display_name":"Hsiao-Wuen Hon","orcid":null},"institutions":[{"id":"https://openalex.org/I4210113369","display_name":"Microsoft Research Asia (China)","ror":"https://ror.org/0300m5276","country_code":"CN","type":"company","lineage":["https://openalex.org/I1290206253","https://openalex.org/I4210113369"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Hsiao-Wuen Hon","raw_affiliation_strings":["Microsoft Research Asia"],"affiliations":[{"raw_affiliation_string":"Microsoft Research Asia","institution_ids":["https://openalex.org/I4210113369"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5100606995"],"corresponding_institution_ids":["https://openalex.org/I99065089"],"apc_list":null,"apc_paid":null,"fwci":16.6153,"has_fulltext":false,"cited_by_count":42,"citation_normalized_percentile":{"value":0.98894445,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":89,"max":97},"biblio":{"volume":null,"issue":null,"first_page":"903","last_page":"912"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12016","display_name":"Web Data Mining and Analysis","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12016","display_name":"Web Data Mining and Analysis","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11241","display_name":"Advanced Malware Detection Techniques","score":0.9775999784469604,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11478","display_name":"Caching and Content Delivery","score":0.9761999845504761,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/web-page","display_name":"Web page","score":0.8428723812103271},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8289927244186401},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.6921886205673218},{"id":"https://openalex.org/keywords/discriminative-model","display_name":"Discriminative model","score":0.6060804724693298},{"id":"https://openalex.org/keywords/tree","display_name":"Tree (set theory)","score":0.49106574058532715},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.46080824732780457},{"id":"https://openalex.org/keywords/html-element","display_name":"HTML element","score":0.4339718818664551},{"id":"https://openalex.org/keywords/world-wide-web","display_name":"World Wide Web","score":0.37093767523765564},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.3669086694717407}],"concepts":[{"id":"https://openalex.org/C21959979","wikidata":"https://www.wikidata.org/wiki/Q36774","display_name":"Web page","level":2,"score":0.8428723812103271},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8289927244186401},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.6921886205673218},{"id":"https://openalex.org/C97931131","wikidata":"https://www.wikidata.org/wiki/Q5282087","display_name":"Discriminative model","level":2,"score":0.6060804724693298},{"id":"https://openalex.org/C113174947","wikidata":"https://www.wikidata.org/wiki/Q2859736","display_name":"Tree (set theory)","level":2,"score":0.49106574058532715},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.46080824732780457},{"id":"https://openalex.org/C81639021","wikidata":"https://www.wikidata.org/wiki/Q179551","display_name":"HTML element","level":3,"score":0.4339718818664551},{"id":"https://openalex.org/C136764020","wikidata":"https://www.wikidata.org/wiki/Q466","display_name":"World Wide Web","level":1,"score":0.37093767523765564},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3669086694717407},{"id":"https://openalex.org/C134306372","wikidata":"https://www.wikidata.org/wiki/Q7754","display_name":"Mathematical analysis","level":1,"score":0.0},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/1281192.1281288","is_oa":false,"landing_page_url":"https://doi.org/10.1145/1281192.1281288","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 13th ACM SIGKDD international conference on Knowledge discovery and data mining","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"score":0.6600000262260437,"id":"https://metadata.un.org/sdg/10","display_name":"Reduced inequalities"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":30,"referenced_works":["https://openalex.org/W111380827","https://openalex.org/W1553229631","https://openalex.org/W1574901103","https://openalex.org/W1973483159","https://openalex.org/W1976574743","https://openalex.org/W1978833300","https://openalex.org/W1996956037","https://openalex.org/W2021584966","https://openalex.org/W2029873015","https://openalex.org/W2048468185","https://openalex.org/W2088600132","https://openalex.org/W2096496923","https://openalex.org/W2104086170","https://openalex.org/W2113002560","https://openalex.org/W2115770258","https://openalex.org/W2128341918","https://openalex.org/W2135479443","https://openalex.org/W2137096228","https://openalex.org/W2140636749","https://openalex.org/W2143309843","https://openalex.org/W2147880316","https://openalex.org/W2150721933","https://openalex.org/W2158188757","https://openalex.org/W2158941357","https://openalex.org/W2160196229","https://openalex.org/W2161029186","https://openalex.org/W2162340487","https://openalex.org/W2166407869","https://openalex.org/W4233527139","https://openalex.org/W4285719527"],"related_works":["https://openalex.org/W1656419755","https://openalex.org/W1539890081","https://openalex.org/W2373402338","https://openalex.org/W2114620981","https://openalex.org/W4302086745","https://openalex.org/W2626998250","https://openalex.org/W2411679502","https://openalex.org/W2039658447","https://openalex.org/W1987716395","https://openalex.org/W1977746397"],"abstract_inverted_index":{"Recent":[0],"work":[1],"has":[2],"shown":[3],"the":[4,26,96,100,172],"effectiveness":[5],"of":[6,103,117,149,162,176],"leveraging":[7],"layout":[8,75],"and":[9,15,24,48,76,94,99,128,153,174],"tag-tree":[10,77],"structure":[11,78,98,119,139,152],"for":[12],"segmenting":[13],"webpages":[14],"labeling":[16],"HTML":[17,30],"elements.":[18],"However,":[19],"how":[20,72],"to":[21,73,83,92,123,158],"effectively":[22],"segment":[23,93],"label":[25,95],"text":[27,39,46,86,101,125,132,154],"contents":[28,40,87],"inside":[29],"elements":[31],"is":[32],"still":[33],"an":[34,159],"open":[35],"problem.":[36],"Since":[37],"many":[38],"on":[41,88,167],"a":[42,80,104,107],"webpage":[43,105,163],"are":[44,62],"often":[45],"fragments":[47],"not":[49],"strictly":[50],"grammatical,":[51],"traditional":[52],"natural":[53],"language":[54],"processing":[55],"techniques,":[56],"that":[57],"typically":[58],"expect":[59],"grammatical":[60],"sentences,":[61],"no":[63],"longer":[64],"directly":[65],"applicable.":[66],"In":[67,112],"this":[68,113],"paper,":[69],"we":[70],"examine":[71],"use":[74],"in":[79,106,137],"principled":[81],"way":[82],"help":[84,124],"understand":[85],"webpages.":[89],"We":[90],"propose":[91],"page":[97,118,138,151],"content":[102,126,155],"joint":[108],"discriminative":[109],"probabilistic":[110],"model.":[111],"model,":[114],"semantic":[115,129],"labels":[116,130],"can":[120,134],"be":[121,135],"leveraged":[122],"understanding,":[127],"ofthe":[131],"phrases":[133],"used":[136],"understanding":[140,156],"tasks":[141],"such":[142],"as":[143],"data":[144],"record":[145],"detection.":[146],"Thus,":[147],"integration":[148],"both":[150],"leads":[157],"integrated":[160],"solution":[161],"understanding.":[164],"Experimental":[165],"results":[166],"research":[168],"homepage":[169],"extraction":[170],"show":[171],"feasibility":[173],"promise":[175],"our":[177],"approach.":[178]},"counts_by_year":[{"year":2025,"cited_by_count":1},{"year":2024,"cited_by_count":1},{"year":2023,"cited_by_count":1},{"year":2021,"cited_by_count":2},{"year":2019,"cited_by_count":1},{"year":2018,"cited_by_count":2},{"year":2016,"cited_by_count":1},{"year":2015,"cited_by_count":1},{"year":2014,"cited_by_count":1},{"year":2013,"cited_by_count":1},{"year":2012,"cited_by_count":3}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
