{"id":"https://openalex.org/W2895426549","doi":"https://doi.org/10.1145/3209280.3229115","title":"Main Content Detection in HTML Journal Articles","display_name":"Main Content Detection in HTML Journal Articles","publication_year":2018,"publication_date":"2018-08-28","ids":{"openalex":"https://openalex.org/W2895426549","doi":"https://doi.org/10.1145/3209280.3229115","mag":"2895426549"},"language":"en","primary_location":{"id":"doi:10.1145/3209280.3229115","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3209280.3229115","pdf_url":null,"source":null,"license":"public-domain","license_id":"https://openalex.org/licenses/public-domain","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the ACM Symposium on Document Engineering 2018","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5057176536","display_name":"Alastair R. Rae","orcid":"https://orcid.org/0000-0003-4675-0627"},"institutions":[{"id":"https://openalex.org/I2800548410","display_name":"United States National Library of Medicine","ror":"https://ror.org/0060t0j89","country_code":"US","type":"archive","lineage":["https://openalex.org/I1299022934","https://openalex.org/I1299303238","https://openalex.org/I2800548410"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Alastair R. Rae","raw_affiliation_strings":["National Library of Medicine, Bethesda, Maryland"],"affiliations":[{"raw_affiliation_string":"National Library of Medicine, Bethesda, Maryland","institution_ids":["https://openalex.org/I2800548410"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100639725","display_name":"Jongwoo Kim","orcid":"https://orcid.org/0000-0001-5656-4436"},"institutions":[{"id":"https://openalex.org/I2800548410","display_name":"United States National Library of Medicine","ror":"https://ror.org/0060t0j89","country_code":"US","type":"archive","lineage":["https://openalex.org/I1299022934","https://openalex.org/I1299303238","https://openalex.org/I2800548410"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Jongwoo Kim","raw_affiliation_strings":["National Library of Medicine, Bethesda, Maryland"],"affiliations":[{"raw_affiliation_string":"National Library of Medicine, Bethesda, Maryland","institution_ids":["https://openalex.org/I2800548410"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101509629","display_name":"Daniel Le","orcid":"https://orcid.org/0000-0002-0220-6366"},"institutions":[{"id":"https://openalex.org/I2800548410","display_name":"United States National Library of Medicine","ror":"https://ror.org/0060t0j89","country_code":"US","type":"archive","lineage":["https://openalex.org/I1299022934","https://openalex.org/I1299303238","https://openalex.org/I2800548410"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Daniel Le","raw_affiliation_strings":["National Library of Medicine, Bethesda, Maryland"],"affiliations":[{"raw_affiliation_string":"National Library of Medicine, Bethesda, Maryland","institution_ids":["https://openalex.org/I2800548410"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5103701816","display_name":"George R. Thoma","orcid":null},"institutions":[{"id":"https://openalex.org/I2800548410","display_name":"United States National Library of Medicine","ror":"https://ror.org/0060t0j89","country_code":"US","type":"archive","lineage":["https://openalex.org/I1299022934","https://openalex.org/I1299303238","https://openalex.org/I2800548410"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"George R. Thoma","raw_affiliation_strings":["National Library of Medicine, Bethesda, Maryland"],"affiliations":[{"raw_affiliation_string":"National Library of Medicine, Bethesda, Maryland","institution_ids":["https://openalex.org/I2800548410"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5057176536"],"corresponding_institution_ids":["https://openalex.org/I2800548410"],"apc_list":null,"apc_paid":null,"fwci":0.3927,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.71644461,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":89,"max":93},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"4"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12016","display_name":"Web Data Mining and Analysis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12016","display_name":"Web Data Mining and Analysis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11550","display_name":"Text and Document Classification Technologies","score":0.9588000178337097,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11439","display_name":"Video Analysis and Summarization","score":0.9287999868392944,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8108940124511719},{"id":"https://openalex.org/keywords/paragraph","display_name":"Paragraph","score":0.7611678838729858},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.7090486288070679},{"id":"https://openalex.org/keywords/html-element","display_name":"HTML element","score":0.7066166400909424},{"id":"https://openalex.org/keywords/web-page","display_name":"Web page","score":0.6970459222793579},{"id":"https://openalex.org/keywords/markup-language","display_name":"Markup language","score":0.590645432472229},{"id":"https://openalex.org/keywords/precision-and-recall","display_name":"Precision and recall","score":0.5827435851097107},{"id":"https://openalex.org/keywords/column","display_name":"Column (typography)","score":0.5765610337257385},{"id":"https://openalex.org/keywords/web-content","display_name":"Web content","score":0.5351954698562622},{"id":"https://openalex.org/keywords/set","display_name":"Set (abstract data type)","score":0.5328431725502014},{"id":"https://openalex.org/keywords/hits-algorithm","display_name":"HITS algorithm","score":0.5057716369628906},{"id":"https://openalex.org/keywords/page-layout","display_name":"Page layout","score":0.4472995102405548},{"id":"https://openalex.org/keywords/html","display_name":"HTML","score":0.41278672218322754},{"id":"https://openalex.org/keywords/world-wide-web","display_name":"World Wide Web","score":0.3689466714859009},{"id":"https://openalex.org/keywords/static-web-page","display_name":"Static web page","score":0.3445560932159424},{"id":"https://openalex.org/keywords/xml","display_name":"XML","score":0.2068137526512146},{"id":"https://openalex.org/keywords/web-navigation","display_name":"Web navigation","score":0.1142025887966156},{"id":"https://openalex.org/keywords/programming-language","display_name":"Programming language","score":0.07755199074745178}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8108940124511719},{"id":"https://openalex.org/C2777206241","wikidata":"https://www.wikidata.org/wiki/Q194431","display_name":"Paragraph","level":2,"score":0.7611678838729858},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.7090486288070679},{"id":"https://openalex.org/C81639021","wikidata":"https://www.wikidata.org/wiki/Q179551","display_name":"HTML element","level":3,"score":0.7066166400909424},{"id":"https://openalex.org/C21959979","wikidata":"https://www.wikidata.org/wiki/Q36774","display_name":"Web page","level":2,"score":0.6970459222793579},{"id":"https://openalex.org/C45874996","wikidata":"https://www.wikidata.org/wiki/Q37045","display_name":"Markup language","level":3,"score":0.590645432472229},{"id":"https://openalex.org/C81669768","wikidata":"https://www.wikidata.org/wiki/Q2359161","display_name":"Precision and recall","level":2,"score":0.5827435851097107},{"id":"https://openalex.org/C2780551164","wikidata":"https://www.wikidata.org/wiki/Q2306599","display_name":"Column (typography)","level":3,"score":0.5765610337257385},{"id":"https://openalex.org/C2776324614","wikidata":"https://www.wikidata.org/wiki/Q3948731","display_name":"Web content","level":3,"score":0.5351954698562622},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.5328431725502014},{"id":"https://openalex.org/C195409031","wikidata":"https://www.wikidata.org/wiki/Q1031957","display_name":"HITS algorithm","level":5,"score":0.5057716369628906},{"id":"https://openalex.org/C188985296","wikidata":"https://www.wikidata.org/wiki/Q868954","display_name":"Page layout","level":2,"score":0.4472995102405548},{"id":"https://openalex.org/C138708601","wikidata":"https://www.wikidata.org/wiki/Q8811","display_name":"HTML","level":3,"score":0.41278672218322754},{"id":"https://openalex.org/C136764020","wikidata":"https://www.wikidata.org/wiki/Q466","display_name":"World Wide Web","level":1,"score":0.3689466714859009},{"id":"https://openalex.org/C173576120","wikidata":"https://www.wikidata.org/wiki/Q2641220","display_name":"Static web page","level":4,"score":0.3445560932159424},{"id":"https://openalex.org/C8797682","wikidata":"https://www.wikidata.org/wiki/Q2115","display_name":"XML","level":2,"score":0.2068137526512146},{"id":"https://openalex.org/C61096286","wikidata":"https://www.wikidata.org/wiki/Q7978592","display_name":"Web navigation","level":3,"score":0.1142025887966156},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.07755199074745178},{"id":"https://openalex.org/C112698675","wikidata":"https://www.wikidata.org/wiki/Q37038","display_name":"Advertising","level":1,"score":0.0},{"id":"https://openalex.org/C144133560","wikidata":"https://www.wikidata.org/wiki/Q4830453","display_name":"Business","level":0,"score":0.0},{"id":"https://openalex.org/C76155785","wikidata":"https://www.wikidata.org/wiki/Q418","display_name":"Telecommunications","level":1,"score":0.0},{"id":"https://openalex.org/C126042441","wikidata":"https://www.wikidata.org/wiki/Q1324888","display_name":"Frame (networking)","level":2,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3209280.3229115","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3209280.3229115","pdf_url":null,"source":null,"license":"public-domain","license_id":"https://openalex.org/licenses/public-domain","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the ACM Symposium on Document Engineering 2018","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education","score":0.4099999964237213}],"awards":[],"funders":[{"id":"https://openalex.org/F4320337376","display_name":"NIH Clinical Center","ror":"https://ror.org/04vfsmv21"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":7,"referenced_works":["https://openalex.org/W1803802947","https://openalex.org/W1832894994","https://openalex.org/W2018295102","https://openalex.org/W2040075907","https://openalex.org/W2084358158","https://openalex.org/W2120101509","https://openalex.org/W2158051716"],"related_works":["https://openalex.org/W4255056669","https://openalex.org/W2019215039","https://openalex.org/W622138672","https://openalex.org/W2913428691","https://openalex.org/W598961908","https://openalex.org/W2508179278","https://openalex.org/W4243796650","https://openalex.org/W169337252","https://openalex.org/W4309285432","https://openalex.org/W2992855911"],"abstract_inverted_index":{"Web":[0],"content":[1,13,45],"extraction":[2],"algorithms":[3],"have":[4],"been":[5],"shown":[6],"to":[7,98,124],"improve":[8],"the":[9,72,76,91,118],"performance":[10],"of":[11,84,117],"web":[12,20,56],"analysis":[14,41,94,120],"tasks.":[15],"This":[16,33],"is":[17],"because":[18],"noisy":[19],"page":[21,57],"content,":[22],"such":[23],"as":[24],"advertisements":[25],"and":[26,38,69,115,128],"navigation":[27],"links,":[28],"can":[29],"significantly":[30,99],"degrade":[31],"performance.":[32],"paper":[34],"presents":[35],"a":[36,55,81],"novel":[37],"effective":[39],"layout":[40,93,119],"algorithm":[42,52,95,105,121],"for":[43],"main":[44],"detection":[46],"in":[47],"HTML":[48,88,108],"journal":[49,89],"articles.":[50],"The":[51,112],"first":[53],"segments":[54],"based":[58,64,106],"on":[59,65,107],"rendered":[60],"line":[61],"breaks,":[62],"then":[63],"its":[66],"column":[67,73],"structure,":[68],"finally":[70],"identifies":[71],"that":[74],"contains":[75],"most":[77],"paragraph":[78],"text.":[79],"On":[80],"test":[82],"set":[83],"359":[85],"manually":[86],"labeled":[87],"articles,":[90],"proposed":[92],"was":[96],"found":[97],"outperform":[100],"an":[101],"alternative":[102],"semantic":[103,110],"markup":[104],"5":[109],"tags.":[111],"precision,":[113],"recall,":[114],"F-score":[116],"were":[122],"measured":[123],"be":[125],"0.96,":[126],"0.99,":[127],"0.98":[129],"respectively.":[130]},"counts_by_year":[{"year":2021,"cited_by_count":1}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
