{"id":"https://openalex.org/W3021347373","doi":"https://doi.org/10.1145/3366424.3383547","title":"Boilerplate Removal using a Neural Sequence Labeling Model","display_name":"Boilerplate Removal using a Neural Sequence Labeling Model","publication_year":2020,"publication_date":"2020-04-20","ids":{"openalex":"https://openalex.org/W3021347373","doi":"https://doi.org/10.1145/3366424.3383547","mag":"3021347373"},"language":"en","primary_location":{"id":"doi:10.1145/3366424.3383547","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3366424.3383547","pdf_url":null,"source":{"id":"https://openalex.org/S4306506651","display_name":"Companion Proceedings of the Web Conference 2020","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Companion Proceedings of the Web Conference 2020","raw_type":"proceedings-article"},"type":"article","indexed_in":["arxiv","crossref"],"open_access":{"is_oa":true,"oa_status":"hybrid","oa_url":"https://doi.org/10.1145/3366424.3383547","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Jurek Leonhardt","orcid":null},"institutions":[{"id":"https://openalex.org/I4210136150","display_name":"L3S Research Center","ror":"https://ror.org/039t4wk02","country_code":"DE","type":"facility","lineage":["https://openalex.org/I114112103","https://openalex.org/I4210136150","https://openalex.org/I94509681"]}],"countries":["DE"],"is_corresponding":true,"raw_author_name":"Jurek Leonhardt","raw_affiliation_strings":["L3S Research Center Hannover, Germany"],"affiliations":[{"raw_affiliation_string":"L3S Research Center Hannover, Germany","institution_ids":["https://openalex.org/I4210136150"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Avishek Anand","orcid":null},"institutions":[{"id":"https://openalex.org/I4210136150","display_name":"L3S Research Center","ror":"https://ror.org/039t4wk02","country_code":"DE","type":"facility","lineage":["https://openalex.org/I114112103","https://openalex.org/I4210136150","https://openalex.org/I94509681"]}],"countries":["DE"],"is_corresponding":false,"raw_author_name":"Avishek Anand","raw_affiliation_strings":["L3S Research Center Hannover, Germany"],"affiliations":[{"raw_affiliation_string":"L3S Research Center Hannover, Germany","institution_ids":["https://openalex.org/I4210136150"]}]},{"author_position":"last","author":{"id":null,"display_name":"Megha Khosla","orcid":null},"institutions":[{"id":"https://openalex.org/I4210136150","display_name":"L3S Research Center","ror":"https://ror.org/039t4wk02","country_code":"DE","type":"facility","lineage":["https://openalex.org/I114112103","https://openalex.org/I4210136150","https://openalex.org/I94509681"]}],"countries":["DE"],"is_corresponding":false,"raw_author_name":"Megha Khosla","raw_affiliation_strings":["L3S Research Center Hannover, Germany"],"affiliations":[{"raw_affiliation_string":"L3S Research Center Hannover, Germany","institution_ids":["https://openalex.org/I4210136150"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":[],"corresponding_institution_ids":["https://openalex.org/I4210136150"],"apc_list":null,"apc_paid":null,"fwci":4.6262,"has_fulltext":false,"cited_by_count":19,"citation_normalized_percentile":{"value":0.95,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":93,"max":98},"biblio":{"volume":null,"issue":null,"first_page":"226","last_page":"229"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12016","display_name":"Web Data Mining and Analysis","score":0.9542999863624573,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12016","display_name":"Web Data Mining and Analysis","score":0.9542999863624573,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11550","display_name":"Text and Document Classification Technologies","score":0.014299999922513962,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11644","display_name":"Spam and Phishing Detection","score":0.007600000128149986,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/web-page","display_name":"Web page","score":0.6385999917984009},{"id":"https://openalex.org/keywords/boilerplate-text","display_name":"Boilerplate text","score":0.5544000267982483},{"id":"https://openalex.org/keywords/usability","display_name":"Usability","score":0.4352000057697296},{"id":"https://openalex.org/keywords/generalization","display_name":"Generalization","score":0.4142000079154968},{"id":"https://openalex.org/keywords/html-element","display_name":"HTML element","score":0.4023999869823456},{"id":"https://openalex.org/keywords/sequence","display_name":"Sequence (biology)","score":0.38109999895095825},{"id":"https://openalex.org/keywords/static-web-page","display_name":"Static web page","score":0.3538999855518341},{"id":"https://openalex.org/keywords/sequence-labeling","display_name":"Sequence labeling","score":0.3443000018596649}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7854999899864197},{"id":"https://openalex.org/C21959979","wikidata":"https://www.wikidata.org/wiki/Q36774","display_name":"Web page","level":2,"score":0.6385999917984009},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.5558000206947327},{"id":"https://openalex.org/C75701414","wikidata":"https://www.wikidata.org/wiki/Q1651672","display_name":"Boilerplate text","level":2,"score":0.5544000267982483},{"id":"https://openalex.org/C170130773","wikidata":"https://www.wikidata.org/wiki/Q216378","display_name":"Usability","level":2,"score":0.4352000057697296},{"id":"https://openalex.org/C177148314","wikidata":"https://www.wikidata.org/wiki/Q170084","display_name":"Generalization","level":2,"score":0.4142000079154968},{"id":"https://openalex.org/C136764020","wikidata":"https://www.wikidata.org/wiki/Q466","display_name":"World Wide Web","level":1,"score":0.4101000130176544},{"id":"https://openalex.org/C81639021","wikidata":"https://www.wikidata.org/wiki/Q179551","display_name":"HTML element","level":3,"score":0.4023999869823456},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3824000060558319},{"id":"https://openalex.org/C2778112365","wikidata":"https://www.wikidata.org/wiki/Q3511065","display_name":"Sequence (biology)","level":2,"score":0.38109999895095825},{"id":"https://openalex.org/C173576120","wikidata":"https://www.wikidata.org/wiki/Q2641220","display_name":"Static web page","level":4,"score":0.3538999855518341},{"id":"https://openalex.org/C35639132","wikidata":"https://www.wikidata.org/wiki/Q7452468","display_name":"Sequence labeling","level":3,"score":0.3443000018596649},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.34220001101493835},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.3409000039100647},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.3393000066280365},{"id":"https://openalex.org/C138708601","wikidata":"https://www.wikidata.org/wiki/Q8811","display_name":"HTML","level":3,"score":0.3361999988555908},{"id":"https://openalex.org/C61096286","wikidata":"https://www.wikidata.org/wiki/Q7978592","display_name":"Web navigation","level":3,"score":0.3325999975204468},{"id":"https://openalex.org/C2776324614","wikidata":"https://www.wikidata.org/wiki/Q3948731","display_name":"Web content","level":3,"score":0.33059999346733093},{"id":"https://openalex.org/C195807954","wikidata":"https://www.wikidata.org/wiki/Q1662562","display_name":"Information extraction","level":2,"score":0.32850000262260437},{"id":"https://openalex.org/C2984519610","wikidata":"https://www.wikidata.org/wiki/Q35127","display_name":"Web site","level":3,"score":0.2971999943256378},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.2770000100135803},{"id":"https://openalex.org/C2983909278","wikidata":"https://www.wikidata.org/wiki/Q6368","display_name":"Web browser","level":3,"score":0.273499995470047},{"id":"https://openalex.org/C195274430","wikidata":"https://www.wikidata.org/wiki/Q1650567","display_name":"Client-side scripting","level":5,"score":0.26350000500679016},{"id":"https://openalex.org/C67617509","wikidata":"https://www.wikidata.org/wiki/Q1503327","display_name":"Site map","level":5,"score":0.25290000438690186},{"id":"https://openalex.org/C81363708","wikidata":"https://www.wikidata.org/wiki/Q17084460","display_name":"Convolutional neural network","level":2,"score":0.25040000677108765}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1145/3366424.3383547","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3366424.3383547","pdf_url":null,"source":{"id":"https://openalex.org/S4306506651","display_name":"Companion Proceedings of the Web Conference 2020","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Companion Proceedings of the Web Conference 2020","raw_type":"proceedings-article"},{"id":"pmh:oai:arXiv.org:2004.14294","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2004.14294","pdf_url":"https://arxiv.org/pdf/2004.14294","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"}],"best_oa_location":{"id":"doi:10.1145/3366424.3383547","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3366424.3383547","pdf_url":null,"source":{"id":"https://openalex.org/S4306506651","display_name":"Companion Proceedings of the Web Conference 2020","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Companion Proceedings of the Web Conference 2020","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":13,"referenced_works":["https://openalex.org/W1989338554","https://openalex.org/W2007442002","https://openalex.org/W2012575882","https://openalex.org/W2019264297","https://openalex.org/W2040075907","https://openalex.org/W2051141368","https://openalex.org/W2101909884","https://openalex.org/W2120101509","https://openalex.org/W2140208587","https://openalex.org/W2157316480","https://openalex.org/W2158051716","https://openalex.org/W2201534957","https://openalex.org/W2964079897"],"related_works":[],"abstract_inverted_index":{"The":[0],"extraction":[1],"of":[2,45,61,118,152],"main":[3],"content":[4,117],"from":[5,16,65],"web":[6,26,62,102,120,153],"pages":[7,121,154],"is":[8,143],"an":[9],"important":[10],"task":[11],"for":[12,22,48],"numerous":[13],"applications,":[14],"ranging":[15],"usability":[17],"aspects,":[18],"like":[19],"reader":[20],"views":[21],"news":[23],"articles":[24],"in":[25,52,72,100,149],"browsers,":[27],"to":[28,57,109,138,145,147],"information":[29],"retrieval":[30],"or":[31],"natural":[32],"language":[33],"processing.":[34],"Existing":[35],"approaches":[36],"are":[37,55],"lacking":[38],"as":[39,104],"they":[40],"rely":[41,85],"on":[42,86],"large":[43],"amounts":[44],"hand-crafted":[46,88],"features":[47,89],"classification.":[49],"This":[50,106],"results":[51],"models":[53],"that":[54,82,98,140],"tailored":[56],"a":[58,66,77,101,111,133],"specific":[59],"distribution":[60],"pages,":[63],"e.g.":[64],"certain":[67],"time":[68],"frame,":[69],"but":[70,90],"lack":[71],"generalization":[73],"power.":[74],"We":[75],"propose":[76],"neural":[78],"sequence":[79],"labeling":[80],"model":[81,142],"does":[83],"not":[84],"any":[87],"takes":[91],"only":[92],"the":[93,116,124,150,157],"HTML":[94],"tags":[95],"and":[96,155],"words":[97],"appear":[99],"page":[103],"input.":[105],"allows":[107],"us":[108],"present":[110],"browser":[112,125],"extension":[113],"which":[114],"highlights":[115],"arbitrary":[119],"directly":[122],"within":[123],"using":[126],"our":[127,141],"model.":[128,159],"In":[129],"addition,":[130],"we":[131],"create":[132],"new,":[134],"more":[135],"current":[136],"dataset":[137],"show":[139],"able":[144],"adapt":[146],"changes":[148],"structure":[151],"outperform":[156],"state-of-the-art":[158]},"counts_by_year":[{"year":2025,"cited_by_count":3},{"year":2024,"cited_by_count":2},{"year":2023,"cited_by_count":6},{"year":2022,"cited_by_count":6},{"year":2021,"cited_by_count":2}],"updated_date":"2026-03-20T23:20:44.827607","created_date":"2020-05-13T00:00:00"}
