{"id":"https://openalex.org/W4226379087","doi":"https://doi.org/10.1145/3486622.3493938","title":"Multi-Task Neural Sequence Labeling for Zero-Shot Cross-Language Boilerplate Removal","display_name":"Multi-Task Neural Sequence Labeling for Zero-Shot Cross-Language Boilerplate Removal","publication_year":2021,"publication_date":"2021-12-14","ids":{"openalex":"https://openalex.org/W4226379087","doi":"https://doi.org/10.1145/3486622.3493938"},"language":"en","primary_location":{"id":"doi:10.1145/3486622.3493938","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3486622.3493938","pdf_url":null,"source":{"id":"https://openalex.org/S4363608074","display_name":"IEEE/WIC/ACM International Conference on Web Intelligence","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE/WIC/ACM International Conference on Web Intelligence","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100731116","display_name":"Yuhao Wu","orcid":"https://orcid.org/0000-0001-9155-3484"},"institutions":[{"id":"https://openalex.org/I22265921","display_name":"National Central University","ror":"https://ror.org/00944ve71","country_code":"TW","type":"education","lineage":["https://openalex.org/I22265921"]}],"countries":["TW"],"is_corresponding":true,"raw_author_name":"Yu-Hao Wu","raw_affiliation_strings":["National Central University, Taiwan"],"affiliations":[{"raw_affiliation_string":"National Central University, Taiwan","institution_ids":["https://openalex.org/I22265921"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5078524542","display_name":"Chia\u2010Hui Chang","orcid":"https://orcid.org/0000-0002-1101-6337"},"institutions":[{"id":"https://openalex.org/I22265921","display_name":"National Central University","ror":"https://ror.org/00944ve71","country_code":"TW","type":"education","lineage":["https://openalex.org/I22265921"]}],"countries":["TW"],"is_corresponding":false,"raw_author_name":"Chia-Hui Chang","raw_affiliation_strings":["National Central University, Taiwan"],"affiliations":[{"raw_affiliation_string":"National Central University, Taiwan","institution_ids":["https://openalex.org/I22265921"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":2,"corresponding_author_ids":["https://openalex.org/A5100731116"],"corresponding_institution_ids":["https://openalex.org/I22265921"],"apc_list":null,"apc_paid":null,"fwci":0.2836,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.5542146,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":90,"max":94},"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12016","display_name":"Web Data Mining and Analysis","score":0.9994999766349792,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12016","display_name":"Web Data Mining and Analysis","score":0.9994999766349792,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11269","display_name":"Algorithms and Data Compression","score":0.9675999879837036,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.9399999976158142,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/boilerplate-text","display_name":"Boilerplate text","score":0.8941380977630615},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.5577911734580994},{"id":"https://openalex.org/keywords/shot","display_name":"Shot (pellet)","score":0.5287710428237915},{"id":"https://openalex.org/keywords/sequence","display_name":"Sequence (biology)","score":0.4718955457210541},{"id":"https://openalex.org/keywords/zero","display_name":"Zero (linguistics)","score":0.4587024450302124},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.45731139183044434},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.38114041090011597},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.32241514325141907},{"id":"https://openalex.org/keywords/engineering","display_name":"Engineering","score":0.1539929211139679},{"id":"https://openalex.org/keywords/chemistry","display_name":"Chemistry","score":0.1417531967163086},{"id":"https://openalex.org/keywords/programming-language","display_name":"Programming language","score":0.10324835777282715}],"concepts":[{"id":"https://openalex.org/C75701414","wikidata":"https://www.wikidata.org/wiki/Q1651672","display_name":"Boilerplate text","level":2,"score":0.8941380977630615},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5577911734580994},{"id":"https://openalex.org/C2778344882","wikidata":"https://www.wikidata.org/wiki/Q278938","display_name":"Shot (pellet)","level":2,"score":0.5287710428237915},{"id":"https://openalex.org/C2778112365","wikidata":"https://www.wikidata.org/wiki/Q3511065","display_name":"Sequence (biology)","level":2,"score":0.4718955457210541},{"id":"https://openalex.org/C2780813799","wikidata":"https://www.wikidata.org/wiki/Q3274237","display_name":"Zero (linguistics)","level":2,"score":0.4587024450302124},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.45731139183044434},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.38114041090011597},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.32241514325141907},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.1539929211139679},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.1417531967163086},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.10324835777282715},{"id":"https://openalex.org/C178790620","wikidata":"https://www.wikidata.org/wiki/Q11351","display_name":"Organic chemistry","level":1,"score":0.0},{"id":"https://openalex.org/C201995342","wikidata":"https://www.wikidata.org/wiki/Q682496","display_name":"Systems engineering","level":1,"score":0.0},{"id":"https://openalex.org/C55493867","wikidata":"https://www.wikidata.org/wiki/Q7094","display_name":"Biochemistry","level":1,"score":0.0},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3486622.3493938","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3486622.3493938","pdf_url":null,"source":{"id":"https://openalex.org/S4363608074","display_name":"IEEE/WIC/ACM International Conference on Web Intelligence","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE/WIC/ACM International Conference on Web Intelligence","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G1311528792","display_name":null,"funder_award_id":"MOST-109-2221-E-008-060-MY3","funder_id":"https://openalex.org/F4320322795","funder_display_name":"Ministry of Science and Technology, Taiwan"}],"funders":[{"id":"https://openalex.org/F4320322795","display_name":"Ministry of Science and Technology, Taiwan","ror":"https://ror.org/02kv4zf79"}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":24,"referenced_works":["https://openalex.org/W2005646337","https://openalex.org/W2012575882","https://openalex.org/W2016465393","https://openalex.org/W2042448356","https://openalex.org/W2061760029","https://openalex.org/W2064675550","https://openalex.org/W2108223890","https://openalex.org/W2120101509","https://openalex.org/W2134150392","https://openalex.org/W2148317291","https://openalex.org/W2157316480","https://openalex.org/W2467373585","https://openalex.org/W2602331152","https://openalex.org/W2781528640","https://openalex.org/W2808294523","https://openalex.org/W2918008835","https://openalex.org/W2963341956","https://openalex.org/W2964079897","https://openalex.org/W2982150889","https://openalex.org/W3021347373","https://openalex.org/W3085495757","https://openalex.org/W3102354094","https://openalex.org/W4297969478","https://openalex.org/W6637618735"],"related_works":["https://openalex.org/W149980","https://openalex.org/W354571","https://openalex.org/W868042","https://openalex.org/W1243554","https://openalex.org/W706006","https://openalex.org/W864442","https://openalex.org/W1383942","https://openalex.org/W16296291","https://openalex.org/W1716356","https://openalex.org/W17976389"],"abstract_inverted_index":{"Although":[0],"web":[1,46,55,124,177],"pages":[2,125,178],"are":[3,8,22],"rich":[4],"in":[5,77,122,127,252],"resources,":[6],"they":[7],"usually":[9],"intertwined":[10],"with":[11,175,236],"advertisements,":[12],"banners,":[13],"navigation":[14],"bars,":[15],"footer":[16],"copyrights":[17],"and":[18,41,146,189,230],"other":[19],"templates,":[20],"which":[21,94,123,244],"often":[23],"not":[24,96],"of":[25,36,101,240,248],"interest":[26],"to":[27,52,72,173],"users.":[28],"In":[29,104,130],"this":[30,131],"paper,":[31,132],"we":[32,133,160],"study":[33],"the":[34,38,82,86,99,106,205,216,237,241,246],"problem":[35],"extracting":[37],"main":[39,61],"content":[40,114,171],"removing":[42],"irrelevant":[43],"information":[44],"from":[45,166],"pages.":[47],"The":[48,182],"common":[49],"solution":[50],"is":[51],"classify":[53],"each":[54],"component":[56],"into":[57],"boilerplate":[58],"(noise)":[59],"or":[60],"content.":[62],"State-of-the-art":[63],"approaches":[64],"such":[65],"as":[66,91],"BoilerNet":[67,199],"use":[68],"neural":[69],"sequence":[70],"labeling":[71],"achieve":[73,193],"an":[74],"impressive":[75],"score":[76],"CleanEval":[78,201],"EN":[79,202,219],"dataset.":[80],"However,":[81],"model":[83],"uses":[84],"only":[85],"top":[87],"50":[88],"HTML":[89,151,186],"tags":[90],"input":[92],"features,":[93],"does":[95],"fully":[97],"utilize":[98],"power":[100],"tag":[102,152,155,187],"information.":[103],"addition,":[105],"most":[107],"frequent":[108],"1,000":[109],"words":[110],"used":[111],"for":[112,154,169],"text":[113,170,207],"representation":[115,157,172,209],"cannot":[116],"effectively":[117],"support":[118,251],"a":[119,135,233],"real-world":[120],"environment":[121],"appear":[126],"multiple":[128],"languages.":[129],"propose":[134],"multi-task":[136,190],"learning":[137,191],"framework":[138],"based":[139,210],"on":[140,200,211,218,225],"two":[141],"auxiliary":[142],"tasks:":[143],"depth":[144],"prediction":[145],"position":[147],"prediction.":[148],"We":[149],"explore":[150],"embedding":[153,188],"path":[156],"learning.":[158],"Further,":[159],"employ":[161],"multilingual":[162,212],"Bidirectional":[163],"Encoder":[164],"Representations":[165],"Transformers":[167],"(BERT)":[168],"deal":[174],"any":[176],"without":[179],"language":[180],"limitations.":[181],"experiments":[183,224],"show":[184],"that":[185],"frameworks":[192],"much":[194],"higher":[195],"scores":[196],"than":[197],"using":[198],"datasets.":[203],"Secondly,":[204],"pre-trained":[206],"block":[208],"BERT":[213],"will":[214],"degrade":[215],"performance":[217,234],"test":[220],"sets;":[221],"however,":[222],"zero-shot":[223],"three":[226],"languages":[227],"(Chinese,":[228],"Japanese,":[229],"Thai)":[231],"have":[232],"consistent":[235],"five-fold":[238],"cross-validation":[239],"respective":[242],"language,":[243],"indicates":[245],"possibility":[247],"providing":[249],"cross-lingual":[250],"one":[253],"model.":[254]},"counts_by_year":[{"year":2024,"cited_by_count":1}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
