{"id":"https://openalex.org/W1975700640","doi":"https://doi.org/10.1145/2034691.2034720","title":"An efficient language-independent method to extract content from news webpages","display_name":"An efficient language-independent method to extract content from news webpages","publication_year":2011,"publication_date":"2011-09-19","ids":{"openalex":"https://openalex.org/W1975700640","doi":"https://doi.org/10.1145/2034691.2034720","mag":"1975700640"},"language":"en","primary_location":{"id":"doi:10.1145/2034691.2034720","is_oa":false,"landing_page_url":"https://doi.org/10.1145/2034691.2034720","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 11th ACM symposium on Document engineering","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5110718957","display_name":"Eduardo Cardoso","orcid":"https://orcid.org/0009-0006-2566-0090"},"institutions":[{"id":"https://openalex.org/I2699952","display_name":"Pontifical Catholic University of Rio de Janeiro","ror":"https://ror.org/01dg47b60","country_code":"BR","type":"education","lineage":["https://openalex.org/I2699952"]}],"countries":["BR"],"is_corresponding":true,"raw_author_name":"Eduardo Cardoso","raw_affiliation_strings":["PUC-Rio, Rio de Janeiro, Brazil","PUC-RIO, Rio de Janeiro, Brazil"],"affiliations":[{"raw_affiliation_string":"PUC-Rio, Rio de Janeiro, Brazil","institution_ids":["https://openalex.org/I2699952"]},{"raw_affiliation_string":"PUC-RIO, Rio de Janeiro, Brazil","institution_ids":["https://openalex.org/I2699952"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5073498166","display_name":"Iam Vita Jabour","orcid":null},"institutions":[{"id":"https://openalex.org/I2699952","display_name":"Pontifical Catholic University of Rio de Janeiro","ror":"https://ror.org/01dg47b60","country_code":"BR","type":"education","lineage":["https://openalex.org/I2699952"]}],"countries":["BR"],"is_corresponding":false,"raw_author_name":"Iam Jabour","raw_affiliation_strings":["PUC-Rio, Rio de Janeiro, Brazil","PUC-RIO, Rio de Janeiro, Brazil"],"affiliations":[{"raw_affiliation_string":"PUC-Rio, Rio de Janeiro, Brazil","institution_ids":["https://openalex.org/I2699952"]},{"raw_affiliation_string":"PUC-RIO, Rio de Janeiro, Brazil","institution_ids":["https://openalex.org/I2699952"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5073527139","display_name":"Eduardo Sany Laber","orcid":"https://orcid.org/0000-0002-9025-8333"},"institutions":[{"id":"https://openalex.org/I2699952","display_name":"Pontifical Catholic University of Rio de Janeiro","ror":"https://ror.org/01dg47b60","country_code":"BR","type":"education","lineage":["https://openalex.org/I2699952"]}],"countries":["BR"],"is_corresponding":false,"raw_author_name":"Eduardo Laber","raw_affiliation_strings":["PUC-Rio, Rio de Janeiro, Brazil","PUC-RIO, Rio de Janeiro, Brazil"],"affiliations":[{"raw_affiliation_string":"PUC-Rio, Rio de Janeiro, Brazil","institution_ids":["https://openalex.org/I2699952"]},{"raw_affiliation_string":"PUC-RIO, Rio de Janeiro, Brazil","institution_ids":["https://openalex.org/I2699952"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100907867","display_name":"Rog\u00e9rio Rodrigues","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Rog\u00e9rio Rodrigues","raw_affiliation_strings":["Microsoft Corporation, Rio de Janeiro, Brazil"],"affiliations":[{"raw_affiliation_string":"Microsoft Corporation, Rio de Janeiro, Brazil","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5053989334","display_name":"Pedro Cardoso","orcid":"https://orcid.org/0000-0001-8546-4194"},"institutions":[{"id":"https://openalex.org/I2699952","display_name":"Pontifical Catholic University of Rio de Janeiro","ror":"https://ror.org/01dg47b60","country_code":"BR","type":"education","lineage":["https://openalex.org/I2699952"]}],"countries":["BR"],"is_corresponding":false,"raw_author_name":"Pedro Cardoso","raw_affiliation_strings":["PUC-Rio, Rio de Janeiro, Brazil","PUC-RIO, Rio de Janeiro, Brazil"],"affiliations":[{"raw_affiliation_string":"PUC-Rio, Rio de Janeiro, Brazil","institution_ids":["https://openalex.org/I2699952"]},{"raw_affiliation_string":"PUC-RIO, Rio de Janeiro, Brazil","institution_ids":["https://openalex.org/I2699952"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5110718957"],"corresponding_institution_ids":["https://openalex.org/I2699952"],"apc_list":null,"apc_paid":null,"fwci":2.2168,"has_fulltext":false,"cited_by_count":14,"citation_normalized_percentile":{"value":0.90259486,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":89,"max":98},"biblio":{"volume":null,"issue":null,"first_page":"121","last_page":"128"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12016","display_name":"Web Data Mining and Analysis","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12016","display_name":"Web Data Mining and Analysis","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11269","display_name":"Algorithms and Data Compression","score":0.9962000250816345,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.9944000244140625,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.834797739982605},{"id":"https://openalex.org/keywords/rendering","display_name":"Rendering (computer graphics)","score":0.7921229004859924},{"id":"https://openalex.org/keywords/web-page","display_name":"Web page","score":0.7671575546264648},{"id":"https://openalex.org/keywords/segmentation","display_name":"Segmentation","score":0.5847253799438477},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.49039363861083984},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.38204801082611084},{"id":"https://openalex.org/keywords/world-wide-web","display_name":"World Wide Web","score":0.22461336851119995}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.834797739982605},{"id":"https://openalex.org/C205711294","wikidata":"https://www.wikidata.org/wiki/Q176953","display_name":"Rendering (computer graphics)","level":2,"score":0.7921229004859924},{"id":"https://openalex.org/C21959979","wikidata":"https://www.wikidata.org/wiki/Q36774","display_name":"Web page","level":2,"score":0.7671575546264648},{"id":"https://openalex.org/C89600930","wikidata":"https://www.wikidata.org/wiki/Q1423946","display_name":"Segmentation","level":2,"score":0.5847253799438477},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.49039363861083984},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.38204801082611084},{"id":"https://openalex.org/C136764020","wikidata":"https://www.wikidata.org/wiki/Q466","display_name":"World Wide Web","level":1,"score":0.22461336851119995}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/2034691.2034720","is_oa":false,"landing_page_url":"https://doi.org/10.1145/2034691.2034720","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 11th ACM symposium on Document engineering","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education","score":0.800000011920929}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":17,"referenced_works":["https://openalex.org/W1938740620","https://openalex.org/W1991565788","https://openalex.org/W1992657934","https://openalex.org/W2051141368","https://openalex.org/W2072489225","https://openalex.org/W2078951877","https://openalex.org/W2090193136","https://openalex.org/W2110606009","https://openalex.org/W2133990480","https://openalex.org/W2140208587","https://openalex.org/W2145076056","https://openalex.org/W2152056009","https://openalex.org/W2165094554","https://openalex.org/W2482589566","https://openalex.org/W2495199414","https://openalex.org/W6633894697","https://openalex.org/W6676463157"],"related_works":["https://openalex.org/W2357241418","https://openalex.org/W2086064646","https://openalex.org/W2119135658","https://openalex.org/W2115485936","https://openalex.org/W2153015554","https://openalex.org/W3022131925","https://openalex.org/W2394327295","https://openalex.org/W2160502612","https://openalex.org/W2358941527","https://openalex.org/W2329972207"],"abstract_inverted_index":{"We":[0,40],"tackle":[1],"the":[2,10,25,63,101,108,111],"task":[3],"of":[4,28,48,70,110,126,140],"news":[5,11],"webpage":[6,32],"segmentation,":[7],"specifically":[8],"identifying":[9],"title,":[12],"publication":[13],"date":[14],"and":[15,82,107],"story":[16],"body.":[17],"While":[18],"there":[19],"are":[20,97],"very":[21,37],"good":[22,138],"results":[23,113],"in":[24,62,100],"literature,":[26,102],"most":[27],"them":[29],"rely":[30],"on":[31,42],"rendering,":[33,81],"which":[34],"is":[35,52],"a":[36,45,53,76,137],"time-consuming":[38],"step.":[39],"focus":[41],"scenarios":[43],"with":[44,68,75],"high":[46],"volume":[47],"documents,":[49],"where":[50],"performance":[51],"must.":[54],"The":[55],"chosen":[56],"approach":[57,118],"extends":[58],"our":[59,87],"previous":[60],"work":[61],"area,":[64],"combining":[65],"structural":[66],"properties":[67],"hints":[69],"visual":[71],"presentation":[72],"styles,":[73],"computed":[74],"quicker":[77],"method":[78],"than":[79,129],"regular":[80],"machine":[83],"learning":[84],"algorithms.":[85],"In":[86],"experiments,":[88],"we":[89],"took":[90],"special":[91],"attention":[92],"to":[93,121],"some":[94],"aspects":[95],"that":[96],"often":[98],"overlooked":[99],"such":[103],"as":[104],"processing":[105],"time":[106],"generalization":[109],"extraction":[112],"for":[114],"unseen":[115],"domains.":[116],"Our":[117],"has":[119],"shown":[120],"be":[122],"about":[123],"an":[124,130],"order":[125],"magnitude":[127],"faster":[128],"equivalent":[131],"full":[132],"rendering":[133],"alternative":[134],"while":[135],"retaining":[136],"quality":[139],"extraction.":[141]},"counts_by_year":[{"year":2023,"cited_by_count":2},{"year":2021,"cited_by_count":1},{"year":2018,"cited_by_count":1},{"year":2017,"cited_by_count":1},{"year":2016,"cited_by_count":5},{"year":2015,"cited_by_count":1},{"year":2013,"cited_by_count":1},{"year":2012,"cited_by_count":2}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
