{"id":"https://openalex.org/W2890174976","doi":"https://doi.org/10.18653/v1/d18-1099","title":"Supervised and Unsupervised Methods for Robust Separation of Section Titles and Prose Text in Web Documents","display_name":"Supervised and Unsupervised Methods for Robust Separation of Section Titles and Prose Text in Web Documents","publication_year":2018,"publication_date":"2018-01-01","ids":{"openalex":"https://openalex.org/W2890174976","doi":"https://doi.org/10.18653/v1/d18-1099","mag":"2890174976"},"language":"en","primary_location":{"id":"doi:10.18653/v1/d18-1099","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/d18-1099","pdf_url":"https://www.aclweb.org/anthology/D18-1099.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://www.aclweb.org/anthology/D18-1099.pdf","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5061450676","display_name":"Abhijith Athreya Mysore Gopinath","orcid":null},"institutions":[{"id":"https://openalex.org/I130769515","display_name":"Pennsylvania State University","ror":"https://ror.org/04p491231","country_code":"US","type":"education","lineage":["https://openalex.org/I130769515"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Abhijith Athreya Mysore Gopinath","raw_affiliation_strings":["College of IST, Pennsylvania State University, University Park, PA 16802, USA"],"affiliations":[{"raw_affiliation_string":"College of IST, Pennsylvania State University, University Park, PA 16802, USA","institution_ids":["https://openalex.org/I130769515"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5056978982","display_name":"Shomir Wilson","orcid":"https://orcid.org/0000-0003-1235-3754"},"institutions":[{"id":"https://openalex.org/I130769515","display_name":"Pennsylvania State University","ror":"https://ror.org/04p491231","country_code":"US","type":"education","lineage":["https://openalex.org/I130769515"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Shomir Wilson","raw_affiliation_strings":["College of IST, Pennsylvania State University, University Park, PA 16802, USA"],"affiliations":[{"raw_affiliation_string":"College of IST, Pennsylvania State University, University Park, PA 16802, USA","institution_ids":["https://openalex.org/I130769515"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5081563886","display_name":"Norman Sadeh","orcid":"https://orcid.org/0000-0003-4829-5533"},"institutions":[{"id":"https://openalex.org/I74973139","display_name":"Carnegie Mellon University","ror":"https://ror.org/05x2bcf33","country_code":"US","type":"education","lineage":["https://openalex.org/I74973139"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Norman Sadeh","raw_affiliation_strings":["School of Computer Science, Carnegie Mellon University, Pittsburgh, PA 15213, USA"],"affiliations":[{"raw_affiliation_string":"School of Computer Science, Carnegie Mellon University, Pittsburgh, PA 15213, USA","institution_ids":["https://openalex.org/I74973139"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5061450676"],"corresponding_institution_ids":["https://openalex.org/I130769515"],"apc_list":null,"apc_paid":null,"fwci":2.0309,"has_fulltext":true,"cited_by_count":19,"citation_normalized_percentile":{"value":0.90006894,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":90,"max":98},"biblio":{"volume":null,"issue":null,"first_page":"850","last_page":"855"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12016","display_name":"Web Data Mining and Analysis","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9994000196456909,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8331743478775024},{"id":"https://openalex.org/keywords/automatic-summarization","display_name":"Automatic summarization","score":0.7558357119560242},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.7035067081451416},{"id":"https://openalex.org/keywords/section","display_name":"Section (typography)","score":0.5883104205131531},{"id":"https://openalex.org/keywords/precision-and-recall","display_name":"Precision and recall","score":0.5807347297668457},{"id":"https://openalex.org/keywords/domain","display_name":"Domain (mathematical analysis)","score":0.5265489220619202},{"id":"https://openalex.org/keywords/html-element","display_name":"HTML element","score":0.5165452361106873},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.5097853541374207},{"id":"https://openalex.org/keywords/syntax","display_name":"Syntax","score":0.5046361684799194},{"id":"https://openalex.org/keywords/markup-language","display_name":"Markup language","score":0.4905814230442047},{"id":"https://openalex.org/keywords/web-page","display_name":"Web page","score":0.4767549932003021},{"id":"https://openalex.org/keywords/hierarchy","display_name":"Hierarchy","score":0.43364158272743225},{"id":"https://openalex.org/keywords/semantics","display_name":"Semantics (computer science)","score":0.4190959930419922},{"id":"https://openalex.org/keywords/information-extraction","display_name":"Information extraction","score":0.4186363220214844},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.3916208744049072},{"id":"https://openalex.org/keywords/world-wide-web","display_name":"World Wide Web","score":0.31274542212486267},{"id":"https://openalex.org/keywords/xml","display_name":"XML","score":0.22804030776023865}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8331743478775024},{"id":"https://openalex.org/C170858558","wikidata":"https://www.wikidata.org/wiki/Q1394144","display_name":"Automatic summarization","level":2,"score":0.7558357119560242},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.7035067081451416},{"id":"https://openalex.org/C2780129039","wikidata":"https://www.wikidata.org/wiki/Q1931107","display_name":"Section (typography)","level":2,"score":0.5883104205131531},{"id":"https://openalex.org/C81669768","wikidata":"https://www.wikidata.org/wiki/Q2359161","display_name":"Precision and recall","level":2,"score":0.5807347297668457},{"id":"https://openalex.org/C36503486","wikidata":"https://www.wikidata.org/wiki/Q11235244","display_name":"Domain (mathematical analysis)","level":2,"score":0.5265489220619202},{"id":"https://openalex.org/C81639021","wikidata":"https://www.wikidata.org/wiki/Q179551","display_name":"HTML element","level":3,"score":0.5165452361106873},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.5097853541374207},{"id":"https://openalex.org/C60048249","wikidata":"https://www.wikidata.org/wiki/Q37437","display_name":"Syntax","level":2,"score":0.5046361684799194},{"id":"https://openalex.org/C45874996","wikidata":"https://www.wikidata.org/wiki/Q37045","display_name":"Markup language","level":3,"score":0.4905814230442047},{"id":"https://openalex.org/C21959979","wikidata":"https://www.wikidata.org/wiki/Q36774","display_name":"Web page","level":2,"score":0.4767549932003021},{"id":"https://openalex.org/C31170391","wikidata":"https://www.wikidata.org/wiki/Q188619","display_name":"Hierarchy","level":2,"score":0.43364158272743225},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.4190959930419922},{"id":"https://openalex.org/C195807954","wikidata":"https://www.wikidata.org/wiki/Q1662562","display_name":"Information extraction","level":2,"score":0.4186363220214844},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3916208744049072},{"id":"https://openalex.org/C136764020","wikidata":"https://www.wikidata.org/wiki/Q466","display_name":"World Wide Web","level":1,"score":0.31274542212486267},{"id":"https://openalex.org/C8797682","wikidata":"https://www.wikidata.org/wiki/Q2115","display_name":"XML","level":2,"score":0.22804030776023865},{"id":"https://openalex.org/C134306372","wikidata":"https://www.wikidata.org/wiki/Q7754","display_name":"Mathematical analysis","level":1,"score":0.0},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.0},{"id":"https://openalex.org/C162324750","wikidata":"https://www.wikidata.org/wiki/Q8134","display_name":"Economics","level":0,"score":0.0},{"id":"https://openalex.org/C34447519","wikidata":"https://www.wikidata.org/wiki/Q179522","display_name":"Market economy","level":1,"score":0.0},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.18653/v1/d18-1099","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/d18-1099","pdf_url":"https://www.aclweb.org/anthology/D18-1099.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.18653/v1/d18-1099","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/d18-1099","pdf_url":"https://www.aclweb.org/anthology/D18-1099.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing","raw_type":"proceedings-article"},"sustainable_development_goals":[{"display_name":"Quality Education","score":0.6299999952316284,"id":"https://metadata.un.org/sdg/4"}],"awards":[{"id":"https://openalex.org/G848032724","display_name":null,"funder_award_id":"Science","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"}],"funders":[{"id":"https://openalex.org/F4320306076","display_name":"National Science Foundation","ror":"https://ror.org/021nxhr62"}],"has_content":{"pdf":true,"grobid_xml":true},"content_urls":{"pdf":"https://content.openalex.org/works/W2890174976.pdf","grobid_xml":"https://content.openalex.org/works/W2890174976.grobid-xml"},"referenced_works_count":17,"referenced_works":["https://openalex.org/W168564468","https://openalex.org/W205532704","https://openalex.org/W757417731","https://openalex.org/W1496244604","https://openalex.org/W1553229631","https://openalex.org/W1567298333","https://openalex.org/W1675962973","https://openalex.org/W2080132606","https://openalex.org/W2101234009","https://openalex.org/W2123442489","https://openalex.org/W2133990480","https://openalex.org/W2250601901","https://openalex.org/W2252264672","https://openalex.org/W2271840356","https://openalex.org/W2402929825","https://openalex.org/W2517394750","https://openalex.org/W2962837431"],"related_works":["https://openalex.org/W2003578783","https://openalex.org/W4234466702","https://openalex.org/W2362437884","https://openalex.org/W2004087619","https://openalex.org/W4321472216","https://openalex.org/W2557094866","https://openalex.org/W3140094074","https://openalex.org/W3121219210","https://openalex.org/W2469016277","https://openalex.org/W627285271"],"abstract_inverted_index":{"The":[0,136],"text":[1,37,106],"in":[2,52,58,86],"many":[3],"web":[4,81,120],"documents":[5,82],"is":[6,33,42],"organized":[7],"into":[8,107],"a":[9,18,53,67,132,153],"hierarchy":[10],"of":[11,55,80,84,119,129,134,147,156],"section":[12,75,108],"titles":[13,76,109],"and":[14,28,39,77,97,110,131,165],"corresponding":[15],"prose":[16,78,111],"content,":[17],"structure":[19,27],"which":[20,103],"provides":[21],"potentially":[22],"exploitable":[23],"information":[24,163],"on":[25,115],"discourse":[26,96],"topicality.":[29],"However,":[30],"this":[31],"organization":[32,48,79],"generally":[34],"discarded":[35],"during":[36],"collection,":[38],"collecting":[40],"it":[41],"not":[43],"straightforward:":[44],"the":[45,59,73,145],"same":[46],"visual":[47],"can":[49],"be":[50],"implemented":[51],"myriad":[54],"different":[56,117],"ways":[57],"underlying":[60],"HTML.":[61],"To":[62],"remedy":[63],"this,":[64],"we":[65],"present":[66],"flexible":[68],"system":[69,90,124],"for":[70,159],"automatically":[71],"extracting":[72],"hierarchical":[74],"irrespective":[83],"differences":[85],"HTML":[87,105],"representation.":[88],"This":[89],"uses":[91],"features":[92],"from":[93],"syntax,":[94],"semantics,":[95],"markup":[98],"to":[99],"build":[100],"two":[101],"models":[102],"classify":[104],"text.":[112],"When":[113],"tested":[114],"three":[116],"domains":[118],"text,":[121],"our":[122],"domainindependent":[123],"achieves":[125],"an":[126],"overall":[127],"precision":[128,142],"0.82":[130],"recall":[133,148],"0.98.":[135],"domaindependent":[137],"variation":[138],"produces":[139],"very":[140],"high":[141],"(0.99)":[143],"at":[144],"expense":[146],"(0.75).":[149],"These":[150],"results":[151],"exhibit":[152],"robust":[154],"level":[155],"accuracy":[157],"suitable":[158],"enhancing":[160],"question":[161],"answering,":[162],"extraction,":[164],"summarization.":[166],"1":[167]},"counts_by_year":[{"year":2025,"cited_by_count":1},{"year":2024,"cited_by_count":2},{"year":2023,"cited_by_count":3},{"year":2022,"cited_by_count":1},{"year":2021,"cited_by_count":4},{"year":2020,"cited_by_count":5},{"year":2019,"cited_by_count":3}],"updated_date":"2026-04-10T15:06:20.359241","created_date":"2025-10-10T00:00:00"}
