{"id":"https://openalex.org/W1970369278","doi":"https://doi.org/10.1145/2509558.2509576","title":"Unsupervised discovery and extraction of semi-structured regions in text via self-information","display_name":"Unsupervised discovery and extraction of semi-structured regions in text via self-information","publication_year":2013,"publication_date":"2013-10-27","ids":{"openalex":"https://openalex.org/W1970369278","doi":"https://doi.org/10.1145/2509558.2509576","mag":"1970369278"},"language":"en","primary_location":{"id":"doi:10.1145/2509558.2509576","is_oa":false,"landing_page_url":"https://doi.org/10.1145/2509558.2509576","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2013 workshop on Automated knowledge base construction","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5026603897","display_name":"Eric Yeh","orcid":"https://orcid.org/0000-0001-8752-4429"},"institutions":[{"id":"https://openalex.org/I1298353152","display_name":"SRI International","ror":"https://ror.org/05s570m15","country_code":"US","type":"nonprofit","lineage":["https://openalex.org/I1298353152"]},{"id":"https://openalex.org/I4210099336","display_name":"Menlo School","ror":"https://ror.org/01240pn49","country_code":"US","type":"education","lineage":["https://openalex.org/I4210099336"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Eric Yeh","raw_affiliation_strings":["SRI International, Menlo Park, CA, USA","SRI-International, Menlo Park, CA, USA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"SRI International, Menlo Park, CA, USA","institution_ids":["https://openalex.org/I1298353152","https://openalex.org/I4210099336"]},{"raw_affiliation_string":"SRI-International, Menlo Park, CA, USA","institution_ids":["https://openalex.org/I1298353152","https://openalex.org/I4210099336"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5044008629","display_name":"John Niekrasz","orcid":null},"institutions":[{"id":"https://openalex.org/I1298353152","display_name":"SRI International","ror":"https://ror.org/05s570m15","country_code":"US","type":"nonprofit","lineage":["https://openalex.org/I1298353152"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"John Niekrasz","raw_affiliation_strings":["SRI International, San Diego, CA, USA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"SRI International, San Diego, CA, USA","institution_ids":["https://openalex.org/I1298353152"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5080437721","display_name":"Dayne Freitag","orcid":"https://orcid.org/0009-0001-0016-5097"},"institutions":[{"id":"https://openalex.org/I1298353152","display_name":"SRI International","ror":"https://ror.org/05s570m15","country_code":"US","type":"nonprofit","lineage":["https://openalex.org/I1298353152"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Dayne Freitag","raw_affiliation_strings":["SRI International, San Diego, CA, USA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"SRI International, San Diego, CA, USA","institution_ids":["https://openalex.org/I1298353152"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.8362,"has_fulltext":false,"cited_by_count":3,"citation_normalized_percentile":{"value":0.8182333,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":90,"max":94},"biblio":{"volume":null,"issue":null,"first_page":"103","last_page":"108"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12016","display_name":"Web Data Mining and Analysis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12016","display_name":"Web Data Mining and Analysis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11550","display_name":"Text and Document Classification Technologies","score":0.9929999709129333,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9865999817848206,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7886923551559448},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.6588187217712402},{"id":"https://openalex.org/keywords/schema","display_name":"Schema (genetic algorithms)","score":0.6456494331359863},{"id":"https://openalex.org/keywords/presentational-and-representational-acting","display_name":"Presentational and representational acting","score":0.6358027458190918},{"id":"https://openalex.org/keywords/encode","display_name":"ENCODE","score":0.5919550061225891},{"id":"https://openalex.org/keywords/information-extraction","display_name":"Information extraction","score":0.5637891292572021},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.561767578125},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.5422881245613098},{"id":"https://openalex.org/keywords/measure","display_name":"Measure (data warehouse)","score":0.4738995134830475},{"id":"https://openalex.org/keywords/document-structure-description","display_name":"Document Structure Description","score":0.42641162872314453},{"id":"https://openalex.org/keywords/data-mining","display_name":"Data mining","score":0.2153528332710266},{"id":"https://openalex.org/keywords/linguistics","display_name":"Linguistics","score":0.13722041249275208},{"id":"https://openalex.org/keywords/xml","display_name":"XML","score":0.12532129883766174}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7886923551559448},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.6588187217712402},{"id":"https://openalex.org/C52146309","wikidata":"https://www.wikidata.org/wiki/Q7431116","display_name":"Schema (genetic algorithms)","level":2,"score":0.6456494331359863},{"id":"https://openalex.org/C74039378","wikidata":"https://www.wikidata.org/wiki/Q7240997","display_name":"Presentational and representational acting","level":2,"score":0.6358027458190918},{"id":"https://openalex.org/C66746571","wikidata":"https://www.wikidata.org/wiki/Q1134833","display_name":"ENCODE","level":3,"score":0.5919550061225891},{"id":"https://openalex.org/C195807954","wikidata":"https://www.wikidata.org/wiki/Q1662562","display_name":"Information extraction","level":2,"score":0.5637891292572021},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.561767578125},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.5422881245613098},{"id":"https://openalex.org/C2780009758","wikidata":"https://www.wikidata.org/wiki/Q6804172","display_name":"Measure (data warehouse)","level":2,"score":0.4738995134830475},{"id":"https://openalex.org/C68699486","wikidata":"https://www.wikidata.org/wiki/Q265904","display_name":"Document Structure Description","level":3,"score":0.42641162872314453},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.2153528332710266},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.13722041249275208},{"id":"https://openalex.org/C8797682","wikidata":"https://www.wikidata.org/wiki/Q2115","display_name":"XML","level":2,"score":0.12532129883766174},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C104317684","wikidata":"https://www.wikidata.org/wiki/Q7187","display_name":"Gene","level":2,"score":0.0},{"id":"https://openalex.org/C55493867","wikidata":"https://www.wikidata.org/wiki/Q7094","display_name":"Biochemistry","level":1,"score":0.0},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/2509558.2509576","is_oa":false,"landing_page_url":"https://doi.org/10.1145/2509558.2509576","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2013 workshop on Automated knowledge base construction","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"score":0.8199999928474426,"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":10,"referenced_works":["https://openalex.org/W1574901103","https://openalex.org/W1982280055","https://openalex.org/W1996430422","https://openalex.org/W2009659675","https://openalex.org/W2034797903","https://openalex.org/W2077630735","https://openalex.org/W2155869763","https://openalex.org/W2162340487","https://openalex.org/W3044706601","https://openalex.org/W4244046749"],"related_works":["https://openalex.org/W436149066","https://openalex.org/W2354188475","https://openalex.org/W1999666114","https://openalex.org/W2358047786","https://openalex.org/W2379663001","https://openalex.org/W2081534890","https://openalex.org/W2468279273","https://openalex.org/W3011615320","https://openalex.org/W1986883381","https://openalex.org/W2296700935"],"abstract_inverted_index":{"We":[0,188,227],"describe":[1,189],"a":[2,17,80,90,122,159,171,208,243],"general":[3],"method":[4],"for":[5,66,98],"identifying":[6,99],"and":[7,30,38,131,185,214,240],"extracting":[8],"information":[9,24,68],"from":[10,178,211],"semi-structured":[11,100,117,153,205],"regions":[12,22,62,101,118,154],"of":[13,34,41,53,56,93,126,148,164,182,192,200,245],"text":[14,206],"embedded":[15],"within":[16],"natural":[18],"language":[19],"document.":[20,113],"These":[21],"encode":[23,156],"according":[25],"to":[26,88,138,146,196,203,236],"ad":[27],"hoc":[28],"schemas":[29],"visual":[31,132,186],"cues,":[32],"instead":[33],"using":[35],"the":[36,112,124,190,198,212],"grammatical":[37],"presentational":[39],"conventions":[40],"normal":[42],"sentential":[43,142],"language.":[44,143],"Examples":[45],"include":[46],"tables,":[47],"key-value":[48],"listings,":[49],"or":[50,162],"repeated":[51],"enumerations":[52],"properties.":[54],"Because":[55],"their":[57],"generally":[58],"non-sentential":[59],"nature,":[60],"these":[61,149,201,233],"can":[63],"present":[64,170,228],"problems":[65],"standard":[67],"extraction":[69,238],"algorithms.":[70],"Unlike":[71],"previous":[72],"work":[73,230],"in":[74,116,141,207],"table":[75],"extraction,":[76],"which":[77],"relies":[78],"on":[79,107,174,222],"relatively":[81],"noiseless":[82],"two-dimensional":[83],"layout,":[84],"our":[85],"aim":[86],"is":[87,102],"accommodate":[89],"wide":[91],"variety":[92],"structure":[94],"types.":[95],"Our":[96],"approach":[97],"an":[103,193,223],"unsupervised":[104],"one,":[105],"based":[106,173],"scoring":[108],"unusual":[109,136,150],"regularity":[110],"inside":[111],"As":[114],"content":[115,130],"are":[119],"governed":[120],"by":[121],"schema,":[123],"occurrence":[125],"features":[127],"encompassing":[128],"textual":[129,183],"appearance":[133],"would":[134],"be":[135],"compared":[137],"those":[139],"seen":[140],"Regularity":[144],"refers":[145],"repetition":[147],"features,":[151],"as":[152],"commonly":[155],"more":[157],"than":[158],"single":[160],"row":[161],"group":[163],"information.":[165],"To":[166],"score":[167],"this,":[168],"we":[169],"measure":[172,218],"expected":[175],"self-information,":[176],"derived":[177],"statistics":[179],"over":[180],"patterns":[181,235],"categories":[184],"layout.":[187],"results":[191],"initial":[194,229],"study":[195],"assess":[197],"ability":[199],"measures":[202],"detect":[204],"corpus":[209],"culled":[210],"web,":[213],"show":[215],"that":[216,231],"this":[217],"outperform":[219],"baseline":[220],"methods":[221],"average":[224],"precision":[225],"measure.":[226],"uses":[232],"significant":[234],"generate":[237],"rules,":[239],"conclude":[241],"with":[242],"discussion":[244],"future":[246],"directions.":[247]},"counts_by_year":[{"year":2018,"cited_by_count":1},{"year":2017,"cited_by_count":1},{"year":2016,"cited_by_count":1}],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-10T00:00:00"}
