{"id":"https://openalex.org/W2440094130","doi":"https://doi.org/10.1145/2882903.2904442","title":"Extracting Databases from Dark Data with DeepDive","display_name":"Extracting Databases from Dark Data with DeepDive","publication_year":2016,"publication_date":"2016-06-14","ids":{"openalex":"https://openalex.org/W2440094130","doi":"https://doi.org/10.1145/2882903.2904442","mag":"2440094130","pmid":"https://pubmed.ncbi.nlm.nih.gov/28316365"},"language":"en","primary_location":{"id":"doi:10.1145/2882903.2904442","is_oa":false,"landing_page_url":"https://doi.org/10.1145/2882903.2904442","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2016 International Conference on Management of Data","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref","pubmed"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://www.ncbi.nlm.nih.gov/pmc/articles/5350112","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100383731","display_name":"Ce Zhang","orcid":"https://orcid.org/0000-0002-8105-7505"},"institutions":[{"id":"https://openalex.org/I1743320","display_name":"Palo Alto University","ror":"https://ror.org/04f812k67","country_code":"US","type":"education","lineage":["https://openalex.org/I1743320"]},{"id":"https://openalex.org/I97018004","display_name":"Stanford University","ror":"https://ror.org/00f54p054","country_code":"US","type":"education","lineage":["https://openalex.org/I97018004"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Ce Zhang","raw_affiliation_strings":["Stanford University, Palo Alto, CA, USA"],"affiliations":[{"raw_affiliation_string":"Stanford University, Palo Alto, CA, USA","institution_ids":["https://openalex.org/I1743320","https://openalex.org/I97018004"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101727253","display_name":"Jaeho Shin","orcid":"https://orcid.org/0000-0001-5280-3356"},"institutions":[{"id":"https://openalex.org/I1743320","display_name":"Palo Alto University","ror":"https://ror.org/04f812k67","country_code":"US","type":"education","lineage":["https://openalex.org/I1743320"]},{"id":"https://openalex.org/I97018004","display_name":"Stanford University","ror":"https://ror.org/00f54p054","country_code":"US","type":"education","lineage":["https://openalex.org/I97018004"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Jaeho Shin","raw_affiliation_strings":["Stanford University, Palo Alto, CA, USA"],"affiliations":[{"raw_affiliation_string":"Stanford University, Palo Alto, CA, USA","institution_ids":["https://openalex.org/I1743320","https://openalex.org/I97018004"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103852640","display_name":"Christopher R\u00e9","orcid":null},"institutions":[{"id":"https://openalex.org/I1743320","display_name":"Palo Alto University","ror":"https://ror.org/04f812k67","country_code":"US","type":"education","lineage":["https://openalex.org/I1743320"]},{"id":"https://openalex.org/I97018004","display_name":"Stanford University","ror":"https://ror.org/00f54p054","country_code":"US","type":"education","lineage":["https://openalex.org/I97018004"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Christopher R\u00e9","raw_affiliation_strings":["Stanford University, Palo Alto, CA, USA"],"affiliations":[{"raw_affiliation_string":"Stanford University, Palo Alto, CA, USA","institution_ids":["https://openalex.org/I1743320","https://openalex.org/I97018004"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5039133265","display_name":"Michael Cafarella","orcid":"https://orcid.org/0000-0001-6122-0590"},"institutions":[{"id":"https://openalex.org/I27837315","display_name":"University of Michigan\u2013Ann Arbor","ror":"https://ror.org/00jmfr291","country_code":"US","type":"education","lineage":["https://openalex.org/I27837315"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Michael Cafarella","raw_affiliation_strings":["University of Michigan, Ann Arbor, MI, USA"],"affiliations":[{"raw_affiliation_string":"University of Michigan, Ann Arbor, MI, USA","institution_ids":["https://openalex.org/I27837315"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5039949603","display_name":"Feng Niu","orcid":"https://orcid.org/0000-0003-2465-647X"},"institutions":[{"id":"https://openalex.org/I116921496","display_name":"Lattice Semiconductor (United States)","ror":"https://ror.org/01hght844","country_code":"US","type":"company","lineage":["https://openalex.org/I116921496"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Feng Niu","raw_affiliation_strings":["Lattice Data, Inc., Palo Alto, CA, USA"],"affiliations":[{"raw_affiliation_string":"Lattice Data, Inc., Palo Alto, CA, USA","institution_ids":["https://openalex.org/I116921496"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5100383731"],"corresponding_institution_ids":["https://openalex.org/I1743320","https://openalex.org/I97018004"],"apc_list":null,"apc_paid":null,"fwci":10.6647,"has_fulltext":false,"cited_by_count":47,"citation_normalized_percentile":{"value":0.98233066,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":91,"max":99},"biblio":{"volume":"2016","issue":null,"first_page":"847","last_page":"859"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11512","display_name":"Anomaly Detection Techniques and Applications","score":0.9984999895095825,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11512","display_name":"Anomaly Detection Techniques and Applications","score":0.9984999895095825,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11719","display_name":"Data Quality and Management","score":0.9939000010490417,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9901000261306763,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7873625755310059},{"id":"https://openalex.org/keywords/relational-database","display_name":"Relational database","score":0.749911904335022},{"id":"https://openalex.org/keywords/exploit","display_name":"Exploit","score":0.5982358455657959},{"id":"https://openalex.org/keywords/relational-database-management-system","display_name":"Relational database management system","score":0.566270112991333},{"id":"https://openalex.org/keywords/set","display_name":"Set (abstract data type)","score":0.5253188610076904},{"id":"https://openalex.org/keywords/big-data","display_name":"Big data","score":0.5158664584159851},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.49516716599464417},{"id":"https://openalex.org/keywords/deep-web","display_name":"Deep Web","score":0.44082069396972656},{"id":"https://openalex.org/keywords/database-model","display_name":"Database model","score":0.43737170100212097},{"id":"https://openalex.org/keywords/database-design","display_name":"Database design","score":0.43445074558258057},{"id":"https://openalex.org/keywords/database","display_name":"Database","score":0.4033740758895874},{"id":"https://openalex.org/keywords/world-wide-web","display_name":"World Wide Web","score":0.31780165433883667},{"id":"https://openalex.org/keywords/data-mining","display_name":"Data mining","score":0.2819443643093109},{"id":"https://openalex.org/keywords/the-internet","display_name":"The Internet","score":0.1150963306427002}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7873625755310059},{"id":"https://openalex.org/C5655090","wikidata":"https://www.wikidata.org/wiki/Q192588","display_name":"Relational database","level":2,"score":0.749911904335022},{"id":"https://openalex.org/C165696696","wikidata":"https://www.wikidata.org/wiki/Q11287","display_name":"Exploit","level":2,"score":0.5982358455657959},{"id":"https://openalex.org/C24394798","wikidata":"https://www.wikidata.org/wiki/Q192588","display_name":"Relational database management system","level":3,"score":0.566270112991333},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.5253188610076904},{"id":"https://openalex.org/C75684735","wikidata":"https://www.wikidata.org/wiki/Q858810","display_name":"Big data","level":2,"score":0.5158664584159851},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.49516716599464417},{"id":"https://openalex.org/C46721378","wikidata":"https://www.wikidata.org/wiki/Q221989","display_name":"Deep Web","level":3,"score":0.44082069396972656},{"id":"https://openalex.org/C5968703","wikidata":"https://www.wikidata.org/wiki/Q267136","display_name":"Database model","level":3,"score":0.43737170100212097},{"id":"https://openalex.org/C148840519","wikidata":"https://www.wikidata.org/wiki/Q1049878","display_name":"Database design","level":2,"score":0.43445074558258057},{"id":"https://openalex.org/C77088390","wikidata":"https://www.wikidata.org/wiki/Q8513","display_name":"Database","level":1,"score":0.4033740758895874},{"id":"https://openalex.org/C136764020","wikidata":"https://www.wikidata.org/wiki/Q466","display_name":"World Wide Web","level":1,"score":0.31780165433883667},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.2819443643093109},{"id":"https://openalex.org/C110875604","wikidata":"https://www.wikidata.org/wiki/Q75","display_name":"The Internet","level":2,"score":0.1150963306427002},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.0},{"id":"https://openalex.org/C38652104","wikidata":"https://www.wikidata.org/wiki/Q3510521","display_name":"Computer security","level":1,"score":0.0}],"mesh":[],"locations_count":3,"locations":[{"id":"doi:10.1145/2882903.2904442","is_oa":false,"landing_page_url":"https://doi.org/10.1145/2882903.2904442","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2016 International Conference on Management of Data","raw_type":"proceedings-article"},{"id":"pmid:28316365","is_oa":false,"landing_page_url":"https://pubmed.ncbi.nlm.nih.gov/28316365","pdf_url":null,"source":{"id":"https://openalex.org/S4306525036","display_name":"PubMed","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I1299303238","host_organization_name":"National Institutes of Health","host_organization_lineage":["https://openalex.org/I1299303238"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings. ACM-SIGMOD International Conference on Management of Data","raw_type":null},{"id":"pmh:oai:pubmedcentral.nih.gov:5350112","is_oa":true,"landing_page_url":"https://www.ncbi.nlm.nih.gov/pmc/articles/5350112","pdf_url":null,"source":{"id":"https://openalex.org/S2764455111","display_name":"PubMed Central","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I1299303238","host_organization_name":"National Institutes of Health","host_organization_lineage":["https://openalex.org/I1299303238"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"Proc ACM SIGMOD Int Conf Manag Data","raw_type":"Text"}],"best_oa_location":{"id":"pmh:oai:pubmedcentral.nih.gov:5350112","is_oa":true,"landing_page_url":"https://www.ncbi.nlm.nih.gov/pmc/articles/5350112","pdf_url":null,"source":{"id":"https://openalex.org/S2764455111","display_name":"PubMed Central","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I1299303238","host_organization_name":"National Institutes of Health","host_organization_lineage":["https://openalex.org/I1299303238"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"Proc ACM SIGMOD Int Conf Manag Data","raw_type":"Text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":71,"referenced_works":["https://openalex.org/W104683736","https://openalex.org/W130710483","https://openalex.org/W314565566","https://openalex.org/W1435924991","https://openalex.org/W1489949474","https://openalex.org/W1493490255","https://openalex.org/W1512387364","https://openalex.org/W1515300998","https://openalex.org/W1565102206","https://openalex.org/W1599188306","https://openalex.org/W1604644367","https://openalex.org/W1788418780","https://openalex.org/W1900340173","https://openalex.org/W1954715867","https://openalex.org/W1965685479","https://openalex.org/W1985093013","https://openalex.org/W2006149654","https://openalex.org/W2012670464","https://openalex.org/W2036287059","https://openalex.org/W2045495924","https://openalex.org/W2052569640","https://openalex.org/W2064466058","https://openalex.org/W2068737686","https://openalex.org/W2081210343","https://openalex.org/W2093808275","https://openalex.org/W2098679902","https://openalex.org/W2100007248","https://openalex.org/W2101108755","https://openalex.org/W2102267996","https://openalex.org/W2103018059","https://openalex.org/W2107598941","https://openalex.org/W2115461474","https://openalex.org/W2118038484","https://openalex.org/W2120143966","https://openalex.org/W2120340025","https://openalex.org/W2120615054","https://openalex.org/W2125972432","https://openalex.org/W2129629757","https://openalex.org/W2132679783","https://openalex.org/W2135209143","https://openalex.org/W2135779000","https://openalex.org/W2135912864","https://openalex.org/W2138243089","https://openalex.org/W2144416276","https://openalex.org/W2144810465","https://openalex.org/W2145453687","https://openalex.org/W2150588363","https://openalex.org/W2166706236","https://openalex.org/W2167571757","https://openalex.org/W2250545651","https://openalex.org/W2251419334","https://openalex.org/W2251960799","https://openalex.org/W2268628077","https://openalex.org/W2396924315","https://openalex.org/W2401642934","https://openalex.org/W2406996511","https://openalex.org/W2521714068","https://openalex.org/W2560674852","https://openalex.org/W2604738282","https://openalex.org/W2951781666","https://openalex.org/W2964244261","https://openalex.org/W4292691288","https://openalex.org/W4293052541","https://openalex.org/W6628630383","https://openalex.org/W6629638141","https://openalex.org/W6635684370","https://openalex.org/W6678578999","https://openalex.org/W6680029872","https://openalex.org/W6684249991","https://openalex.org/W6713468891","https://openalex.org/W6841455387"],"related_works":["https://openalex.org/W3085968255","https://openalex.org/W2483382562","https://openalex.org/W2512945146","https://openalex.org/W2460926837","https://openalex.org/W2762065037","https://openalex.org/W2626533837","https://openalex.org/W2367328361","https://openalex.org/W296732031","https://openalex.org/W2403346309","https://openalex.org/W41925032"],"abstract_inverted_index":{"DeepDive":[0,71,103,121,140,151],"is":[1,72,152,171],"a":[2,55,62,96,142,164],"system":[3],"for":[4,126,145],"extracting":[5],"relational":[6,31,56],"databases":[7,106],"from":[8],"<i>dark":[9],"data</i>:":[10],"the":[11,34],"mass":[12],"of":[13,68,98,112],"text,":[14],"tables,":[15],"and":[16,22,48,64,89,134,148,180],"images":[17],"that":[18,109,111,158],"are":[19],"widely":[20],"collected":[21],"stored":[23],"but":[24],"which":[25],"cannot":[26],"be":[27],"exploited":[28],"by":[29,139,154,173],"standard":[30],"tools.":[32],"If":[33],"information":[35,78],"in":[36,54,81,95],"dark":[37],"data":[38,137],"-":[39,51],"scientific":[40,149],"papers,":[41],"Web":[42],"classified":[43],"ads,":[44],"customer":[45],"service":[46],"notes,":[47],"so":[49],"on":[50],"were":[52],"instead":[53],"database,":[57],"it":[58],"would":[59],"give":[60],"analysts":[61],"massive":[63,143],"valuable":[65],"new":[66],"set":[67],"\"big":[69],"data.\"":[70],"distinctive":[73],"when":[74],"compared":[75],"to":[76,84,104,122],"previous":[77],"extraction":[79],"systems":[80],"its":[82],"ability":[83],"obtain":[85],"very":[86],"high":[87],"precision":[88],"recall":[90],"at":[91],"reasonable":[92],"engineering":[93],"cost;":[94],"number":[97],"applications,":[99],"we":[100,117],"have":[101,118],"used":[102],"create":[105,123],"with":[107,163],"accuracy":[108],"meets":[110],"human":[113],"annotators.":[114],"To":[115],"date":[116],"successfully":[119],"deployed":[120],"data-centric":[124],"applications":[125],"insurance,":[127],"materials":[128],"science,":[129],"genomics,":[130],"paleontologists,":[131],"law":[132],"enforcement,":[133],"others.":[135],"The":[136],"unlocked":[138],"represents":[141],"opportunity":[144],"industry,":[146],"government,":[147],"researchers.":[150],"enabled":[153,172],"an":[155],"unusual":[156],"design":[157,170],"combines":[159],"large-scale":[160],"probabilistic":[161,178],"inference":[162],"novel":[165],"developer":[166],"interaction":[167],"cycle.":[168],"This":[169],"several":[174],"core":[175],"innovations":[176],"around":[177],"training":[179],"inference.":[181]},"counts_by_year":[{"year":2025,"cited_by_count":1},{"year":2024,"cited_by_count":7},{"year":2023,"cited_by_count":3},{"year":2022,"cited_by_count":3},{"year":2021,"cited_by_count":4},{"year":2020,"cited_by_count":5},{"year":2019,"cited_by_count":11},{"year":2018,"cited_by_count":5},{"year":2017,"cited_by_count":5},{"year":2016,"cited_by_count":3}],"updated_date":"2026-03-20T23:20:44.827607","created_date":"2025-10-10T00:00:00"}
