{"id":"https://openalex.org/W2750856415","doi":"https://doi.org/10.1145/3183713.3183746","title":"Navigating the Data Lake with DATAMARAN","display_name":"Navigating the Data Lake with DATAMARAN","publication_year":2018,"publication_date":"2018-05-25","ids":{"openalex":"https://openalex.org/W2750856415","doi":"https://doi.org/10.1145/3183713.3183746","mag":"2750856415"},"language":"en","primary_location":{"id":"doi:10.1145/3183713.3183746","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3183713.3183746","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3183713.3183746","source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2018 International Conference on Management of Data","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://dl.acm.org/doi/pdf/10.1145/3183713.3183746","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5033996984","display_name":"Yihan Gao","orcid":"https://orcid.org/0009-0001-7254-7799"},"institutions":[{"id":"https://openalex.org/I157725225","display_name":"University of Illinois Urbana-Champaign","ror":"https://ror.org/047426m28","country_code":"US","type":"education","lineage":["https://openalex.org/I157725225"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Yihan Gao","raw_affiliation_strings":["University of Illinois at Urbana-Champaign, Urbana, IL, USA"],"affiliations":[{"raw_affiliation_string":"University of Illinois at Urbana-Champaign, Urbana, IL, USA","institution_ids":["https://openalex.org/I157725225"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5090828085","display_name":"Silu Huang","orcid":"https://orcid.org/0000-0002-5291-0167"},"institutions":[{"id":"https://openalex.org/I157725225","display_name":"University of Illinois Urbana-Champaign","ror":"https://ror.org/047426m28","country_code":"US","type":"education","lineage":["https://openalex.org/I157725225"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Silu Huang","raw_affiliation_strings":["University of Illinois at Urbana-Champaign, Urbana, IL, USA"],"affiliations":[{"raw_affiliation_string":"University of Illinois at Urbana-Champaign, Urbana, IL, USA","institution_ids":["https://openalex.org/I157725225"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5013608601","display_name":"Aditya Parameswaran","orcid":"https://orcid.org/0000-0002-4538-4752"},"institutions":[{"id":"https://openalex.org/I157725225","display_name":"University of Illinois Urbana-Champaign","ror":"https://ror.org/047426m28","country_code":"US","type":"education","lineage":["https://openalex.org/I157725225"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Aditya Parameswaran","raw_affiliation_strings":["University of Illinois at Urbana-Champaign, Urbana, IL, USA"],"affiliations":[{"raw_affiliation_string":"University of Illinois at Urbana-Champaign, Urbana, IL, USA","institution_ids":["https://openalex.org/I157725225"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5033996984"],"corresponding_institution_ids":["https://openalex.org/I157725225"],"apc_list":null,"apc_paid":null,"fwci":5.2948,"has_fulltext":true,"cited_by_count":49,"citation_normalized_percentile":{"value":0.9548074,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":95,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"943","last_page":"958"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11719","display_name":"Data Quality and Management","score":0.9995999932289124,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T11719","display_name":"Data Quality and Management","score":0.9995999932289124,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T12127","display_name":"Software System Performance and Reliability","score":0.9993000030517578,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12016","display_name":"Web Data Mining and Analysis","score":0.9976999759674072,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8334586024284363},{"id":"https://openalex.org/keywords/leverage","display_name":"Leverage (statistics)","score":0.6494561433792114},{"id":"https://openalex.org/keywords/data-mining","display_name":"Data mining","score":0.5804607272148132},{"id":"https://openalex.org/keywords/code","display_name":"Code (set theory)","score":0.5189754366874695},{"id":"https://openalex.org/keywords/information-extraction","display_name":"Information extraction","score":0.4955655336380005},{"id":"https://openalex.org/keywords/semi-structured-data","display_name":"Semi-structured data","score":0.4855794310569763},{"id":"https://openalex.org/keywords/relational-database","display_name":"Relational database","score":0.3662155866622925},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.3255041837692261},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.28701961040496826}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8334586024284363},{"id":"https://openalex.org/C153083717","wikidata":"https://www.wikidata.org/wiki/Q6535263","display_name":"Leverage (statistics)","level":2,"score":0.6494561433792114},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.5804607272148132},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.5189754366874695},{"id":"https://openalex.org/C195807954","wikidata":"https://www.wikidata.org/wiki/Q1662562","display_name":"Information extraction","level":2,"score":0.4955655336380005},{"id":"https://openalex.org/C40077939","wikidata":"https://www.wikidata.org/wiki/Q2336004","display_name":"Semi-structured data","level":3,"score":0.4855794310569763},{"id":"https://openalex.org/C5655090","wikidata":"https://www.wikidata.org/wiki/Q192588","display_name":"Relational database","level":2,"score":0.3662155866622925},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.3255041837692261},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.28701961040496826},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.0},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3183713.3183746","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3183713.3183746","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3183713.3183746","source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2018 International Conference on Management of Data","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/3183713.3183746","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3183713.3183746","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3183713.3183746","source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2018 International Conference on Management of Data","raw_type":"proceedings-article"},"sustainable_development_goals":[{"score":0.550000011920929,"display_name":"Life below water","id":"https://metadata.un.org/sdg/14"}],"awards":[{"id":"https://openalex.org/G1010213750","display_name":"AitF: Collaborative Research: Fast, Accurate, and Practical: Adaptive Sublinear Algorithms for Scalable Visualization","funder_award_id":"1733878","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"},{"id":"https://openalex.org/G1391081008","display_name":null,"funder_award_id":"IIS-1513407","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"},{"id":"https://openalex.org/G3379649159","display_name":"CAREER: Advancing Open-Ended Crowdsourcing: The Next Frontier in Crowdsourced Data Management","funder_award_id":"1652750","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"},{"id":"https://openalex.org/G3908901195","display_name":null,"funder_award_id":"IIS-1513407, IIS-1633755, IIS-1733878 and IIS-1652750","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"},{"id":"https://openalex.org/G4334741966","display_name":"III: Medium: Collaborative Research: DataHub - A Collaborative Dataset Management Platform for Data Science","funder_award_id":"1513407","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"},{"id":"https://openalex.org/G5864486752","display_name":null,"funder_award_id":"1633755","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"},{"id":"https://openalex.org/G6872665753","display_name":null,"funder_award_id":"IIS-1652750","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"},{"id":"https://openalex.org/G7880343010","display_name":null,"funder_award_id":"IIS-1513407, IIS-1633755","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"}],"funders":[{"id":"https://openalex.org/F4320306076","display_name":"National Science Foundation","ror":"https://ror.org/021nxhr62"}],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W2750856415.pdf","grobid_xml":"https://content.openalex.org/works/W2750856415.grobid-xml"},"referenced_works_count":67,"referenced_works":["https://openalex.org/W38765094","https://openalex.org/W200042785","https://openalex.org/W203690387","https://openalex.org/W236085609","https://openalex.org/W1488315845","https://openalex.org/W1538375546","https://openalex.org/W1553019137","https://openalex.org/W1562942180","https://openalex.org/W1969621019","https://openalex.org/W1982280055","https://openalex.org/W1993141752","https://openalex.org/W1994584977","https://openalex.org/W1996505782","https://openalex.org/W1999361961","https://openalex.org/W2002956097","https://openalex.org/W2018506753","https://openalex.org/W2023673418","https://openalex.org/W2055228862","https://openalex.org/W2056325176","https://openalex.org/W2064766209","https://openalex.org/W2065568440","https://openalex.org/W2079594573","https://openalex.org/W2092364718","https://openalex.org/W2093559286","https://openalex.org/W2093752301","https://openalex.org/W2102098892","https://openalex.org/W2103931177","https://openalex.org/W2104042955","https://openalex.org/W2104086170","https://openalex.org/W2106895292","https://openalex.org/W2106950427","https://openalex.org/W2108223890","https://openalex.org/W2111869785","https://openalex.org/W2115056012","https://openalex.org/W2115461474","https://openalex.org/W2124410446","https://openalex.org/W2132525863","https://openalex.org/W2132667707","https://openalex.org/W2134172329","https://openalex.org/W2135767707","https://openalex.org/W2137435551","https://openalex.org/W2143309843","https://openalex.org/W2144951274","https://openalex.org/W2145007893","https://openalex.org/W2146105230","https://openalex.org/W2146753709","https://openalex.org/W2150721933","https://openalex.org/W2153130498","https://openalex.org/W2156049581","https://openalex.org/W2163072729","https://openalex.org/W2164119735","https://openalex.org/W2186090720","https://openalex.org/W2252238553","https://openalex.org/W2424304400","https://openalex.org/W2438792749","https://openalex.org/W2574230393","https://openalex.org/W2587580284","https://openalex.org/W2612824201","https://openalex.org/W2913389685","https://openalex.org/W3015882749","https://openalex.org/W3087355157","https://openalex.org/W4237412827","https://openalex.org/W4252980914","https://openalex.org/W4256244129","https://openalex.org/W4299301436","https://openalex.org/W6608051331","https://openalex.org/W6633664799"],"related_works":["https://openalex.org/W2098516643","https://openalex.org/W2576225315","https://openalex.org/W2527777278","https://openalex.org/W60511584","https://openalex.org/W4303984359","https://openalex.org/W2894526135","https://openalex.org/W1705148896","https://openalex.org/W2525788546","https://openalex.org/W2054295791","https://openalex.org/W3153191738"],"abstract_inverted_index":{"Organizations":[0],"routinely":[1],"accumulate":[2],"semi-structured":[3,37,68,111],"log":[4,69,112,123,150,180],"datasets":[5,13,38,70,109,166,181],"generated":[6],"as":[7,28],"the":[8,83,87,135,148,202,210],"output":[9],"of":[10,187,205],"code;":[11],"these":[12,36],"remain":[14],"unused":[15],"and":[16,18,79,92,171],"uninterpreted,":[17],"occupy":[19],"wasted":[20],"space---this":[21],"phenomenon":[22],"has":[23],"been":[24],"colloquially":[25],"referred":[26],"to":[27,34,40,104,120,138,147,190,209],"\"data":[29],"lake''":[30],"problem.":[31],"One":[32],"approach":[33],"leverage":[35],"is":[39],"convert":[41],"them":[42],"into":[43],"a":[44,100],"structured":[45,84,107,162],"relational":[46,108],"format,":[47],"following":[48],"which":[49],"they":[50],"can":[51,93,159,172],"be":[52,139],"analyzed":[53],"in":[54,102,128,155,168],"conjunction":[55],"with":[56,71,116],"other":[57,121],"datasets.":[58],"We":[59],"present":[60],"DATAMARAN,":[61],"an":[62],"tool":[63],"that":[64,152,201],"extracts":[65],"structure":[66,212],"from":[67,86,98,110,164,182,193],"no":[72],"human":[73],"supervision.":[74],"DATAMARAN":[75,131,158,206],"automatically":[76,178],"identifies":[77],"field":[78],"record":[80,136],"endpoints,":[81],"separates":[82],"parts":[85],"unstructured":[88],"noise":[89],"or":[90],"formatting,":[91],"tease":[94],"apart":[95],"multiple":[96],"structures":[97],"within":[99],"dataset,":[101],"order":[103],"efficiently":[105],"extract":[106,161],"datasets,":[113],"at":[114],"scale":[115],"high":[117],"accuracy.":[118],"Compared":[119],"unsupervised":[122,191],"dataset":[124],"extraction":[125,175,203],"tools":[126],"developed":[127],"prior":[129,169,194],"work,":[130,170],"does":[132],"not":[133],"require":[134],"boundaries":[137],"known":[140],"beforehand,":[141],"making":[142],"it":[143],"much":[144],"more":[145],"applicable":[146],"noisy":[149],"files":[151],"are":[153,207],"ubiquitous":[154],"data":[156],"lakes.":[157],"successfully":[160],"information":[163],"all":[165],"used":[167],"achieve":[173],"95%":[174],"accuracy":[176,188],"on":[177],"collected":[179],"GitHub---a":[183],"substantial":[184],"66%":[185],"increase":[186],"compared":[189],"schemes":[192],"work.":[195],"Our":[196],"user":[197],"study":[198],"further":[199],"demonstrates":[200],"results":[204],"closer":[208],"desired":[211],"than":[213],"competing":[214],"algorithms.":[215]},"counts_by_year":[{"year":2025,"cited_by_count":6},{"year":2024,"cited_by_count":4},{"year":2023,"cited_by_count":9},{"year":2022,"cited_by_count":6},{"year":2021,"cited_by_count":3},{"year":2020,"cited_by_count":5},{"year":2019,"cited_by_count":12},{"year":2018,"cited_by_count":4}],"updated_date":"2026-04-10T15:06:20.359241","created_date":"2025-10-10T00:00:00"}
