{"id":"https://openalex.org/W2947571256","doi":"https://doi.org/10.1145/3299869.3319898","title":"Speculative Distributed CSV Data Parsing for Big Data Analytics","display_name":"Speculative Distributed CSV Data Parsing for Big Data Analytics","publication_year":2019,"publication_date":"2019-06-18","ids":{"openalex":"https://openalex.org/W2947571256","doi":"https://doi.org/10.1145/3299869.3319898","mag":"2947571256"},"language":"en","primary_location":{"id":"doi:10.1145/3299869.3319898","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3299869.3319898","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2019 International Conference on Management of Data","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5030542531","display_name":"Chang Ge","orcid":"https://orcid.org/0000-0001-8788-4379"},"institutions":[{"id":"https://openalex.org/I151746483","display_name":"University of Waterloo","ror":"https://ror.org/01aff2v68","country_code":"CA","type":"education","lineage":["https://openalex.org/I151746483"]}],"countries":["CA"],"is_corresponding":true,"raw_author_name":"Chang Ge","raw_affiliation_strings":["University of Waterloo, Waterloo, ON, Canada"],"affiliations":[{"raw_affiliation_string":"University of Waterloo, Waterloo, ON, Canada","institution_ids":["https://openalex.org/I151746483"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100599394","display_name":"Yinan Li","orcid":"https://orcid.org/0000-0001-7137-7823"},"institutions":[{"id":"https://openalex.org/I1290206253","display_name":"Microsoft (United States)","ror":"https://ror.org/00d0nc645","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Yinan Li","raw_affiliation_strings":["Microsoft Research, Redmond, WA, USA"],"affiliations":[{"raw_affiliation_string":"Microsoft Research, Redmond, WA, USA","institution_ids":["https://openalex.org/I1290206253"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5079057741","display_name":"Eric Eilebrecht","orcid":null},"institutions":[{"id":"https://openalex.org/I1290206253","display_name":"Microsoft (United States)","ror":"https://ror.org/00d0nc645","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Eric Eilebrecht","raw_affiliation_strings":["Microsoft Research, Redmond, WA, USA"],"affiliations":[{"raw_affiliation_string":"Microsoft Research, Redmond, WA, USA","institution_ids":["https://openalex.org/I1290206253"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5067654293","display_name":"Badrish Chandramouli","orcid":"https://orcid.org/0000-0002-8468-4037"},"institutions":[{"id":"https://openalex.org/I1290206253","display_name":"Microsoft (United States)","ror":"https://ror.org/00d0nc645","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Badrish Chandramouli","raw_affiliation_strings":["Microsoft Research, Redmond, WA, USA"],"affiliations":[{"raw_affiliation_string":"Microsoft Research, Redmond, WA, USA","institution_ids":["https://openalex.org/I1290206253"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5111504785","display_name":"Donald Kossmann","orcid":null},"institutions":[{"id":"https://openalex.org/I1290206253","display_name":"Microsoft (United States)","ror":"https://ror.org/00d0nc645","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Donald Kossmann","raw_affiliation_strings":["Microsoft Research, Redmond, WA, USA"],"affiliations":[{"raw_affiliation_string":"Microsoft Research, Redmond, WA, USA","institution_ids":["https://openalex.org/I1290206253"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5030542531"],"corresponding_institution_ids":["https://openalex.org/I151746483"],"apc_list":null,"apc_paid":null,"fwci":4.4089,"has_fulltext":false,"cited_by_count":23,"citation_normalized_percentile":{"value":0.94960528,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":90,"max":98},"biblio":{"volume":null,"issue":null,"first_page":"883","last_page":"899"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10101","display_name":"Cloud Computing and Resource Management","score":0.9994999766349792,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10101","display_name":"Cloud Computing and Resource Management","score":0.9994999766349792,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11181","display_name":"Advanced Data Storage Technologies","score":0.9975000023841858,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9965999722480774,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8910808563232422},{"id":"https://openalex.org/keywords/parsing","display_name":"Parsing","score":0.8675938844680786},{"id":"https://openalex.org/keywords/big-data","display_name":"Big data","score":0.7157195806503296},{"id":"https://openalex.org/keywords/raw-data","display_name":"Raw data","score":0.6525024175643921},{"id":"https://openalex.org/keywords/bottom-up-parsing","display_name":"Bottom-up parsing","score":0.6374529600143433},{"id":"https://openalex.org/keywords/spark","display_name":"SPARK (programming language)","score":0.6004009246826172},{"id":"https://openalex.org/keywords/top-down-parsing","display_name":"Top-down parsing","score":0.575616180896759},{"id":"https://openalex.org/keywords/syntax","display_name":"Syntax","score":0.5481522083282471},{"id":"https://openalex.org/keywords/s-attributed-grammar","display_name":"S-attributed grammar","score":0.5155013203620911},{"id":"https://openalex.org/keywords/parser-combinator","display_name":"Parser combinator","score":0.4914482831954956},{"id":"https://openalex.org/keywords/context","display_name":"Context (archaeology)","score":0.4660421907901764},{"id":"https://openalex.org/keywords/programming-language","display_name":"Programming language","score":0.42015528678894043},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.3814123868942261},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.3611988425254822},{"id":"https://openalex.org/keywords/database","display_name":"Database","score":0.3466609716415405},{"id":"https://openalex.org/keywords/data-mining","display_name":"Data mining","score":0.3027332127094269}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8910808563232422},{"id":"https://openalex.org/C186644900","wikidata":"https://www.wikidata.org/wiki/Q194152","display_name":"Parsing","level":2,"score":0.8675938844680786},{"id":"https://openalex.org/C75684735","wikidata":"https://www.wikidata.org/wiki/Q858810","display_name":"Big data","level":2,"score":0.7157195806503296},{"id":"https://openalex.org/C132964779","wikidata":"https://www.wikidata.org/wiki/Q2110223","display_name":"Raw data","level":2,"score":0.6525024175643921},{"id":"https://openalex.org/C60690694","wikidata":"https://www.wikidata.org/wiki/Q894902","display_name":"Bottom-up parsing","level":4,"score":0.6374529600143433},{"id":"https://openalex.org/C2781215313","wikidata":"https://www.wikidata.org/wiki/Q3493345","display_name":"SPARK (programming language)","level":2,"score":0.6004009246826172},{"id":"https://openalex.org/C42560504","wikidata":"https://www.wikidata.org/wiki/Q15419395","display_name":"Top-down parsing","level":3,"score":0.575616180896759},{"id":"https://openalex.org/C60048249","wikidata":"https://www.wikidata.org/wiki/Q37437","display_name":"Syntax","level":2,"score":0.5481522083282471},{"id":"https://openalex.org/C147547768","wikidata":"https://www.wikidata.org/wiki/Q3113342","display_name":"S-attributed grammar","level":3,"score":0.5155013203620911},{"id":"https://openalex.org/C118364021","wikidata":"https://www.wikidata.org/wiki/Q7139956","display_name":"Parser combinator","level":3,"score":0.4914482831954956},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.4660421907901764},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.42015528678894043},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3814123868942261},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3611988425254822},{"id":"https://openalex.org/C77088390","wikidata":"https://www.wikidata.org/wiki/Q8513","display_name":"Database","level":1,"score":0.3466609716415405},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.3027332127094269},{"id":"https://openalex.org/C86803240","wikidata":"https://www.wikidata.org/wiki/Q420","display_name":"Biology","level":0,"score":0.0},{"id":"https://openalex.org/C151730666","wikidata":"https://www.wikidata.org/wiki/Q7205","display_name":"Paleontology","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3299869.3319898","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3299869.3319898","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2019 International Conference on Management of Data","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":21,"referenced_works":["https://openalex.org/W1973189970","https://openalex.org/W1981242859","https://openalex.org/W2002952260","https://openalex.org/W2010150096","https://openalex.org/W2027748129","https://openalex.org/W2038412523","https://openalex.org/W2050810837","https://openalex.org/W2063348921","https://openalex.org/W2074935284","https://openalex.org/W2103207352","https://openalex.org/W2113601246","https://openalex.org/W2121082877","https://openalex.org/W2131975293","https://openalex.org/W2135421348","https://openalex.org/W2138869690","https://openalex.org/W2155970976","https://openalex.org/W2161692763","https://openalex.org/W2179242664","https://openalex.org/W2249895122","https://openalex.org/W2619959750","https://openalex.org/W2889015391"],"related_works":["https://openalex.org/W3143982968","https://openalex.org/W2792937288","https://openalex.org/W3088470625","https://openalex.org/W3035970863","https://openalex.org/W2389755172","https://openalex.org/W4320024782","https://openalex.org/W2164260211","https://openalex.org/W2281562628","https://openalex.org/W2619584063","https://openalex.org/W2804916787"],"abstract_inverted_index":{"There":[0],"has":[1],"been":[2],"a":[3,33,98,134],"recent":[4],"flurry":[5],"of":[6,43,47,61,68,82,89,120],"interest":[7],"in":[8,15,30,36,86,133,151,162],"providing":[9],"query":[10],"capability":[11],"on":[12],"raw":[13,21,48,62,90,110],"data":[14,18,22,39,63,111],"today's":[16],"big":[17,38],"systems.":[19],"These":[20],"must":[23],"be":[24,74],"parsed":[25],"before":[26],"processing":[27],"or":[28],"use":[29],"analytics.":[31],"Thus,":[32],"fundamental":[34],"challenge":[35],"distributed":[37,135,159],"systems":[40],"is":[41,130,140],"that":[42,144,173],"efficient":[44],"parallel":[45],"parsing":[46,59,124,129,160],"data.":[49,91,153],"The":[50],"difficulties":[51],"come":[52],"from":[53],"the":[54,66,78,102,106,115,121,157],"inherent":[55],"ambiguity":[56],"while":[57],"independently":[58],"chunks":[60,88],"without":[64],"knowing":[65],"context":[67],"these":[69,87],"chunks.":[70],"Specifically,":[71],"it":[72,145],"can":[73,146],"difficult":[75],"to":[76,114],"find":[77],"beginnings":[79],"and":[80,84,117,127,171],"ends":[81],"fields":[83],"records":[85],"To":[92],"parallelize":[93],"parsing,":[94],"this":[95],"paper":[96],"proposes":[97],"speculation-based":[99],"approach":[100,139,161],"for":[101],"CSV":[103,152],"format,":[104,122],"arguably":[105],"most":[107],"commonly":[108],"used":[109],"format.":[112],"Due":[113],"syntactic":[116],"statistical":[118],"properties":[119],"speculative":[123,138],"rarely":[125],"fails":[126],"therefore":[128],"efficiently":[131],"parallelized":[132],"setting.":[136],"Our":[137],"also":[141],"robust,":[142],"meaning":[143],"reliably":[147],"detect":[148],"syntax":[149],"errors":[150],"We":[154],"experimentally":[155],"evaluate":[156],"speculative,":[158],"Apache":[163],"Spark":[164],"using":[165],"more":[166],"than":[167],"11,000":[168],"real-world":[169],"datasets,":[170],"show":[172],"our":[174],"parser":[175],"produces":[176],"significant":[177],"performance":[178],"benefits":[179],"over":[180],"existing":[181],"methods.":[182]},"counts_by_year":[{"year":2025,"cited_by_count":4},{"year":2024,"cited_by_count":1},{"year":2023,"cited_by_count":5},{"year":2022,"cited_by_count":2},{"year":2021,"cited_by_count":4},{"year":2020,"cited_by_count":6},{"year":2019,"cited_by_count":1}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
