{"id":"https://openalex.org/W2889015391","doi":"https://doi.org/10.14778/3236187.3236207","title":"Filter before you parse","display_name":"Filter before you parse","publication_year":2018,"publication_date":"2018-07-01","ids":{"openalex":"https://openalex.org/W2889015391","doi":"https://doi.org/10.14778/3236187.3236207","mag":"2889015391"},"language":"en","primary_location":{"id":"doi:10.14778/3236187.3236207","is_oa":false,"landing_page_url":"https://doi.org/10.14778/3236187.3236207","pdf_url":null,"source":{"id":"https://openalex.org/S4210226185","display_name":"Proceedings of the VLDB Endowment","issn_l":"2150-8097","issn":["2150-8097"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the VLDB Endowment","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5014756843","display_name":"Shoumik Palkar","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shoumik Palkar","raw_affiliation_strings":["Stanford InfoLab"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Stanford InfoLab","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5059130960","display_name":"Firas Abuzaid","orcid":"https://orcid.org/0000-0002-1424-4554"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Firas Abuzaid","raw_affiliation_strings":["Stanford InfoLab"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Stanford InfoLab","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5002538469","display_name":"Peter Bailis","orcid":"https://orcid.org/0000-0003-1166-7823"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Peter Bailis","raw_affiliation_strings":["Stanford InfoLab"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Stanford InfoLab","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5005554337","display_name":"Matei Zaharia","orcid":"https://orcid.org/0000-0002-7547-7204"},"institutions":[{"id":"https://openalex.org/I4401726825","display_name":"Databricks (United States)","ror":"https://ror.org/01ynzx943","country_code":null,"type":"company","lineage":["https://openalex.org/I4401726825"]}],"countries":[],"is_corresponding":false,"raw_author_name":"Matei Zaharia","raw_affiliation_strings":["Databricks Inc"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Databricks Inc","institution_ids":["https://openalex.org/I4401726825"]}]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":4,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":5.5339,"has_fulltext":false,"cited_by_count":55,"citation_normalized_percentile":{"value":0.96177812,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":95,"max":99},"biblio":{"volume":"11","issue":"11","first_page":"1576","last_page":"1589"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10317","display_name":"Advanced Database Systems and Queries","score":0.998199999332428,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10317","display_name":"Advanced Database Systems and Queries","score":0.998199999332428,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9979000091552734,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11181","display_name":"Advanced Data Storage Technologies","score":0.9979000091552734,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.874479353427887},{"id":"https://openalex.org/keywords/json","display_name":"JSON","score":0.845730185508728},{"id":"https://openalex.org/keywords/parsing","display_name":"Parsing","score":0.7515354156494141},{"id":"https://openalex.org/keywords/cascade","display_name":"Cascade","score":0.6276338696479797},{"id":"https://openalex.org/keywords/false-positive-paradox","display_name":"False positive paradox","score":0.5600012540817261},{"id":"https://openalex.org/keywords/overhead","display_name":"Overhead (engineering)","score":0.5540555715560913},{"id":"https://openalex.org/keywords/throughput","display_name":"Throughput","score":0.48318710923194885},{"id":"https://openalex.org/keywords/raw-data","display_name":"Raw data","score":0.4486340880393982},{"id":"https://openalex.org/keywords/filter","display_name":"Filter (signal processing)","score":0.4340072274208069},{"id":"https://openalex.org/keywords/data-mining","display_name":"Data mining","score":0.3643258213996887},{"id":"https://openalex.org/keywords/programming-language","display_name":"Programming language","score":0.3596566319465637},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.35338830947875977},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.26909786462783813},{"id":"https://openalex.org/keywords/operating-system","display_name":"Operating system","score":0.20769768953323364}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.874479353427887},{"id":"https://openalex.org/C2780416260","wikidata":"https://www.wikidata.org/wiki/Q2063","display_name":"JSON","level":2,"score":0.845730185508728},{"id":"https://openalex.org/C186644900","wikidata":"https://www.wikidata.org/wiki/Q194152","display_name":"Parsing","level":2,"score":0.7515354156494141},{"id":"https://openalex.org/C34146451","wikidata":"https://www.wikidata.org/wiki/Q5048094","display_name":"Cascade","level":2,"score":0.6276338696479797},{"id":"https://openalex.org/C64869954","wikidata":"https://www.wikidata.org/wiki/Q1859747","display_name":"False positive paradox","level":2,"score":0.5600012540817261},{"id":"https://openalex.org/C2779960059","wikidata":"https://www.wikidata.org/wiki/Q7113681","display_name":"Overhead (engineering)","level":2,"score":0.5540555715560913},{"id":"https://openalex.org/C157764524","wikidata":"https://www.wikidata.org/wiki/Q1383412","display_name":"Throughput","level":3,"score":0.48318710923194885},{"id":"https://openalex.org/C132964779","wikidata":"https://www.wikidata.org/wiki/Q2110223","display_name":"Raw data","level":2,"score":0.4486340880393982},{"id":"https://openalex.org/C106131492","wikidata":"https://www.wikidata.org/wiki/Q3072260","display_name":"Filter (signal processing)","level":2,"score":0.4340072274208069},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.3643258213996887},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.3596566319465637},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.35338830947875977},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.26909786462783813},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.20769768953323364},{"id":"https://openalex.org/C555944384","wikidata":"https://www.wikidata.org/wiki/Q249","display_name":"Wireless","level":2,"score":0.0},{"id":"https://openalex.org/C43617362","wikidata":"https://www.wikidata.org/wiki/Q170050","display_name":"Chromatography","level":1,"score":0.0},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.0},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.14778/3236187.3236207","is_oa":false,"landing_page_url":"https://doi.org/10.14778/3236187.3236207","pdf_url":null,"source":{"id":"https://openalex.org/S4210226185","display_name":"Proceedings of the VLDB Endowment","issn_l":"2150-8097","issn":["2150-8097"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the VLDB Endowment","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":33,"referenced_works":["https://openalex.org/W24659197","https://openalex.org/W1946402190","https://openalex.org/W1964356327","https://openalex.org/W1979315462","https://openalex.org/W2001637908","https://openalex.org/W2005294557","https://openalex.org/W2010090589","https://openalex.org/W2010150096","https://openalex.org/W2011716962","https://openalex.org/W2019386158","https://openalex.org/W2027748129","https://openalex.org/W2038412523","https://openalex.org/W2050810837","https://openalex.org/W2101207848","https://openalex.org/W2103207352","https://openalex.org/W2108220022","https://openalex.org/W2131975293","https://openalex.org/W2134684274","https://openalex.org/W2135611729","https://openalex.org/W2151892596","https://openalex.org/W2155970976","https://openalex.org/W2156317779","https://openalex.org/W2164598857","https://openalex.org/W2170853664","https://openalex.org/W2303430686","https://openalex.org/W2521416656","https://openalex.org/W2595296784","https://openalex.org/W2619959750","https://openalex.org/W2728889630","https://openalex.org/W2751762880","https://openalex.org/W4300782170","https://openalex.org/W6640789038","https://openalex.org/W6714018970"],"related_works":["https://openalex.org/W2070147537","https://openalex.org/W2026046761","https://openalex.org/W4235735989","https://openalex.org/W2467097043","https://openalex.org/W1998962249","https://openalex.org/W2474495446","https://openalex.org/W1512248002","https://openalex.org/W2171794906","https://openalex.org/W2139865158","https://openalex.org/W4229505676"],"abstract_inverted_index":{"Exploratory":[0],"big":[1],"data":[2,11,182],"applications":[3,21,201],"often":[4],"run":[5],"on":[6,46],"raw":[7,49,58,78,93],"unstructured":[8],"or":[9,17],"semi-structured":[10],"formats,":[12],"such":[13,216],"as":[14,217],"JSON":[15],"files":[16],"text":[18],"logs.":[19],"These":[20],"can":[22],"spend":[23],"80--90%":[24],"of":[25,63,70,89,144,154],"their":[26],"execution":[27],"time":[28],"parsing":[29,126,178],"the":[30,47,61,67,120,129,142,147,155],"data.":[31],"In":[32],"this":[33,42],"paper,":[34],"we":[35,56,135],"propose":[36,136],"a":[37,80,87,170,177,181,185,193],"new":[38],"approach":[39],"for":[40],"reducing":[41],"overhead:":[43],"apply":[44],"filters":[45,94],"data's":[48],"bytestream":[50],"before":[51],"parsing.":[52],"This":[53],"technique,":[54],"which":[55,174],"call":[57],"filtering,":[59,79],"leverages":[60],"features":[62],"modern":[64],"hardware":[65],"and":[66,124,192,205,224],"high":[68],"selectivity":[69],"queries":[71],"found":[72],"in":[73,169,184],"many":[74,199],"exploratory":[75],"applications.":[76],"With":[77],"user-specified":[81],"query":[82],"predicate":[83],"is":[84,133],"compiled":[85],"into":[86,114],"set":[88],"filtering":[90],"primitives":[91],"called":[92,172],"(RFs).":[95],"RFs":[96,113,145],"are":[97,202],"fast,":[98],"SIMD-based":[99],"operators":[100],"that":[101,139,198],"occasionally":[102],"yield":[103],"false":[104,108,121],"positives,":[105],"but":[106],"never":[107],"negatives.":[109],"We":[110,165,196],"combine":[111],"multiple":[112],"an":[115,137],"RF":[116,131],"cascade":[117,132,158,179],"to":[118,221,231],"decrease":[119],"positive":[122],"rate":[123],"maximize":[125],"throughput.":[127],"Because":[128],"best":[130,148],"data-dependent,":[134],"optimizer":[138],"dynamically":[140],"selects":[141],"combination":[143],"with":[146],"expected":[149],"throughput,":[150],"achieving":[151],"within":[152],"10%":[153],"global":[156],"optimum":[157],"while":[159],"adding":[160],"less":[161],"than":[162],"1.2%":[163],"overhead.":[164],"implement":[166],"these":[167],"techniques":[168],"system":[171],"Sparser,":[173],"automatically":[175],"manages":[176],"given":[180],"stream":[183],"supported":[186],"format":[187],"(e.g.,":[188],"JSON,":[189],"Avro,":[190],"Parquet)":[191],"user":[194],"query.":[195],"show":[197],"real-world":[200],"highly":[203],"selective":[204],"benefit":[206],"from":[207],"Sparser.":[208],"Across":[209],"diverse":[210],"workloads,":[211],"Sparser":[212],"accelerates":[213],"state-of-the-art":[214],"parsers":[215],"Mison":[218],"by":[219,229],"up":[220,230],"22":[222],"\u00d7":[223],"improves":[225],"end-to-end":[226],"application":[227],"performance":[228],"9":[232],"\u00d7.":[233]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":5},{"year":2024,"cited_by_count":4},{"year":2023,"cited_by_count":8},{"year":2022,"cited_by_count":9},{"year":2021,"cited_by_count":9},{"year":2020,"cited_by_count":10},{"year":2019,"cited_by_count":9}],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-10T00:00:00"}
