{"id":"https://openalex.org/W2782619198","doi":"https://doi.org/10.1109/bigdata.2017.8257934","title":"Sanzu: A data science benchmark","display_name":"Sanzu: A data science benchmark","publication_year":2017,"publication_date":"2017-12-01","ids":{"openalex":"https://openalex.org/W2782619198","doi":"https://doi.org/10.1109/bigdata.2017.8257934","mag":"2782619198"},"language":"en","primary_location":{"id":"doi:10.1109/bigdata.2017.8257934","is_oa":false,"landing_page_url":"https://doi.org/10.1109/bigdata.2017.8257934","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2017 IEEE International Conference on Big Data (Big Data)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5085815170","display_name":"Alex Watson","orcid":"https://orcid.org/0000-0002-1200-1962"},"institutions":[{"id":"https://openalex.org/I106938459","display_name":"University of New Brunswick","ror":"https://ror.org/05nkf0n29","country_code":"CA","type":"education","lineage":["https://openalex.org/I106938459"]}],"countries":["CA"],"is_corresponding":true,"raw_author_name":"Alex Watson","raw_affiliation_strings":["Faculty of Computer Science, University of New Brunswick, Fredericton, Canada"],"affiliations":[{"raw_affiliation_string":"Faculty of Computer Science, University of New Brunswick, Fredericton, Canada","institution_ids":["https://openalex.org/I106938459"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5036143102","display_name":"Deepigha Shree Vittal Babu","orcid":null},"institutions":[{"id":"https://openalex.org/I106938459","display_name":"University of New Brunswick","ror":"https://ror.org/05nkf0n29","country_code":"CA","type":"education","lineage":["https://openalex.org/I106938459"]}],"countries":["CA"],"is_corresponding":false,"raw_author_name":"Deepigha Shree Vittal Babu","raw_affiliation_strings":["University of New Brunswick, Fredericton, NB, CA"],"affiliations":[{"raw_affiliation_string":"University of New Brunswick, Fredericton, NB, CA","institution_ids":["https://openalex.org/I106938459"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5009118014","display_name":"Suprio Ray","orcid":"https://orcid.org/0000-0003-0681-9685"},"institutions":[{"id":"https://openalex.org/I106938459","display_name":"University of New Brunswick","ror":"https://ror.org/05nkf0n29","country_code":"CA","type":"education","lineage":["https://openalex.org/I106938459"]}],"countries":["CA"],"is_corresponding":false,"raw_author_name":"Suprio Ray","raw_affiliation_strings":["Faculty of Computer Science, University of New Brunswick, Fredericton, Canada"],"affiliations":[{"raw_affiliation_string":"Faculty of Computer Science, University of New Brunswick, Fredericton, Canada","institution_ids":["https://openalex.org/I106938459"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5085815170"],"corresponding_institution_ids":["https://openalex.org/I106938459"],"apc_list":null,"apc_paid":null,"fwci":0.182,"has_fulltext":false,"cited_by_count":9,"citation_normalized_percentile":{"value":0.59302191,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":89,"max":97},"biblio":{"volume":null,"issue":null,"first_page":"263","last_page":"272"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10799","display_name":"Data Visualization and Analytics","score":0.9966999888420105,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10799","display_name":"Data Visualization and Analytics","score":0.9966999888420105,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12205","display_name":"Time Series Analysis and Forecasting","score":0.9943000078201294,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10101","display_name":"Cloud Computing and Resource Management","score":0.9914000034332275,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8183208703994751},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.7813734412193298},{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.6370031833648682},{"id":"https://openalex.org/keywords/python","display_name":"Python (programming language)","score":0.5686349868774414},{"id":"https://openalex.org/keywords/analytics","display_name":"Analytics","score":0.5380340218544006},{"id":"https://openalex.org/keywords/data-analysis","display_name":"Data analysis","score":0.5230551362037659},{"id":"https://openalex.org/keywords/data-mining","display_name":"Data mining","score":0.47158944606781006},{"id":"https://openalex.org/keywords/macro","display_name":"Macro","score":0.4609978199005127},{"id":"https://openalex.org/keywords/big-data","display_name":"Big data","score":0.45354026556015015},{"id":"https://openalex.org/keywords/data-modeling","display_name":"Data modeling","score":0.4524173140525818},{"id":"https://openalex.org/keywords/data-science","display_name":"Data science","score":0.4034520983695984},{"id":"https://openalex.org/keywords/machine-learning","display_name":"Machine learning","score":0.3597787618637085},{"id":"https://openalex.org/keywords/database","display_name":"Database","score":0.3449593782424927},{"id":"https://openalex.org/keywords/operating-system","display_name":"Operating system","score":0.11475867033004761}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8183208703994751},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.7813734412193298},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.6370031833648682},{"id":"https://openalex.org/C519991488","wikidata":"https://www.wikidata.org/wiki/Q28865","display_name":"Python (programming language)","level":2,"score":0.5686349868774414},{"id":"https://openalex.org/C79158427","wikidata":"https://www.wikidata.org/wiki/Q485396","display_name":"Analytics","level":2,"score":0.5380340218544006},{"id":"https://openalex.org/C175801342","wikidata":"https://www.wikidata.org/wiki/Q1988917","display_name":"Data analysis","level":2,"score":0.5230551362037659},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.47158944606781006},{"id":"https://openalex.org/C166955791","wikidata":"https://www.wikidata.org/wiki/Q629579","display_name":"Macro","level":2,"score":0.4609978199005127},{"id":"https://openalex.org/C75684735","wikidata":"https://www.wikidata.org/wiki/Q858810","display_name":"Big data","level":2,"score":0.45354026556015015},{"id":"https://openalex.org/C67186912","wikidata":"https://www.wikidata.org/wiki/Q367664","display_name":"Data modeling","level":2,"score":0.4524173140525818},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.4034520983695984},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3597787618637085},{"id":"https://openalex.org/C77088390","wikidata":"https://www.wikidata.org/wiki/Q8513","display_name":"Database","level":1,"score":0.3449593782424927},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.11475867033004761},{"id":"https://openalex.org/C13280743","wikidata":"https://www.wikidata.org/wiki/Q131089","display_name":"Geodesy","level":1,"score":0.0},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.0},{"id":"https://openalex.org/C205649164","wikidata":"https://www.wikidata.org/wiki/Q1071","display_name":"Geography","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/bigdata.2017.8257934","is_oa":false,"landing_page_url":"https://doi.org/10.1109/bigdata.2017.8257934","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2017 IEEE International Conference on Big Data (Big Data)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":18,"referenced_works":["https://openalex.org/W838923341","https://openalex.org/W1548849615","https://openalex.org/W1985229168","https://openalex.org/W2044849727","https://openalex.org/W2052312648","https://openalex.org/W2054905474","https://openalex.org/W2072916763","https://openalex.org/W2080731889","https://openalex.org/W2115687261","https://openalex.org/W2241506117","https://openalex.org/W2553567329","https://openalex.org/W2567162135","https://openalex.org/W2585291876","https://openalex.org/W2616102449","https://openalex.org/W2620485168","https://openalex.org/W2951981277","https://openalex.org/W6623450909","https://openalex.org/W6646430562"],"related_works":["https://openalex.org/W4226266853","https://openalex.org/W4210252074","https://openalex.org/W3092201768","https://openalex.org/W2796632413","https://openalex.org/W2740083192","https://openalex.org/W2794907032","https://openalex.org/W4255802207","https://openalex.org/W4299701476","https://openalex.org/W2904574413","https://openalex.org/W2462007151"],"abstract_inverted_index":{"The":[0,69,147,195,245],"volume":[1],"of":[2,71,83,100,157,184],"data":[3,19,45,47,60,66,76,114,118,126,134,164,213,249,253],"that":[4,63],"is":[5,9,13,105],"generated":[6,250],"each":[7],"day":[8],"rising":[10],"rapidly.":[11],"There":[12],"a":[14,28,38,101,125,142,182,191],"need":[15],"to":[16,94,130],"analyze":[17],"this":[18,55,121],"efficiently":[20],"and":[21,33,51,74,87,98,136,144,162,170,201,216,224,239],"produce":[22],"results":[23],"quickly.":[24],"Data":[25],"science":[26,67,127,214],"offers":[27],"formal":[29],"methodology":[30],"for":[31,109,116,160],"processing":[32,135],"analyzing":[34],"data.":[35],"It":[36,155],"involves":[37],"work-flow":[39],"with":[40,133,233],"multiple":[41],"stages,":[42],"such":[43],"as,":[44],"collection,":[46],"wrangling,":[48,165],"statistical":[49,166],"analysis":[50,185],"machine":[52,168],"learning.":[53],"In":[54,120],"paper,":[56,122],"we":[57,123,229],"look":[58],"at":[59],"analytics":[61,77,137,179],"systems":[62,78,115,132],"support":[64],"the":[65,96],"work-flow.":[68],"variety":[70],"current":[72],"commercial":[73],"open-source":[75],"differ":[79],"significantly":[80],"in":[81,153],"terms":[82],"available":[84],"features,":[85],"functionality,":[86],"scalability.":[88],"A":[89],"benchmark":[90,108,140,149,197,228,247],"can":[91],"be":[92],"used":[93],"evaluate":[95,131,206],"functionality":[97],"performance":[99],"system.":[102],"However,":[103],"there":[104],"no":[106],"standard":[107],"evaluating":[110],"or":[111,186],"comparing":[112],"these":[113,207],"doing":[117],"science.":[119],"introduce":[124],"benchmark,":[128],"Sanzu,":[129],"tasks.":[138],"Our":[139],"includes":[141],"micro":[143,148,227],"macro":[145,175,196,246],"benchmark.":[146],"tests":[150],"basic":[151],"operations":[152],"isolation.":[154],"consists":[156],"task":[158],"suites":[159],"reading":[161],"writing,":[163],"analysis,":[167],"learning":[169],"time":[171],"series":[172,183],"analysis.":[173],"Each":[174],"workload":[176],"evaluates":[177],"an":[178],"application":[180],"where":[181],"functions":[187],"are":[188],"based":[189],"on":[190,199,209],"real":[192],"world":[193],"application.":[194],"focuses":[198],"sports":[200],"smart":[202],"grid":[203],"analytics.":[204],"We":[205],"tasks":[208],"five":[210],"different":[211],"popular":[212],"frameworks":[215],"systems:":[217],"R,":[218],"Anaconda":[219],"Python,":[220],"Dask,":[221],"PostgreSQL":[222],"(MADlib)":[223],"PySpark.":[225],"For":[226],"generate":[230],"synthetic":[231],"datasets":[232],"3":[234],"scale":[235],"factors:":[236],"1,":[237],"10":[238],"100":[240],"(scale":[241],"factor":[242],"1=1":[243],"million).":[244],"uses":[248],"from":[251],"real-world":[252],"sources.":[254]},"counts_by_year":[{"year":2025,"cited_by_count":2},{"year":2024,"cited_by_count":1},{"year":2022,"cited_by_count":1},{"year":2021,"cited_by_count":3},{"year":2020,"cited_by_count":1},{"year":2017,"cited_by_count":1}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
