{"id":"https://openalex.org/W2775429442","doi":"https://doi.org/10.1145/3148055.3148060","title":"Managing Variant Calling Files the Big Data Way","display_name":"Managing Variant Calling Files the Big Data Way","publication_year":2017,"publication_date":"2017-12-01","ids":{"openalex":"https://openalex.org/W2775429442","doi":"https://doi.org/10.1145/3148055.3148060","mag":"2775429442"},"language":"en","primary_location":{"id":"doi:10.1145/3148055.3148060","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3148055.3148060","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the Fourth IEEE/ACM International Conference on Big Data Computing, Applications and Technologies","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://research.wur.nl/en/publications/managing-variant-calling-files-the-big-data-way-using-hdfs-and-ap","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5000840912","display_name":"Katerina Boufea","orcid":"https://orcid.org/0000-0001-8394-1137"},"institutions":[{"id":"https://openalex.org/I98677209","display_name":"University of Edinburgh","ror":"https://ror.org/01nrxwf90","country_code":"GB","type":"education","lineage":["https://openalex.org/I98677209"]}],"countries":["GB"],"is_corresponding":true,"raw_author_name":"Aikaterini Boufea","raw_affiliation_strings":["University of Edinburgh, Edinburgh, United Kingdom"],"affiliations":[{"raw_affiliation_string":"University of Edinburgh, Edinburgh, United Kingdom","institution_ids":["https://openalex.org/I98677209"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5091360374","display_name":"Richard Finkers","orcid":"https://orcid.org/0000-0002-4368-8058"},"institutions":[{"id":"https://openalex.org/I913481162","display_name":"Wageningen University & Research","ror":"https://ror.org/04qw24q55","country_code":"NL","type":"education","lineage":["https://openalex.org/I913481162"]}],"countries":["NL"],"is_corresponding":false,"raw_author_name":"Richard Finkers","raw_affiliation_strings":["Wageningen University &amp; Research, Wageningen, Netherlands"],"affiliations":[{"raw_affiliation_string":"Wageningen University &amp; Research, Wageningen, Netherlands","institution_ids":["https://openalex.org/I913481162"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5026426014","display_name":"Martijn van Kaauwen","orcid":"https://orcid.org/0000-0002-6587-8178"},"institutions":[{"id":"https://openalex.org/I913481162","display_name":"Wageningen University & Research","ror":"https://ror.org/04qw24q55","country_code":"NL","type":"education","lineage":["https://openalex.org/I913481162"]}],"countries":["NL"],"is_corresponding":false,"raw_author_name":"Martijn van Kaauwen","raw_affiliation_strings":["Wageningen University &amp; Research, Wageningen, Netherlands"],"affiliations":[{"raw_affiliation_string":"Wageningen University &amp; Research, Wageningen, Netherlands","institution_ids":["https://openalex.org/I913481162"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5010873632","display_name":"Mark Kramer","orcid":"https://orcid.org/0000-0003-2237-7549"},"institutions":[{"id":"https://openalex.org/I913481162","display_name":"Wageningen University & Research","ror":"https://ror.org/04qw24q55","country_code":"NL","type":"education","lineage":["https://openalex.org/I913481162"]}],"countries":["NL"],"is_corresponding":false,"raw_author_name":"Mark Kramer","raw_affiliation_strings":["Wageningen University &amp; Research, Wageningen, Netherlands"],"affiliations":[{"raw_affiliation_string":"Wageningen University &amp; Research, Wageningen, Netherlands","institution_ids":["https://openalex.org/I913481162"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5070260526","display_name":"Ioannis N. Athanasiadis","orcid":"https://orcid.org/0000-0003-2764-0078"},"institutions":[{"id":"https://openalex.org/I913481162","display_name":"Wageningen University & Research","ror":"https://ror.org/04qw24q55","country_code":"NL","type":"education","lineage":["https://openalex.org/I913481162"]}],"countries":["NL"],"is_corresponding":false,"raw_author_name":"Ioannis N. Athanasiadis","raw_affiliation_strings":["Wageningen University &amp; Research, Wageningen, Netherlands"],"affiliations":[{"raw_affiliation_string":"Wageningen University &amp; Research, Wageningen, Netherlands","institution_ids":["https://openalex.org/I913481162"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5000840912"],"corresponding_institution_ids":["https://openalex.org/I98677209"],"apc_list":null,"apc_paid":null,"fwci":0.2127,"has_fulltext":false,"cited_by_count":5,"citation_normalized_percentile":{"value":0.55339675,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":89,"max":95},"biblio":{"volume":null,"issue":null,"first_page":"219","last_page":"226"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10015","display_name":"Genomics and Phylogenetic Studies","score":0.9904999732971191,"subfield":{"id":"https://openalex.org/subfields/1312","display_name":"Molecular Biology"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},"topics":[{"id":"https://openalex.org/T10015","display_name":"Genomics and Phylogenetic Studies","score":0.9904999732971191,"subfield":{"id":"https://openalex.org/subfields/1312","display_name":"Molecular Biology"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},{"id":"https://openalex.org/T10885","display_name":"Gene expression and cancer classification","score":0.9732000231742859,"subfield":{"id":"https://openalex.org/subfields/1312","display_name":"Molecular Biology"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},{"id":"https://openalex.org/T11269","display_name":"Algorithms and Data Compression","score":0.9610000252723694,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7499090433120728},{"id":"https://openalex.org/keywords/spark","display_name":"SPARK (programming language)","score":0.7229287624359131},{"id":"https://openalex.org/keywords/big-data","display_name":"Big data","score":0.6716264486312866},{"id":"https://openalex.org/keywords/database","display_name":"Database","score":0.5780576467514038},{"id":"https://openalex.org/keywords/operating-system","display_name":"Operating system","score":0.5452391505241394},{"id":"https://openalex.org/keywords/field","display_name":"Field (mathematics)","score":0.5306472182273865},{"id":"https://openalex.org/keywords/software","display_name":"Software","score":0.5039317011833191},{"id":"https://openalex.org/keywords/process","display_name":"Process (computing)","score":0.4949273467063904},{"id":"https://openalex.org/keywords/cluster","display_name":"Cluster (spacecraft)","score":0.4840223491191864},{"id":"https://openalex.org/keywords/file-format","display_name":"File format","score":0.46017614006996155},{"id":"https://openalex.org/keywords/world-wide-web","display_name":"World Wide Web","score":0.34182900190353394},{"id":"https://openalex.org/keywords/programming-language","display_name":"Programming language","score":0.13674026727676392}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7499090433120728},{"id":"https://openalex.org/C2781215313","wikidata":"https://www.wikidata.org/wiki/Q3493345","display_name":"SPARK (programming language)","level":2,"score":0.7229287624359131},{"id":"https://openalex.org/C75684735","wikidata":"https://www.wikidata.org/wiki/Q858810","display_name":"Big data","level":2,"score":0.6716264486312866},{"id":"https://openalex.org/C77088390","wikidata":"https://www.wikidata.org/wiki/Q8513","display_name":"Database","level":1,"score":0.5780576467514038},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.5452391505241394},{"id":"https://openalex.org/C9652623","wikidata":"https://www.wikidata.org/wiki/Q190109","display_name":"Field (mathematics)","level":2,"score":0.5306472182273865},{"id":"https://openalex.org/C2777904410","wikidata":"https://www.wikidata.org/wiki/Q7397","display_name":"Software","level":2,"score":0.5039317011833191},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.4949273467063904},{"id":"https://openalex.org/C164866538","wikidata":"https://www.wikidata.org/wiki/Q367351","display_name":"Cluster (spacecraft)","level":2,"score":0.4840223491191864},{"id":"https://openalex.org/C97250363","wikidata":"https://www.wikidata.org/wiki/Q235557","display_name":"File format","level":2,"score":0.46017614006996155},{"id":"https://openalex.org/C136764020","wikidata":"https://www.wikidata.org/wiki/Q466","display_name":"World Wide Web","level":1,"score":0.34182900190353394},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.13674026727676392},{"id":"https://openalex.org/C202444582","wikidata":"https://www.wikidata.org/wiki/Q837863","display_name":"Pure mathematics","level":1,"score":0.0},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1145/3148055.3148060","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3148055.3148060","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the Fourth IEEE/ACM International Conference on Big Data Computing, Applications and Technologies","raw_type":"proceedings-article"},{"id":"pmh:wur:oai:library.wur.nl:wurpubs/532216","is_oa":true,"landing_page_url":"https://research.wur.nl/en/publications/managing-variant-calling-files-the-big-data-way-using-hdfs-and-ap","pdf_url":null,"source":{"id":"https://openalex.org/S4306401843","display_name":"Data Archiving and Networked Services (DANS)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I1322597698","host_organization_name":"Royal Netherlands Academy of Arts and Sciences","host_organization_lineage":["https://openalex.org/I1322597698"],"host_organization_lineage_names":[],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"BDCAT '17 Proceedings of the Fourth IEEE/ACM International Conference on Big Data Computing, Applications and Technologies. ACM","raw_type":"info:eu-repo/semantics/bookpart"}],"best_oa_location":{"id":"pmh:wur:oai:library.wur.nl:wurpubs/532216","is_oa":true,"landing_page_url":"https://research.wur.nl/en/publications/managing-variant-calling-files-the-big-data-way-using-hdfs-and-ap","pdf_url":null,"source":{"id":"https://openalex.org/S4306401843","display_name":"Data Archiving and Networked Services (DANS)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I1322597698","host_organization_name":"Royal Netherlands Academy of Arts and Sciences","host_organization_lineage":["https://openalex.org/I1322597698"],"host_organization_lineage_names":[],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"BDCAT '17 Proceedings of the Fourth IEEE/ACM International Conference on Big Data Computing, Applications and Technologies. ACM","raw_type":"info:eu-repo/semantics/bookpart"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":20,"referenced_works":["https://openalex.org/W1990073900","https://openalex.org/W2104549677","https://openalex.org/W2117798581","https://openalex.org/W2121762798","https://openalex.org/W2142680014","https://openalex.org/W2147818769","https://openalex.org/W2147869723","https://openalex.org/W2149059931","https://openalex.org/W2149992227","https://openalex.org/W2159670728","https://openalex.org/W2189371416","https://openalex.org/W2191556466","https://openalex.org/W2281926395","https://openalex.org/W2310259903","https://openalex.org/W2324218546","https://openalex.org/W2484228491","https://openalex.org/W2542459869","https://openalex.org/W2574494928","https://openalex.org/W2794452822","https://openalex.org/W6697974936"],"related_works":["https://openalex.org/W2960264696","https://openalex.org/W3090563135","https://openalex.org/W2766461310","https://openalex.org/W4247566972","https://openalex.org/W3202731209","https://openalex.org/W4240326769","https://openalex.org/W3211874991","https://openalex.org/W3012518171","https://openalex.org/W2906268655","https://openalex.org/W4241605045"],"abstract_inverted_index":{"Big":[0],"Data":[1],"has":[2],"been":[3],"seen":[4],"as":[5,122],"a":[6,36,42,82,111],"remedy":[7],"for":[8,45,162],"the":[9,13,22,65,71,77,90,96,106,125],"efficient":[10],"management":[11],"of":[12,24,113,119,135],"ever-increasing":[14],"genomic":[15],"data.":[16],"In":[17],"this":[18],"paper,":[19],"we":[20],"investigate":[21],"use":[23],"Apache":[25,50,97,102,147,158],"Spark":[26],"to":[27,49,57,76],"store":[28],"and":[29,54,115,131,138,146,166,169],"process":[30],"Variant":[31],"Calling":[32],"Files":[33],"(VCF)":[34],"on":[35,81],"Hadoop":[37,83],"cluster.":[38],"We":[39,62,128,155],"demonstrate":[40],"Tomatula,":[41],"software":[43],"tool":[44],"converting":[46],"VCF":[47,86,107,120,144],"files":[48,121,145],"Parquet":[51,98,103,148,159],"storage":[52,100,136,164],"format,":[53,93],"an":[55,150],"application":[56],"query":[58,72],"variant":[59],"calling":[60],"datasets.":[61,174],"evaluate":[63],"how":[64],"wall":[66,167],"time":[67,69],"(i.e.":[68],"until":[70],"answer":[73],"is":[74],"returned":[75],"user)":[78],"scales":[79,170],"out":[80,171],"cluster":[84],"storing":[85],"files,":[87],"either":[88],"in":[89,133],"original":[91],"flat-file":[92],"or":[94],"using":[95,149],"columnar":[99],"format.":[101],"can":[104],"compress":[105],"data":[108],"by":[109],"around":[110],"factor":[112],"10,":[114],"supports":[116],"easier":[117],"querying":[118,139],"it":[123],"exposes":[124],"field":[126],"structure.":[127],"discuss":[129],"advantages":[130],"disadvantages":[132],"terms":[134],"capacity":[137],"performance":[140],"with":[141,172],"both":[142],"flat":[143],"open":[151],"plant":[152],"breeding":[153],"dataset.":[154],"conclude":[156],"that":[157],"offers":[160],"benefits":[161],"reducing":[163],"size":[165],"time,":[168],"larger":[173]},"counts_by_year":[{"year":2025,"cited_by_count":1},{"year":2024,"cited_by_count":1},{"year":2022,"cited_by_count":1},{"year":2020,"cited_by_count":1},{"year":2019,"cited_by_count":1}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
