{"id":"https://openalex.org/W2949811559","doi":"https://doi.org/10.1109/bigdata.2016.7840727","title":"Scalable genomics: From raw data to aligned reads on Apache YARN","display_name":"Scalable genomics: From raw data to aligned reads on Apache YARN","publication_year":2016,"publication_date":"2016-12-01","ids":{"openalex":"https://openalex.org/W2949811559","doi":"https://doi.org/10.1109/bigdata.2016.7840727","mag":"2949811559"},"language":"en","primary_location":{"id":"doi:10.1109/bigdata.2016.7840727","is_oa":false,"landing_page_url":"https://doi.org/10.1109/bigdata.2016.7840727","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2016 IEEE International Conference on Big Data (Big Data)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5010522931","display_name":"Francesco Versaci","orcid":"https://orcid.org/0000-0003-2513-3590"},"institutions":[{"id":"https://openalex.org/I2800759272","display_name":"Center for Advanced Studies Research and Development in Sardinia","ror":"https://ror.org/03jdxdk20","country_code":"IT","type":"facility","lineage":["https://openalex.org/I2800759272"]}],"countries":["IT"],"is_corresponding":true,"raw_author_name":"Francesco Versaci","raw_affiliation_strings":["CRS4, Distributed Computing Group, Pula, Italy"],"affiliations":[{"raw_affiliation_string":"CRS4, Distributed Computing Group, Pula, Italy","institution_ids":["https://openalex.org/I2800759272"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5047833562","display_name":"Luca Pireddu","orcid":"https://orcid.org/0000-0002-4663-5613"},"institutions":[{"id":"https://openalex.org/I2800759272","display_name":"Center for Advanced Studies Research and Development in Sardinia","ror":"https://ror.org/03jdxdk20","country_code":"IT","type":"facility","lineage":["https://openalex.org/I2800759272"]}],"countries":["IT"],"is_corresponding":false,"raw_author_name":"Luca Pireddu","raw_affiliation_strings":["CRS4, Distributed Computing Group, Pula, Italy"],"affiliations":[{"raw_affiliation_string":"CRS4, Distributed Computing Group, Pula, Italy","institution_ids":["https://openalex.org/I2800759272"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5073829656","display_name":"Gianluigi Zanetti","orcid":"https://orcid.org/0000-0003-1683-7350"},"institutions":[{"id":"https://openalex.org/I2800759272","display_name":"Center for Advanced Studies Research and Development in Sardinia","ror":"https://ror.org/03jdxdk20","country_code":"IT","type":"facility","lineage":["https://openalex.org/I2800759272"]}],"countries":["IT"],"is_corresponding":false,"raw_author_name":"Gianluigi Zanetti","raw_affiliation_strings":["CRS4, Distributed Computing Group, Pula, Italy"],"affiliations":[{"raw_affiliation_string":"CRS4, Distributed Computing Group, Pula, Italy","institution_ids":["https://openalex.org/I2800759272"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5010522931"],"corresponding_institution_ids":["https://openalex.org/I2800759272"],"apc_list":null,"apc_paid":null,"fwci":1.1671,"has_fulltext":false,"cited_by_count":8,"citation_normalized_percentile":{"value":0.80425805,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":89,"max":98},"biblio":{"volume":"99","issue":null,"first_page":"1232","last_page":"1241"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10015","display_name":"Genomics and Phylogenetic Studies","score":0.9977999925613403,"subfield":{"id":"https://openalex.org/subfields/1312","display_name":"Molecular Biology"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},"topics":[{"id":"https://openalex.org/T10015","display_name":"Genomics and Phylogenetic Studies","score":0.9977999925613403,"subfield":{"id":"https://openalex.org/subfields/1312","display_name":"Molecular Biology"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},{"id":"https://openalex.org/T11986","display_name":"Scientific Computing and Data Management","score":0.9936000108718872,"subfield":{"id":"https://openalex.org/subfields/1802","display_name":"Information Systems and Management"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T11181","display_name":"Advanced Data Storage Technologies","score":0.9889000058174133,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.8851160407066345},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8187273144721985},{"id":"https://openalex.org/keywords/yarn","display_name":"Yarn","score":0.7832760214805603},{"id":"https://openalex.org/keywords/pipeline","display_name":"Pipeline (software)","score":0.699561357498169},{"id":"https://openalex.org/keywords/big-data","display_name":"Big data","score":0.654069185256958},{"id":"https://openalex.org/keywords/workflow","display_name":"Workflow","score":0.604828417301178},{"id":"https://openalex.org/keywords/spark","display_name":"SPARK (programming language)","score":0.5506524443626404},{"id":"https://openalex.org/keywords/distributed-computing","display_name":"Distributed computing","score":0.4929783344268799},{"id":"https://openalex.org/keywords/distributed-data-store","display_name":"Distributed data store","score":0.4417261481285095},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.4294368624687195},{"id":"https://openalex.org/keywords/database","display_name":"Database","score":0.3379729986190796},{"id":"https://openalex.org/keywords/operating-system","display_name":"Operating system","score":0.26459193229675293}],"concepts":[{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.8851160407066345},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8187273144721985},{"id":"https://openalex.org/C2778787235","wikidata":"https://www.wikidata.org/wiki/Q49007","display_name":"Yarn","level":2,"score":0.7832760214805603},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.699561357498169},{"id":"https://openalex.org/C75684735","wikidata":"https://www.wikidata.org/wiki/Q858810","display_name":"Big data","level":2,"score":0.654069185256958},{"id":"https://openalex.org/C177212765","wikidata":"https://www.wikidata.org/wiki/Q627335","display_name":"Workflow","level":2,"score":0.604828417301178},{"id":"https://openalex.org/C2781215313","wikidata":"https://www.wikidata.org/wiki/Q3493345","display_name":"SPARK (programming language)","level":2,"score":0.5506524443626404},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.4929783344268799},{"id":"https://openalex.org/C24885549","wikidata":"https://www.wikidata.org/wiki/Q339678","display_name":"Distributed data store","level":2,"score":0.4417261481285095},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.4294368624687195},{"id":"https://openalex.org/C77088390","wikidata":"https://www.wikidata.org/wiki/Q8513","display_name":"Database","level":1,"score":0.3379729986190796},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.26459193229675293},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.0},{"id":"https://openalex.org/C192562407","wikidata":"https://www.wikidata.org/wiki/Q228736","display_name":"Materials science","level":0,"score":0.0},{"id":"https://openalex.org/C159985019","wikidata":"https://www.wikidata.org/wiki/Q181790","display_name":"Composite material","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/bigdata.2016.7840727","is_oa":false,"landing_page_url":"https://doi.org/10.1109/bigdata.2016.7840727","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2016 IEEE International Conference on Big Data (Big Data)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/9","score":0.4000000059604645,"display_name":"Industry, innovation and infrastructure"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":47,"referenced_works":["https://openalex.org/W1513515886","https://openalex.org/W1548849615","https://openalex.org/W1904861952","https://openalex.org/W1967841763","https://openalex.org/W1971962952","https://openalex.org/W1977046764","https://openalex.org/W1981509058","https://openalex.org/W1987370132","https://openalex.org/W1990073900","https://openalex.org/W2012016911","https://openalex.org/W2031634592","https://openalex.org/W2036398999","https://openalex.org/W2037748100","https://openalex.org/W2046220835","https://openalex.org/W2061680337","https://openalex.org/W2061906497","https://openalex.org/W2063220786","https://openalex.org/W2081930221","https://openalex.org/W2093931624","https://openalex.org/W2095680943","https://openalex.org/W2097958998","https://openalex.org/W2098380237","https://openalex.org/W2105947650","https://openalex.org/W2107824361","https://openalex.org/W2108234281","https://openalex.org/W2109701112","https://openalex.org/W2119180969","https://openalex.org/W2121762798","https://openalex.org/W2121810937","https://openalex.org/W2124985265","https://openalex.org/W2131229759","https://openalex.org/W2141978878","https://openalex.org/W2153805608","https://openalex.org/W2154462802","https://openalex.org/W2158336776","https://openalex.org/W2169594074","https://openalex.org/W2189371416","https://openalex.org/W2189465200","https://openalex.org/W2190569576","https://openalex.org/W2290761674","https://openalex.org/W2398924165","https://openalex.org/W2554162996","https://openalex.org/W2566979091","https://openalex.org/W4394666350","https://openalex.org/W6687031098","https://openalex.org/W6687322159","https://openalex.org/W6730253985"],"related_works":["https://openalex.org/W2078851640","https://openalex.org/W2381070915","https://openalex.org/W2352481835","https://openalex.org/W2392493391","https://openalex.org/W4200164335","https://openalex.org/W2109471562","https://openalex.org/W2378508949","https://openalex.org/W3199747891","https://openalex.org/W3083262785","https://openalex.org/W2799508461"],"abstract_inverted_index":{"The":[0],"adoption":[1],"of":[2,11,19,60,173,185,213,228,268],"Big":[3],"Data":[4],"technologies":[5,25,67],"can":[6,167,239,245],"potentially":[7],"boost":[8],"the":[9,27,58,69,82,96,111,122,129,134,144,148,183,194,205,211,214,224,229,266,279],"scalability":[10,59,212],"data-driven":[12,35],"biology":[13],"and":[14,76,115,153,166,199,237],"health":[15],"workflows":[16],"by":[17,101,126,151,204],"orders":[18],"magnitude.":[20],"Consider,":[21],"for":[22,257],"instance,":[23],"that":[24],"in":[26,34,182],"Hadoop":[28,70,77,215],"ecosystem":[29,71],"have":[30],"been":[31],"successfully":[32],"used":[33,138,247],"industry":[36],"to":[37,41,121,139,157,196,248,281,285],"scale":[38,168],"their":[39],"processes":[40],"levels":[42],"much":[43],"larger":[44],"than":[45],"any":[46],"biological-or":[47],"health-driven":[48],"work":[49,55],"attempted":[50],"thus":[51,277],"far.":[52],"In":[53,187],"this":[54,189,220],"we":[56],"demonstrate":[57],"a":[61,170,250],"sequence":[62],"alignment":[63],"pipeline":[64,91,179,256],"based":[65,234],"on":[66,81,143,235,241],"from":[68,95,110,133,193],"-":[72,119],"namely,":[73],"Apache":[74,84,275],"Flink":[75],"MapReduce,":[78],"both":[79],"running":[80],"distributed":[83,106,131,165,269],"YARN":[85,145,206],"platform.":[86,146],"Unlike":[87],"previous":[88],"work,":[89],"our":[90,161],"starts":[92],"processing":[93],"directly":[94],"raw":[97],"BCL":[98,113],"data":[99,284],"produced":[100],"Illumina":[102,112,152,258],"sequencers.":[103],"A":[104],"Flink-based":[105],"algorithm":[107],"reconstructs":[108],"reads":[109],"data,":[114,259],"then":[116],"demultiplexes":[117],"them":[118],"analogously":[120],"bcl2fastq2":[123],"program":[124],"provided":[125,203],"Illumina.":[127],"Subsequently,":[128],"BWA-MEM-based":[130],"aligner":[132],"Seal":[135],"project":[136],"is":[137,163,233],"perform":[140],"read":[141],"mapping":[142],"While":[147],"standard":[149],"programs":[150],"BWA-MEM":[154],"are":[155],"limited":[156],"shared-memory":[158],"parallelism":[159],"(multi-threading),":[160],"solution":[162],"completely":[164],"across":[169],"large":[171],"number":[172,184],"computing":[174],"nodes.":[175,186],"Results":[176],"show":[177],"excellent":[178],"scalability,":[180],"linear":[181],"addition,":[188],"approach":[190,222],"automatically":[191],"benefits":[192],"robustness":[195],"hardware":[197],"failure":[198],"transient":[200],"cluster":[201],"problems":[202],"platform,":[207],"as":[208,210,274],"well":[209],"Distributed":[216],"File":[217],"System.":[218],"Moreover,":[219],"YARN-based":[221,253],"complements":[223],"up-and-coming":[225],"version":[226],"4":[227],"GATK":[230],"toolkit,":[231],"which":[232,260],"Spark":[236],"therefore":[238],"run":[240],"YARN.":[242],"Together,":[243],"they":[244],"be":[246,262],"form":[249],"scalable":[251],"complete":[252],"variant":[254],"calling":[255],"will":[261],"further":[263],"improved":[264],"with":[265],"arrival":[267],"in-memory":[270],"filesystem":[271],"technology":[272],"such":[273],"Arrow,":[276],"removing":[278],"need":[280],"write":[282],"intermediate":[283],"disk.":[286]},"counts_by_year":[{"year":2020,"cited_by_count":1},{"year":2019,"cited_by_count":1},{"year":2018,"cited_by_count":4},{"year":2017,"cited_by_count":2}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
