{"id":"https://openalex.org/W2945602030","doi":"https://doi.org/10.1145/3309129.3309134","title":"A Study on Optimizing MarkDuplicate in Genome Sequencing Pipeline","display_name":"A Study on Optimizing MarkDuplicate in Genome Sequencing Pipeline","publication_year":2018,"publication_date":"2018-12-27","ids":{"openalex":"https://openalex.org/W2945602030","doi":"https://doi.org/10.1145/3309129.3309134","mag":"2945602030"},"language":"en","primary_location":{"id":"doi:10.1145/3309129.3309134","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3309129.3309134","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3309129.3309134","source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2018 5th International Conference on Bioinformatics Research and Applications","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://dl.acm.org/doi/pdf/10.1145/3309129.3309134","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5055435419","display_name":"Qi Zhao","orcid":"https://orcid.org/0000-0002-8683-6145"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Qi Zhao","raw_affiliation_strings":["Computer Science Department, Engineering VI, UCLA, Los Angeles, CA"],"affiliations":[{"raw_affiliation_string":"Computer Science Department, Engineering VI, UCLA, Los Angeles, CA","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":1,"corresponding_author_ids":["https://openalex.org/A5055435419"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.5536,"has_fulltext":true,"cited_by_count":8,"citation_normalized_percentile":{"value":0.72539484,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":89,"max":97},"biblio":{"volume":null,"issue":null,"first_page":"8","last_page":"15"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11181","display_name":"Advanced Data Storage Technologies","score":0.9986000061035156,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11181","display_name":"Advanced Data Storage Technologies","score":0.9986000061035156,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11986","display_name":"Scientific Computing and Data Management","score":0.9968000054359436,"subfield":{"id":"https://openalex.org/subfields/1802","display_name":"Information Systems and Management"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T10015","display_name":"Genomics and Phylogenetic Studies","score":0.9890999794006348,"subfield":{"id":"https://openalex.org/subfields/1312","display_name":"Molecular Biology"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8283121585845947},{"id":"https://openalex.org/keywords/speedup","display_name":"Speedup","score":0.7900815010070801},{"id":"https://openalex.org/keywords/pipeline","display_name":"Pipeline (software)","score":0.5666781067848206},{"id":"https://openalex.org/keywords/java","display_name":"Java","score":0.5308120250701904},{"id":"https://openalex.org/keywords/software","display_name":"Software","score":0.5217336416244507},{"id":"https://openalex.org/keywords/scala","display_name":"Scala","score":0.5044342279434204},{"id":"https://openalex.org/keywords/cloud-computing","display_name":"Cloud computing","score":0.46822163462638855},{"id":"https://openalex.org/keywords/spark","display_name":"SPARK (programming language)","score":0.4321630895137787},{"id":"https://openalex.org/keywords/thread","display_name":"Thread (computing)","score":0.4201657474040985},{"id":"https://openalex.org/keywords/genome","display_name":"Genome","score":0.4117887020111084},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.3710075616836548},{"id":"https://openalex.org/keywords/operating-system","display_name":"Operating system","score":0.21427783370018005}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8283121585845947},{"id":"https://openalex.org/C68339613","wikidata":"https://www.wikidata.org/wiki/Q1549489","display_name":"Speedup","level":2,"score":0.7900815010070801},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.5666781067848206},{"id":"https://openalex.org/C548217200","wikidata":"https://www.wikidata.org/wiki/Q251","display_name":"Java","level":2,"score":0.5308120250701904},{"id":"https://openalex.org/C2777904410","wikidata":"https://www.wikidata.org/wiki/Q7397","display_name":"Software","level":2,"score":0.5217336416244507},{"id":"https://openalex.org/C109701466","wikidata":"https://www.wikidata.org/wiki/Q460584","display_name":"Scala","level":3,"score":0.5044342279434204},{"id":"https://openalex.org/C79974875","wikidata":"https://www.wikidata.org/wiki/Q483639","display_name":"Cloud computing","level":2,"score":0.46822163462638855},{"id":"https://openalex.org/C2781215313","wikidata":"https://www.wikidata.org/wiki/Q3493345","display_name":"SPARK (programming language)","level":2,"score":0.4321630895137787},{"id":"https://openalex.org/C138101251","wikidata":"https://www.wikidata.org/wiki/Q213092","display_name":"Thread (computing)","level":2,"score":0.4201657474040985},{"id":"https://openalex.org/C141231307","wikidata":"https://www.wikidata.org/wiki/Q7020","display_name":"Genome","level":3,"score":0.4117887020111084},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.3710075616836548},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.21427783370018005},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.0},{"id":"https://openalex.org/C104317684","wikidata":"https://www.wikidata.org/wiki/Q7187","display_name":"Gene","level":2,"score":0.0},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.0},{"id":"https://openalex.org/C55493867","wikidata":"https://www.wikidata.org/wiki/Q7094","display_name":"Biochemistry","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3309129.3309134","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3309129.3309134","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3309129.3309134","source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2018 5th International Conference on Bioinformatics Research and Applications","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/3309129.3309134","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3309129.3309134","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3309129.3309134","source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2018 5th International Conference on Bioinformatics Research and Applications","raw_type":"proceedings-article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/9","display_name":"Industry, innovation and infrastructure","score":0.44999998807907104}],"awards":[],"funders":[],"has_content":{"pdf":true,"grobid_xml":true},"content_urls":{"pdf":"https://content.openalex.org/works/W2945602030.pdf","grobid_xml":"https://content.openalex.org/works/W2945602030.grobid-xml"},"referenced_works_count":21,"referenced_works":["https://openalex.org/W1495226832","https://openalex.org/W1973950074","https://openalex.org/W1996181407","https://openalex.org/W2012016911","https://openalex.org/W2061680337","https://openalex.org/W2074935284","https://openalex.org/W2083854815","https://openalex.org/W2086399100","https://openalex.org/W2095680943","https://openalex.org/W2098131178","https://openalex.org/W2108234281","https://openalex.org/W2118526609","https://openalex.org/W2119180969","https://openalex.org/W2128505445","https://openalex.org/W2131975293","https://openalex.org/W2132378736","https://openalex.org/W2141978878","https://openalex.org/W2189371416","https://openalex.org/W2189465200","https://openalex.org/W6687322159","https://openalex.org/W6891709499"],"related_works":["https://openalex.org/W2058965144","https://openalex.org/W2763674625","https://openalex.org/W4232952072","https://openalex.org/W4252019479","https://openalex.org/W3040669356","https://openalex.org/W2337519567","https://openalex.org/W2548921709","https://openalex.org/W4252076541","https://openalex.org/W2620395718","https://openalex.org/W2950624501"],"abstract_inverted_index":{"MarkDuplicate":[0,42,61,134,157,186],"is":[1,18,163,231],"typically":[2],"one":[3],"of":[4,94,133,150,167,214],"the":[5,10,86,91,114,124,129,181,191,200,211,226],"most":[6],"time-consuming":[7],"operations":[8],"in":[9,26,33,62,237],"whole":[11,216],"genome":[12,27,35,139,143,217],"sequencing":[13],"pipeline.":[14],"Picard":[15,202],"tool,":[16],"which":[17,50],"widely":[19],"used":[20,220],"by":[21,99,145,159,234],"biologists":[22],"to":[23,44,80,110,180],"sort":[24],"reads":[25,32,98],"data":[28,144,218],"and":[29,118,141,154,174,204],"mark":[30],"duplicate":[31,97],"sorted":[34],"data,":[36],"has":[37,51],"relatively":[38],"low":[39],"performance":[40,197],"on":[41,55,74,165],"due":[43],"its":[45],"single-thread":[46],"sequential":[47],"Java":[48],"implementation,":[49],"caused":[52],"serious":[53],"impact":[54],"nowadays":[56],"bioinformatic":[57,77,83],"researches.":[58,84],"To":[59],"accelerate":[60],"Picard,":[63],"we":[64,89,107,127,219,229],"present":[65],"our":[66,184],"two-stage":[67],"optimization":[68],"solution":[69],"as":[70,135],"a":[71,105],"preliminary":[72],"study":[73],"next":[75,125],"generation":[76],"software":[78],"tools":[79],"better":[81,196],"serve":[82],"In":[85],"first":[87],"stage,":[88,126],"improve":[90],"original":[92,201],"algorithm":[93],"tracking":[95],"optical":[96],"eliminating":[100],"large":[101],"redundant":[102],"operations.":[103],"As":[104],"consequence,":[106],"achieve":[108],"up":[109],"50X":[111],"speedup":[112],"for":[113,221],"second":[115],"step":[116],"only":[117,190],"9.57X":[119],"overall":[120],"process":[121],"speedup.":[122],"At":[123],"redesign":[128],"I/O":[130],"processing":[131],"mechanism":[132],"transforming":[136],"between":[137],"on-disk":[138],"file":[140,177],"in-memory":[142],"using":[146],"ADAM":[147],"format":[148],"instead":[149],"previous":[151],"SAM":[152],"format,":[153],"implement":[155],"cloud-scale":[156],"application":[158],"Scala.":[160],"Our":[161],"evaluation":[162,182,222],"performed":[164],"top":[166],"Spark":[168],"cluster":[169],"with":[170,199],"25":[171],"worker":[172],"nodes":[173],"Hadoop":[175],"distributed":[176],"system.":[178],"According":[179],"results,":[183],"cloudscale":[185],"can":[187],"provide":[188],"not":[189],"same":[192],"output":[193],"but":[194],"also":[195],"compared":[198],"tool":[203],"other":[205],"existing":[206],"similar":[207],"tools.":[208],"Specifically,":[209],"among":[210],"13":[212],"sets":[213],"real":[215],"at":[223],"both":[224],"stages,":[225],"best":[227],"improvement":[228,240],"gain":[230],"reducing":[232],"runtime":[233],"92":[235],"hours":[236],"total.":[238],"Average":[239],"reaches":[241],"48.69":[242],"decreasing":[243],"hours.":[244]},"counts_by_year":[{"year":2025,"cited_by_count":1},{"year":2024,"cited_by_count":3},{"year":2023,"cited_by_count":1},{"year":2021,"cited_by_count":3}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
