{"id":"https://openalex.org/W3133954690","doi":"https://doi.org/10.1093/bioinformatics/btab323","title":"Compact and evenly distributed <i>k</i>-mer binning for genomic sequences","display_name":"Compact and evenly distributed <i>k</i>-mer binning for genomic sequences","publication_year":2021,"publication_date":"2021-05-01","ids":{"openalex":"https://openalex.org/W3133954690","doi":"https://doi.org/10.1093/bioinformatics/btab323","mag":"3133954690","pmid":"https://pubmed.ncbi.nlm.nih.gov/33970231"},"language":"en","primary_location":{"id":"doi:10.1093/bioinformatics/btab323","is_oa":true,"landing_page_url":"https://doi.org/10.1093/bioinformatics/btab323","pdf_url":null,"source":{"id":"https://openalex.org/S52395412","display_name":"Bioinformatics","issn_l":"1367-4803","issn":["1367-4803","1367-4811"],"is_oa":false,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310311648","host_organization_name":"Oxford University Press","host_organization_lineage":["https://openalex.org/P4310311648","https://openalex.org/P4310311647"],"host_organization_lineage_names":["Oxford University Press","University of Oxford"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Bioinformatics","raw_type":"journal-article"},"type":"erratum","indexed_in":["crossref","doaj","pubmed"],"open_access":{"is_oa":true,"oa_status":"hybrid","oa_url":"https://doi.org/10.1093/bioinformatics/btab323","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5009515716","display_name":"Johan Nystr\u00f6m-Persson","orcid":"https://orcid.org/0000-0002-7951-4762"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Johan Nystr\u00f6m-Persson","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5062206228","display_name":"Gabriel Keeble\u2010Gagn\u00e8re","orcid":"https://orcid.org/0000-0002-9165-0724"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gabriel Keeble-Gagn\u00e8re","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5057228948","display_name":"Niamat Zawad","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Niamat Zawad","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5009515716"],"corresponding_institution_ids":[],"apc_list":{"value":3618,"currency":"USD","value_usd":3618},"apc_paid":{"value":3618,"currency":"USD","value_usd":3618},"fwci":0.8134,"has_fulltext":false,"cited_by_count":10,"citation_normalized_percentile":{"value":0.70369515,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":89,"max":99},"biblio":{"volume":"37","issue":"9","first_page":"1338","last_page":"1338"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10015","display_name":"Genomics and Phylogenetic Studies","score":0.9995999932289124,"subfield":{"id":"https://openalex.org/subfields/1312","display_name":"Molecular Biology"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},"topics":[{"id":"https://openalex.org/T10015","display_name":"Genomics and Phylogenetic Studies","score":0.9995999932289124,"subfield":{"id":"https://openalex.org/subfields/1312","display_name":"Molecular Biology"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},{"id":"https://openalex.org/T11269","display_name":"Algorithms and Data Compression","score":0.9890999794006348,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12254","display_name":"Machine Learning in Bioinformatics","score":0.9854999780654907,"subfield":{"id":"https://openalex.org/subfields/1312","display_name":"Molecular Biology"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/k-mer","display_name":"k-mer","score":0.7389631867408752},{"id":"https://openalex.org/keywords/spark","display_name":"SPARK (programming language)","score":0.722719669342041},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6659929156303406},{"id":"https://openalex.org/keywords/metagenomics","display_name":"Metagenomics","score":0.6540156602859497},{"id":"https://openalex.org/keywords/bin","display_name":"Bin","score":0.5334427952766418},{"id":"https://openalex.org/keywords/genome","display_name":"Genome","score":0.4517979323863983},{"id":"https://openalex.org/keywords/algorithm","display_name":"Algorithm","score":0.40286165475845337},{"id":"https://openalex.org/keywords/data-mining","display_name":"Data mining","score":0.3866017460823059},{"id":"https://openalex.org/keywords/theoretical-computer-science","display_name":"Theoretical computer science","score":0.33647435903549194},{"id":"https://openalex.org/keywords/biology","display_name":"Biology","score":0.22584718465805054},{"id":"https://openalex.org/keywords/gene","display_name":"Gene","score":0.09814760088920593}],"concepts":[{"id":"https://openalex.org/C2279292","wikidata":"https://www.wikidata.org/wiki/Q6322851","display_name":"k-mer","level":4,"score":0.7389631867408752},{"id":"https://openalex.org/C2781215313","wikidata":"https://www.wikidata.org/wiki/Q3493345","display_name":"SPARK (programming language)","level":2,"score":0.722719669342041},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6659929156303406},{"id":"https://openalex.org/C15151743","wikidata":"https://www.wikidata.org/wiki/Q903778","display_name":"Metagenomics","level":3,"score":0.6540156602859497},{"id":"https://openalex.org/C156273044","wikidata":"https://www.wikidata.org/wiki/Q4913766","display_name":"Bin","level":2,"score":0.5334427952766418},{"id":"https://openalex.org/C141231307","wikidata":"https://www.wikidata.org/wiki/Q7020","display_name":"Genome","level":3,"score":0.4517979323863983},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.40286165475845337},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.3866017460823059},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.33647435903549194},{"id":"https://openalex.org/C86803240","wikidata":"https://www.wikidata.org/wiki/Q420","display_name":"Biology","level":0,"score":0.22584718465805054},{"id":"https://openalex.org/C104317684","wikidata":"https://www.wikidata.org/wiki/Q7187","display_name":"Gene","level":2,"score":0.09814760088920593},{"id":"https://openalex.org/C55493867","wikidata":"https://www.wikidata.org/wiki/Q7094","display_name":"Biochemistry","level":1,"score":0.0},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.0}],"mesh":[],"locations_count":3,"locations":[{"id":"doi:10.1093/bioinformatics/btab323","is_oa":true,"landing_page_url":"https://doi.org/10.1093/bioinformatics/btab323","pdf_url":null,"source":{"id":"https://openalex.org/S52395412","display_name":"Bioinformatics","issn_l":"1367-4803","issn":["1367-4803","1367-4811"],"is_oa":false,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310311648","host_organization_name":"Oxford University Press","host_organization_lineage":["https://openalex.org/P4310311648","https://openalex.org/P4310311647"],"host_organization_lineage_names":["Oxford University Press","University of Oxford"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Bioinformatics","raw_type":"journal-article"},{"id":"pmid:33970231","is_oa":false,"landing_page_url":"https://pubmed.ncbi.nlm.nih.gov/33970231","pdf_url":null,"source":{"id":"https://openalex.org/S4306525036","display_name":"PubMed","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I1299303238","host_organization_name":"National Institutes of Health","host_organization_lineage":["https://openalex.org/I1299303238"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Bioinformatics (Oxford, England)","raw_type":null},{"id":"pmh:oai:pubmedcentral.nih.gov:8189688","is_oa":true,"landing_page_url":"https://www.ncbi.nlm.nih.gov/pmc/articles/8189688","pdf_url":null,"source":{"id":"https://openalex.org/S2764455111","display_name":"PubMed Central","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I1299303238","host_organization_name":"National Institutes of Health","host_organization_lineage":["https://openalex.org/I1299303238"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"Bioinformatics","raw_type":"Text"}],"best_oa_location":{"id":"doi:10.1093/bioinformatics/btab323","is_oa":true,"landing_page_url":"https://doi.org/10.1093/bioinformatics/btab323","pdf_url":null,"source":{"id":"https://openalex.org/S52395412","display_name":"Bioinformatics","issn_l":"1367-4803","issn":["1367-4803","1367-4811"],"is_oa":false,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310311648","host_organization_name":"Oxford University Press","host_organization_lineage":["https://openalex.org/P4310311648","https://openalex.org/P4310311647"],"host_organization_lineage_names":["Oxford University Press","University of Oxford"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Bioinformatics","raw_type":"journal-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":21,"referenced_works":["https://openalex.org/W1969346416","https://openalex.org/W2057253402","https://openalex.org/W2125266506","https://openalex.org/W2127175247","https://openalex.org/W2144560237","https://openalex.org/W2159954944","https://openalex.org/W2574494928","https://openalex.org/W2583363792","https://openalex.org/W2597444305","https://openalex.org/W2748594262","https://openalex.org/W2763390627","https://openalex.org/W2811072203","https://openalex.org/W2897927784","https://openalex.org/W2950261807","https://openalex.org/W2950354111","https://openalex.org/W2951822379","https://openalex.org/W2952224143","https://openalex.org/W2977580499","https://openalex.org/W3016701095","https://openalex.org/W3042377559","https://openalex.org/W3043768949"],"related_works":["https://openalex.org/W4298353702","https://openalex.org/W2949565680","https://openalex.org/W2339602899","https://openalex.org/W2902099752","https://openalex.org/W2964143144","https://openalex.org/W1997673861","https://openalex.org/W1971523492","https://openalex.org/W2057215967","https://openalex.org/W4232608576","https://openalex.org/W2912541378"],"abstract_inverted_index":{"MOTIVATION":[0],"The":[1],"processing":[2,16],"of":[3,6,13,63,124,148,187,190],"k-mers":[4,46],"(subsequences":[5],"length":[7],"k)":[8],"is":[9,206],"at":[10,211,219],"the":[11,122,141,188,195],"foundation":[12],"many":[14],"sequence":[15],"algorithms":[17],"in":[18,53,128,182],"bioinformatics,":[19],"including":[20],"k-mer":[21,110,153,176,200],"counting":[22,111,177,201],"for":[23,32,71,94],"genome":[24,27],"size":[25],"estimation,":[26],"assembly,":[28],"and":[29,73,151,162,209],"taxonomic":[30],"classification":[31],"metagenomics.":[33],"Minimizers-ordered":[34],"m-mers":[35],"where":[36],"m":[37],"<":[38],"k":[39],"-":[40],"are":[41,58,217],"often":[42],"used":[43],"to":[44,60,120,132,173],"group":[45],"into":[47],"bins":[48,62],"as":[49,76,78,183,185],"a":[50,108,145,179],"first":[51],"step":[52],"such":[54],"processing.":[55],"However,":[56],"minimizers":[57,150],"known":[59],"generate":[61],"very":[64],"different":[65],"sizes,":[66],"which":[67,117,156],"can":[68],"pose":[69],"challenges":[70],"distributed":[72,109,160,175,199],"parallel":[74],"processing,":[75],"well":[77],"generally":[79],"increase":[80],"memory":[81,189],"requirements.":[82],"Furthermore,":[83],"although":[84],"various":[85,125],"minimizer":[86,126],"orderings":[87,127],"have":[88],"been":[89,101],"proposed,":[90],"their":[91],"practical":[92],"value":[93],"improving":[95],"tool":[96,112],"efficiency":[97],"has":[98],"not":[99],"yet":[100],"fully":[102],"explored.":[103],"RESULTS":[104],"We":[105,166],"present":[106],"Discount,":[107],"based":[113],"on":[114,178],"Apache":[115],"Spark,":[116],"we":[118,138],"use":[119],"investigate":[121],"behaviour":[123],"practice":[129],"when":[130],"applied":[131],"metagenomics":[133],"data.":[134],"Using":[135],"this":[136,169],"tool,":[137],"then":[139],"introduce":[140],"universal":[142,152],"frequency":[143],"ordering,":[144],"new":[146],"combination":[147],"frequencysampled":[149],"hitting":[154],"sets,":[155],"yields":[157],"both":[158],"evenly":[159],"binning":[161],"small":[163],"bin":[164],"sizes.":[165],"show":[167],"that":[168],"ordering":[170],"allows":[171],"Discount":[172,205],"perform":[174],"large":[180],"dataset":[181],"little":[184],"1/8":[186],"comparable":[191],"approaches,":[192],"making":[193],"it":[194],"most":[196],"efficient":[197],"out-of-core":[198],"method":[202],"available.":[203],"AVAILABILITY":[204],"GPL":[207],"licensed":[208],"available":[210,218],"https://github.com/jtnystrom/discount.":[212],"SUPPLEMENTARY":[213],"INFORMATION":[214],"Supplementary":[215],"data":[216],"Bioinformatics":[220],"online.":[221]},"counts_by_year":[{"year":2024,"cited_by_count":2},{"year":2023,"cited_by_count":7},{"year":2021,"cited_by_count":1}],"updated_date":"2026-04-21T08:09:41.155169","created_date":"2021-03-15T00:00:00"}
