{"id":"https://openalex.org/W6925748783","doi":"https://doi.org/10.17877/de290r-23034","title":"Analysis and application of hash-based similarity estimation techniques for biological sequence analysis","display_name":"Analysis and application of hash-based similarity estimation techniques for biological sequence analysis","publication_year":2021,"publication_date":"2021-01-01","ids":{"openalex":"https://openalex.org/W6925748783","doi":"https://doi.org/10.17877/de290r-23034"},"language":"en","primary_location":{"id":"doi:10.17877/de290r-23034","is_oa":true,"landing_page_url":"https://doi.org/10.17877/de290r-23034","pdf_url":null,"source":{"id":"https://openalex.org/S4306400811","display_name":"Technische Universit\u00e4t Dortmund Eldorado (Technische Universit\u00e4t Dortmund)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I4210148506","host_organization_name":"Erich-Brost-Institut","host_organization_lineage":["https://openalex.org/I4210148506"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article-journal"},"type":"article","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.17877/de290r-23034","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Timm, Henning","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Timm, Henning","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":1,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.2785057,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":true,"primary_topic":{"id":"https://openalex.org/T10572","display_name":"Geophysical and Geoelectrical Methods","score":0.26019999384880066,"subfield":{"id":"https://openalex.org/subfields/1908","display_name":"Geophysics"},"field":{"id":"https://openalex.org/fields/19","display_name":"Earth and Planetary Sciences"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10572","display_name":"Geophysical and Geoelectrical Methods","score":0.26019999384880066,"subfield":{"id":"https://openalex.org/subfields/1908","display_name":"Geophysics"},"field":{"id":"https://openalex.org/fields/19","display_name":"Earth and Planetary Sciences"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10894","display_name":"Groundwater flow and contamination studies","score":0.17669999599456787,"subfield":{"id":"https://openalex.org/subfields/2305","display_name":"Environmental Engineering"},"field":{"id":"https://openalex.org/fields/23","display_name":"Environmental Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11609","display_name":"Geophysical Methods and Applications","score":0.14219999313354492,"subfield":{"id":"https://openalex.org/subfields/2212","display_name":"Ocean Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/hash-function","display_name":"Hash function","score":0.8137000203132629},{"id":"https://openalex.org/keywords/hash-table","display_name":"Hash table","score":0.5654000043869019},{"id":"https://openalex.org/keywords/nearest-neighbor-search","display_name":"Nearest neighbor search","score":0.5551000237464905},{"id":"https://openalex.org/keywords/sequence","display_name":"Sequence (biology)","score":0.5443999767303467},{"id":"https://openalex.org/keywords/similarity","display_name":"Similarity (geometry)","score":0.47519999742507935},{"id":"https://openalex.org/keywords/computation","display_name":"Computation","score":0.4440000057220459},{"id":"https://openalex.org/keywords/locality-sensitive-hashing","display_name":"Locality-sensitive hashing","score":0.44339999556541443},{"id":"https://openalex.org/keywords/biological-data","display_name":"Biological data","score":0.3952000141143799}],"concepts":[{"id":"https://openalex.org/C99138194","wikidata":"https://www.wikidata.org/wiki/Q183427","display_name":"Hash function","level":2,"score":0.8137000203132629},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6862000226974487},{"id":"https://openalex.org/C67388219","wikidata":"https://www.wikidata.org/wiki/Q207440","display_name":"Hash table","level":3,"score":0.5654000043869019},{"id":"https://openalex.org/C116738811","wikidata":"https://www.wikidata.org/wiki/Q608751","display_name":"Nearest neighbor search","level":2,"score":0.5551000237464905},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.551800012588501},{"id":"https://openalex.org/C2778112365","wikidata":"https://www.wikidata.org/wiki/Q3511065","display_name":"Sequence (biology)","level":2,"score":0.5443999767303467},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.49970000982284546},{"id":"https://openalex.org/C103278499","wikidata":"https://www.wikidata.org/wiki/Q254465","display_name":"Similarity (geometry)","level":3,"score":0.47519999742507935},{"id":"https://openalex.org/C45374587","wikidata":"https://www.wikidata.org/wiki/Q12525525","display_name":"Computation","level":2,"score":0.4440000057220459},{"id":"https://openalex.org/C74270461","wikidata":"https://www.wikidata.org/wiki/Q1625299","display_name":"Locality-sensitive hashing","level":4,"score":0.44339999556541443},{"id":"https://openalex.org/C201797286","wikidata":"https://www.wikidata.org/wiki/Q4914986","display_name":"Biological data","level":2,"score":0.3952000141143799},{"id":"https://openalex.org/C87431388","wikidata":"https://www.wikidata.org/wiki/Q2070573","display_name":"Perfect hash function","level":4,"score":0.3937000036239624},{"id":"https://openalex.org/C2780586882","wikidata":"https://www.wikidata.org/wiki/Q7520643","display_name":"Simple (philosophy)","level":2,"score":0.3725999891757965},{"id":"https://openalex.org/C138111711","wikidata":"https://www.wikidata.org/wiki/Q478351","display_name":"Double hashing","level":4,"score":0.3531999886035919},{"id":"https://openalex.org/C198531522","wikidata":"https://www.wikidata.org/wiki/Q485146","display_name":"Sample (material)","level":2,"score":0.34790000319480896},{"id":"https://openalex.org/C43214815","wikidata":"https://www.wikidata.org/wiki/Q7310987","display_name":"Reliability (semiconductor)","level":3,"score":0.3328999876976013},{"id":"https://openalex.org/C162319229","wikidata":"https://www.wikidata.org/wiki/Q175263","display_name":"Data structure","level":2,"score":0.3301999866962433},{"id":"https://openalex.org/C45235069","wikidata":"https://www.wikidata.org/wiki/Q278425","display_name":"Table (database)","level":2,"score":0.30090001225471497},{"id":"https://openalex.org/C2779808786","wikidata":"https://www.wikidata.org/wiki/Q6664603","display_name":"Locality","level":2,"score":0.2849999964237213},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.2782999873161316},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.2700999975204468},{"id":"https://openalex.org/C122907437","wikidata":"https://www.wikidata.org/wiki/Q5318999","display_name":"Dynamic perfect hashing","level":5,"score":0.26589998602867126},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.26190000772476196},{"id":"https://openalex.org/C129848803","wikidata":"https://www.wikidata.org/wiki/Q2564360","display_name":"Sample size determination","level":2,"score":0.26010000705718994}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.17877/de290r-23034","is_oa":true,"landing_page_url":"https://doi.org/10.17877/de290r-23034","pdf_url":null,"source":{"id":"https://openalex.org/S4306400811","display_name":"Technische Universit\u00e4t Dortmund Eldorado (Technische Universit\u00e4t Dortmund)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I4210148506","host_organization_name":"Erich-Brost-Institut","host_organization_lineage":["https://openalex.org/I4210148506"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article-journal"}],"best_oa_location":{"id":"doi:10.17877/de290r-23034","is_oa":true,"landing_page_url":"https://doi.org/10.17877/de290r-23034","pdf_url":null,"source":{"id":"https://openalex.org/S4306400811","display_name":"Technische Universit\u00e4t Dortmund Eldorado (Technische Universit\u00e4t Dortmund)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I4210148506","host_organization_name":"Erich-Brost-Institut","host_organization_lineage":["https://openalex.org/I4210148506"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article-journal"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"In":[0],"Bioinformatics,":[1],"a":[2,29,55,87,213,231,235,245,267,343,367,397,405],"large":[3,30],"group":[4],"of":[5,12,18,61,70,89,119,134,143,156,234,248,256,278,287,322,356,362],"problems":[6],"requires":[7],"the":[8,16,59,67,76,90,117,120,132,141,154,254,273,316,320,332,338,353,377,384],"computation":[9],"or":[10],"estimation":[11,69],"sequence":[13,20,71],"similarity.":[14],"However,":[15],"analysis":[17,255,289,309,425],"biological":[19,43,80,107,246,274],"data":[21,33,81,158,300],"has,":[22],"among":[23],"many":[24],"others,":[25],"three":[26],"capital":[27],"challenges:":[28],"amount":[31,133],"generated":[32,146,293,324],"which":[34,97],"contains":[35],"technology-specific":[36],"errors":[37,77],"(that":[38],"can":[39,82,114,151,401],"be":[40,50,83,402],"mistaken":[41],"for":[42,93,167,212,311,404,421],"signals),":[44],"and":[45,73,176,198,275,285,407,415],"that":[46,174,346,400,423],"might":[47],"need":[48],"to":[49,54,79,101,124,194,201,221,241,271,281,337,372],"analyzed":[51],"without":[52,387],"access":[53],"reference":[56],"genome.":[57],"Through":[58,109],"use":[60],"locality":[62],"sensitive":[63],"hashing":[64,240],"methods,":[65],"both":[66,115],"efficient":[68],"similarity":[72,249],"tolerance":[74],"against":[75],"specific":[78,339,420],"achieved.":[84],"We":[85,207,265],"developed":[86,307,342,366],"variant":[88],"winnowing":[91],"algorithm":[92],"local":[94],"minimizer":[95,126],"computation,":[96],"is":[98],"specifically":[99],"geared":[100],"deal":[102],"with":[103,189],"repetitive":[104],"regions":[105],"within":[106],"sequences.":[108],"compressing":[110],"redundant":[111,135],"information,":[112],"we":[113,150,229,252,306,341,365,395],"reduce":[116,131],"size":[118,155],"hash":[121,164,178,184,205,236],"tables":[122],"required":[123,157],"save":[125],"sketches,":[127],"as":[128,130,160,162,297,299],"well":[129,161,298,331],"low":[136],"quality":[137,321,379],"alignment":[138,223,390],"candidates.":[139,224],"Analyzing":[140],"distribution":[142],"segment":[144],"lengths":[145],"by":[147,294,325,383],"this":[148,168,279,363],"approach,":[149],"better":[152,283],"judge":[153],"structures,":[159],"identify":[163,222],"functions":[165,188],"feasible":[166],"technique.":[169],"Our":[170],"evaluation":[171],"could":[172],"verify":[173],"simple":[175],"fast":[177],"functions,":[179],"even":[180],"when":[181],"using":[182,217,238],"small":[183,190],"value":[185],"spaces":[186],"(hash":[187],"codomain),":[191],"are":[192,335],"sufficient":[193],"compute":[195],"compressed":[196,219],"minimizers":[197],"perform":[199],"comparable":[200],"uniformly":[202],"randomly":[203],"chosen":[204],"values.":[206],"also":[208,351],"outlined":[209],"an":[210,308,389,393],"index":[211],"taxonomic":[214],"protein":[215],"database":[216],"multiple":[218],"winnowings":[220],"To":[225],"store":[226],"MinHash":[227],"values,":[228],"present":[230],"cache-optimized":[232],"implementation":[233],"table":[237],"Hopscotch":[239],"resolve":[242],"collisions.":[243],"As":[244,360,392],"application":[247],"based":[250,314],"analysis,":[251],"describe":[253],"double":[257],"digest":[258],"restriction":[259],"site":[260],"associated":[261],"DNA":[262],"sequencing":[263,385],"(ddRADseq).":[264],"implemented":[266],"simulation":[268],"software":[269],"able":[270,371],"model":[272],"technological":[276],"influences":[277],"technology":[280],"allow":[282],"development":[284],"testing":[286],"ddRADseq":[288,312,422],"software.":[290,318],"Using":[291],"datasets":[292],"our":[295],"software,":[296],"obtained":[301],"from":[302],"population":[303],"genetic":[304],"experiments,":[305],"workflow":[310,345],"data,":[313],"on":[315,329],"Stacks":[317,326],"Since":[319],"results":[323],"strongly":[327],"depends":[328],"how":[330],"used":[333,403],"parameters":[334],"adapted":[336],"dataset,":[340],"Snakemake":[344],"automates":[347],"preprocessing":[348],"tasks":[349],"while":[350,411],"allowing":[352],"automatic":[354],"exploration":[355],"different":[357],"parameter":[358],"sets.":[359],"part":[361],"workflow,":[364],"PCR":[368],"deduplication":[369],"approach":[370,399],"generate":[373],"consensus":[374],"reads":[375],"incorporating":[376],"base":[378],"values":[380],"(as":[381],"reported":[382],"device),":[386],"performing":[388],"first.":[391],"outlook,":[394],"outline":[396],"MinHashing":[398],"faster":[406],"more":[408],"robust":[409],"clustering,":[410],"addressing":[412],"incomplete":[413],"digestion":[414],"null":[416],"alleles,":[417],"two":[418],"effects":[419],"current":[424],"tools":[426],"cannot":[427],"reliably":[428],"detect.":[429]},"counts_by_year":[],"updated_date":"2025-11-06T06:51:31.235846","created_date":"2025-10-10T00:00:00"}
