{"id":"https://openalex.org/W4220691725","doi":"https://doi.org/10.1371/journal.pcbi.1009492","title":"Constructing benchmark test sets for biological sequence analysis using independent set algorithms","display_name":"Constructing benchmark test sets for biological sequence analysis using independent set algorithms","publication_year":2022,"publication_date":"2022-03-07","ids":{"openalex":"https://openalex.org/W4220691725","doi":"https://doi.org/10.1371/journal.pcbi.1009492","pmid":"https://pubmed.ncbi.nlm.nih.gov/35255082"},"language":"en","primary_location":{"id":"doi:10.1371/journal.pcbi.1009492","is_oa":true,"landing_page_url":"https://doi.org/10.1371/journal.pcbi.1009492","pdf_url":null,"source":{"id":"https://openalex.org/S86033158","display_name":"PLoS Computational Biology","issn_l":"1553-734X","issn":["1553-734X","1553-7358"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310315706","host_organization_name":"Public Library of Science","host_organization_lineage":["https://openalex.org/P4310315706"],"host_organization_lineage_names":["Public Library of Science"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"PLOS Computational Biology","raw_type":"journal-article"},"type":"article","indexed_in":["crossref","doaj","pubmed"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://doi.org/10.1371/journal.pcbi.1009492","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5056525617","display_name":"Samantha Petti","orcid":"https://orcid.org/0000-0001-8281-8161"},"institutions":[{"id":"https://openalex.org/I136199984","display_name":"Harvard University","ror":"https://ror.org/03vek6s52","country_code":"US","type":"education","lineage":["https://openalex.org/I136199984"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Samantha Petti","raw_affiliation_strings":["NSF-Simons Center for the Mathematical and Statistical Analysis of Biology, Harvard University, Cambridge, Massachusetts, United States of America"],"raw_orcid":"https://orcid.org/0000-0001-8281-8161","affiliations":[{"raw_affiliation_string":"NSF-Simons Center for the Mathematical and Statistical Analysis of Biology, Harvard University, Cambridge, Massachusetts, United States of America","institution_ids":["https://openalex.org/I136199984"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5020118397","display_name":"Sean R. Eddy","orcid":"https://orcid.org/0000-0001-6676-4706"},"institutions":[{"id":"https://openalex.org/I1344073410","display_name":"Howard Hughes Medical Institute","ror":"https://ror.org/006w34k90","country_code":"US","type":"facility","lineage":["https://openalex.org/I1344073410"]},{"id":"https://openalex.org/I136199984","display_name":"Harvard University","ror":"https://ror.org/03vek6s52","country_code":"US","type":"education","lineage":["https://openalex.org/I136199984"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Sean R. Eddy","raw_affiliation_strings":["Howard Hughes Medical Institute; Department of Molecular & Cellular Biology; and John A. Paulson School of Engineering and Applied Sciences, Harvard University, Cambridge, Massachusetts, United States of America","John A. Paulson School of Engineering and Applied Sciences, Harvard University, Cambridge, Massachusetts, United States of America","Howard Hughes Medical Institute","Department of Molecular & Cellular Biology"],"raw_orcid":"https://orcid.org/0000-0001-6676-4706","affiliations":[{"raw_affiliation_string":"Howard Hughes Medical Institute; Department of Molecular & Cellular Biology; and John A. Paulson School of Engineering and Applied Sciences, Harvard University, Cambridge, Massachusetts, United States of America","institution_ids":["https://openalex.org/I1344073410"]},{"raw_affiliation_string":"John A. Paulson School of Engineering and Applied Sciences, Harvard University, Cambridge, Massachusetts, United States of America","institution_ids":["https://openalex.org/I136199984"]},{"raw_affiliation_string":"Howard Hughes Medical Institute","institution_ids":["https://openalex.org/I1344073410"]},{"raw_affiliation_string":"Department of Molecular & Cellular Biology","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":2,"corresponding_author_ids":["https://openalex.org/A5020118397"],"corresponding_institution_ids":["https://openalex.org/I1344073410","https://openalex.org/I136199984"],"apc_list":{"value":2655,"currency":"USD","value_usd":2655},"apc_paid":{"value":2655,"currency":"USD","value_usd":2655},"fwci":1.8227,"has_fulltext":false,"cited_by_count":21,"citation_normalized_percentile":{"value":0.8524964,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":97,"max":99},"biblio":{"volume":"18","issue":"3","first_page":"e1009492","last_page":"e1009492"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10015","display_name":"Genomics and Phylogenetic Studies","score":0.9987999796867371,"subfield":{"id":"https://openalex.org/subfields/1312","display_name":"Molecular Biology"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},"topics":[{"id":"https://openalex.org/T10015","display_name":"Genomics and Phylogenetic Studies","score":0.9987999796867371,"subfield":{"id":"https://openalex.org/subfields/1312","display_name":"Molecular Biology"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},{"id":"https://openalex.org/T10222","display_name":"Genomics and Chromatin Dynamics","score":0.9941999912261963,"subfield":{"id":"https://openalex.org/subfields/1312","display_name":"Molecular Biology"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},{"id":"https://openalex.org/T10521","display_name":"RNA and protein synthesis mechanisms","score":0.9915000200271606,"subfield":{"id":"https://openalex.org/subfields/1312","display_name":"Molecular Biology"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/sequence","display_name":"Sequence (biology)","score":0.7462998628616333},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.6984658241271973},{"id":"https://openalex.org/keywords/algorithm","display_name":"Algorithm","score":0.6263496279716492},{"id":"https://openalex.org/keywords/set","display_name":"Set (abstract data type)","score":0.5532934069633484},{"id":"https://openalex.org/keywords/benchmarking","display_name":"Benchmarking","score":0.5339032411575317},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.5284005999565125},{"id":"https://openalex.org/keywords/test-set","display_name":"Test set","score":0.5104981660842896},{"id":"https://openalex.org/keywords/alignment-free-sequence-analysis","display_name":"Alignment-free sequence analysis","score":0.4883422553539276},{"id":"https://openalex.org/keywords/sequence-analysis","display_name":"Sequence analysis","score":0.46159693598747253},{"id":"https://openalex.org/keywords/multiple-sequence-alignment","display_name":"Multiple sequence alignment","score":0.45234501361846924},{"id":"https://openalex.org/keywords/sequence-alignment","display_name":"Sequence alignment","score":0.4468016028404236},{"id":"https://openalex.org/keywords/graph","display_name":"Graph","score":0.4174458980560303},{"id":"https://openalex.org/keywords/test","display_name":"Test (biology)","score":0.41317975521087646},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.3349360525608063},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.29530417919158936},{"id":"https://openalex.org/keywords/theoretical-computer-science","display_name":"Theoretical computer science","score":0.2304670512676239},{"id":"https://openalex.org/keywords/biology","display_name":"Biology","score":0.1757468581199646},{"id":"https://openalex.org/keywords/genetics","display_name":"Genetics","score":0.11462387442588806},{"id":"https://openalex.org/keywords/peptide-sequence","display_name":"Peptide sequence","score":0.10968229174613953}],"concepts":[{"id":"https://openalex.org/C2778112365","wikidata":"https://www.wikidata.org/wiki/Q3511065","display_name":"Sequence (biology)","level":2,"score":0.7462998628616333},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.6984658241271973},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.6263496279716492},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.5532934069633484},{"id":"https://openalex.org/C86251818","wikidata":"https://www.wikidata.org/wiki/Q816754","display_name":"Benchmarking","level":2,"score":0.5339032411575317},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5284005999565125},{"id":"https://openalex.org/C169903167","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Test set","level":2,"score":0.5104981660842896},{"id":"https://openalex.org/C180384323","wikidata":"https://www.wikidata.org/wiki/Q16335137","display_name":"Alignment-free sequence analysis","level":5,"score":0.4883422553539276},{"id":"https://openalex.org/C61053724","wikidata":"https://www.wikidata.org/wiki/Q1154615","display_name":"Sequence analysis","level":3,"score":0.46159693598747253},{"id":"https://openalex.org/C88031987","wikidata":"https://www.wikidata.org/wiki/Q1377767","display_name":"Multiple sequence alignment","level":5,"score":0.45234501361846924},{"id":"https://openalex.org/C45484198","wikidata":"https://www.wikidata.org/wiki/Q827246","display_name":"Sequence alignment","level":4,"score":0.4468016028404236},{"id":"https://openalex.org/C132525143","wikidata":"https://www.wikidata.org/wiki/Q141488","display_name":"Graph","level":2,"score":0.4174458980560303},{"id":"https://openalex.org/C2777267654","wikidata":"https://www.wikidata.org/wiki/Q3519023","display_name":"Test (biology)","level":2,"score":0.41317975521087646},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.3349360525608063},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.29530417919158936},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.2304670512676239},{"id":"https://openalex.org/C86803240","wikidata":"https://www.wikidata.org/wiki/Q420","display_name":"Biology","level":0,"score":0.1757468581199646},{"id":"https://openalex.org/C54355233","wikidata":"https://www.wikidata.org/wiki/Q7162","display_name":"Genetics","level":1,"score":0.11462387442588806},{"id":"https://openalex.org/C167625842","wikidata":"https://www.wikidata.org/wiki/Q899763","display_name":"Peptide sequence","level":3,"score":0.10968229174613953},{"id":"https://openalex.org/C151730666","wikidata":"https://www.wikidata.org/wiki/Q7205","display_name":"Paleontology","level":1,"score":0.0},{"id":"https://openalex.org/C144133560","wikidata":"https://www.wikidata.org/wiki/Q4830453","display_name":"Business","level":0,"score":0.0},{"id":"https://openalex.org/C205649164","wikidata":"https://www.wikidata.org/wiki/Q1071","display_name":"Geography","level":0,"score":0.0},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.0},{"id":"https://openalex.org/C162853370","wikidata":"https://www.wikidata.org/wiki/Q39809","display_name":"Marketing","level":1,"score":0.0},{"id":"https://openalex.org/C104317684","wikidata":"https://www.wikidata.org/wiki/Q7187","display_name":"Gene","level":2,"score":0.0},{"id":"https://openalex.org/C13280743","wikidata":"https://www.wikidata.org/wiki/Q131089","display_name":"Geodesy","level":1,"score":0.0}],"mesh":[{"descriptor_ui":"D000465","descriptor_name":"Algorithms","qualifier_ui":null,"qualifier_name":null,"is_major_topic":true},{"descriptor_ui":"D000465","descriptor_name":"Algorithms","qualifier_ui":null,"qualifier_name":null,"is_major_topic":true},{"descriptor_ui":"D000465","descriptor_name":"Algorithms","qualifier_ui":null,"qualifier_name":null,"is_major_topic":true},{"descriptor_ui":"D017421","descriptor_name":"Sequence Analysis","qualifier_ui":null,"qualifier_name":null,"is_major_topic":false},{"descriptor_ui":"D017421","descriptor_name":"Sequence Analysis","qualifier_ui":null,"qualifier_name":null,"is_major_topic":false},{"descriptor_ui":"D017421","descriptor_name":"Sequence Analysis","qualifier_ui":null,"qualifier_name":null,"is_major_topic":false},{"descriptor_ui":"D019985","descriptor_name":"Benchmarking","qualifier_ui":null,"qualifier_name":null,"is_major_topic":true}],"locations_count":5,"locations":[{"id":"doi:10.1371/journal.pcbi.1009492","is_oa":true,"landing_page_url":"https://doi.org/10.1371/journal.pcbi.1009492","pdf_url":null,"source":{"id":"https://openalex.org/S86033158","display_name":"PLoS Computational Biology","issn_l":"1553-734X","issn":["1553-734X","1553-7358"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310315706","host_organization_name":"Public Library of Science","host_organization_lineage":["https://openalex.org/P4310315706"],"host_organization_lineage_names":["Public Library of Science"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"PLOS Computational Biology","raw_type":"journal-article"},{"id":"pmid:35255082","is_oa":false,"landing_page_url":"https://pubmed.ncbi.nlm.nih.gov/35255082","pdf_url":null,"source":{"id":"https://openalex.org/S4306525036","display_name":"PubMed","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I1299303238","host_organization_name":"National Institutes of Health","host_organization_lineage":["https://openalex.org/I1299303238"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"PLoS computational biology","raw_type":null},{"id":"pmh:oai:RePEc:plo:pcbi00:1009492","is_oa":false,"landing_page_url":"https://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1009492","pdf_url":null,"source":{"id":"https://openalex.org/S4306401271","display_name":"RePEc: Research Papers in Economics","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I77793887","host_organization_name":"Federal Reserve Bank of St. Louis","host_organization_lineage":["https://openalex.org/I77793887"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},{"id":"pmh:oai:doaj.org/article:18c1a0210b59456abeb5f34c4f02fe4f","is_oa":true,"landing_page_url":"https://doaj.org/article/18c1a0210b59456abeb5f34c4f02fe4f","pdf_url":null,"source":{"id":"https://openalex.org/S112646816","display_name":"SHILAP Revista de lepidopterolog\u00eda","issn_l":"0300-5267","issn":["0300-5267","2340-4078"],"is_oa":true,"is_in_doaj":true,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"PLoS Computational Biology, Vol 18, Iss 3, p e1009492 (2022)","raw_type":"article"},{"id":"pmh:oai:pubmedcentral.nih.gov:8929697","is_oa":true,"landing_page_url":"https://www.ncbi.nlm.nih.gov/pmc/articles/8929697","pdf_url":null,"source":{"id":"https://openalex.org/S2764455111","display_name":"PubMed Central","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I1299303238","host_organization_name":"National Institutes of Health","host_organization_lineage":["https://openalex.org/I1299303238"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"PLoS Comput Biol","raw_type":"Text"}],"best_oa_location":{"id":"doi:10.1371/journal.pcbi.1009492","is_oa":true,"landing_page_url":"https://doi.org/10.1371/journal.pcbi.1009492","pdf_url":null,"source":{"id":"https://openalex.org/S86033158","display_name":"PLoS Computational Biology","issn_l":"1553-734X","issn":["1553-734X","1553-7358"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310315706","host_organization_name":"Public Library of Science","host_organization_lineage":["https://openalex.org/P4310315706"],"host_organization_lineage_names":["Public Library of Science"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"PLOS Computational Biology","raw_type":"journal-article"},"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G411805036","display_name":null,"funder_award_id":"R01-HG009116","funder_id":"https://openalex.org/F4320332161","funder_display_name":"National Institutes of Health"},{"id":"https://openalex.org/G4241104939","display_name":null,"funder_award_id":"R01-HG009116","funder_id":"https://openalex.org/F4320337348","funder_display_name":"National Human Genome Research Institute"},{"id":"https://openalex.org/G4969458960","display_name":null,"funder_award_id":"1764269","funder_id":"https://openalex.org/F4320309622","funder_display_name":"Harvard University"},{"id":"https://openalex.org/G8091342872","display_name":null,"funder_award_id":"1764269","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"}],"funders":[{"id":"https://openalex.org/F4320306076","display_name":"National Science Foundation","ror":"https://ror.org/021nxhr62"},{"id":"https://openalex.org/F4320309622","display_name":"Harvard University","ror":"https://ror.org/03vek6s52"},{"id":"https://openalex.org/F4320332161","display_name":"National Institutes of Health","ror":"https://ror.org/01cwqze88"},{"id":"https://openalex.org/F4320337348","display_name":"National Human Genome Research Institute","ror":"https://ror.org/00baak391"},{"id":"https://openalex.org/F4320338027","display_name":"FAS Division of Science, Harvard University","ror":null}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":26,"referenced_works":["https://openalex.org/W2045204781","https://openalex.org/W2086204534","https://openalex.org/W2121918723","https://openalex.org/W2137597742","https://openalex.org/W2138122982","https://openalex.org/W2149652607","https://openalex.org/W2150853465","https://openalex.org/W2151690061","https://openalex.org/W2158714788","https://openalex.org/W2185628700","https://openalex.org/W2401610261","https://openalex.org/W2521298622","https://openalex.org/W2898402099","https://openalex.org/W2943203634","https://openalex.org/W2975578922","https://openalex.org/W2980789587","https://openalex.org/W2997234557","https://openalex.org/W3025885307","https://openalex.org/W3090528923","https://openalex.org/W3095583226","https://openalex.org/W3107611715","https://openalex.org/W3145289798","https://openalex.org/W3146944767","https://openalex.org/W3186118520","https://openalex.org/W4213112325","https://openalex.org/W6740005241"],"related_works":["https://openalex.org/W2051969447","https://openalex.org/W2133116680","https://openalex.org/W2111937814","https://openalex.org/W2141411672","https://openalex.org/W1482324242","https://openalex.org/W187239587","https://openalex.org/W2730968108","https://openalex.org/W4386770652","https://openalex.org/W2158700816","https://openalex.org/W1525517788"],"abstract_inverted_index":{"Biological":[0],"sequence":[1,37,75,87,97],"families":[2,113],"contain":[3],"many":[4],"sequences":[5,50],"that":[6,51],"are":[7,15,52],"very":[8],"similar":[9],"to":[10,58,103],"each":[11,95],"other":[12],"because":[13,45],"they":[14],"related":[16,54],"by":[17],"evolution,":[18],"so":[19],"the":[20],"strategy":[21],"for":[22,73],"splitting":[23,74],"data":[24,76],"into":[25,77],"separate":[26],"training":[27,59,79,106],"and":[28,80,89],"test":[29,49,81,96],"sets":[30],"is":[31,43,98],"a":[32,86,91,115],"nontrivial":[33],"choice":[34],"in":[35,93],"benchmarking":[36],"analysis":[38],"methods.":[39],"A":[40],"random":[41],"split":[42,92,111],"insufficient":[44],"it":[46],"will":[47],"yield":[48],"closely":[53],"or":[55],"even":[56],"identical":[57,102],"sequences.":[60],"Adapting":[61],"ideas":[62],"from":[63],"independent":[64],"set":[65],"graph":[66],"algorithms,":[67],"we":[68],"describe":[69],"two":[70],"new":[71],"methods":[72],"dissimilar":[78],"sets.":[82],"These":[83,108],"algorithms":[84,109],"input":[85],"family":[88],"produce":[90],"which":[94],"less":[99],"than":[100,114],"p%":[101],"any":[104],"individual":[105],"sequence.":[107],"successfully":[110],"more":[112,121],"previous":[116],"approach,":[117],"enabling":[118],"construction":[119],"of":[120],"diverse":[122],"benchmark":[123],"datasets.":[124]},"counts_by_year":[{"year":2025,"cited_by_count":4},{"year":2024,"cited_by_count":6},{"year":2023,"cited_by_count":7},{"year":2022,"cited_by_count":4}],"updated_date":"2026-01-13T01:12:25.745995","created_date":"2025-10-10T00:00:00"}
