{"id":"https://openalex.org/W4400921533","doi":"https://doi.org/10.1038/s42256-024-00872-0","title":"DNA language model GROVER learns sequence context in the human genome","display_name":"DNA language model GROVER learns sequence context in the human genome","publication_year":2024,"publication_date":"2024-07-23","ids":{"openalex":"https://openalex.org/W4400921533","doi":"https://doi.org/10.1038/s42256-024-00872-0"},"language":"en","primary_location":{"id":"doi:10.1038/s42256-024-00872-0","is_oa":true,"landing_page_url":"https://doi.org/10.1038/s42256-024-00872-0","pdf_url":"https://www.nature.com/articles/s42256-024-00872-0.pdf","source":{"id":"https://openalex.org/S2912241403","display_name":"Nature Machine Intelligence","issn_l":"2522-5839","issn":["2522-5839"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319908","host_organization_name":"Nature Portfolio","host_organization_lineage":["https://openalex.org/P4310319908","https://openalex.org/P4310319965"],"host_organization_lineage_names":["Nature Portfolio","Springer Nature"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Nature Machine Intelligence","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"hybrid","oa_url":"https://www.nature.com/articles/s42256-024-00872-0.pdf","any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5021041939","display_name":"Melissa Sanabria","orcid":"https://orcid.org/0000-0003-4345-1074"},"institutions":[{"id":"https://openalex.org/I4210148197","display_name":"Center for Systems Biology Dresden","ror":"https://ror.org/05hrn3e05","country_code":"DE","type":"facility","lineage":["https://openalex.org/I149899117","https://openalex.org/I149899117","https://openalex.org/I4210111875","https://openalex.org/I4210148197","https://openalex.org/I4210159854","https://openalex.org/I78650965"]},{"id":"https://openalex.org/I78650965","display_name":"Technische Universit\u00e4t Dresden","ror":"https://ror.org/042aqky30","country_code":"DE","type":"education","lineage":["https://openalex.org/I78650965"]}],"countries":["DE"],"is_corresponding":true,"raw_author_name":"Melissa Sanabria","raw_affiliation_strings":["Biomedical Genomics, Biotechnology Center, Center for Molecular and Cellular Bioengineering, Technische Universit\u00e4t, Dresden, Germany"],"raw_orcid":"https://orcid.org/0000-0003-4345-1074","affiliations":[{"raw_affiliation_string":"Biomedical Genomics, Biotechnology Center, Center for Molecular and Cellular Bioengineering, Technische Universit\u00e4t, Dresden, Germany","institution_ids":["https://openalex.org/I4210148197","https://openalex.org/I78650965"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5057724103","display_name":"Jonas Hirsch","orcid":"https://orcid.org/0000-0003-2962-5963"},"institutions":[{"id":"https://openalex.org/I4210148197","display_name":"Center for Systems Biology Dresden","ror":"https://ror.org/05hrn3e05","country_code":"DE","type":"facility","lineage":["https://openalex.org/I149899117","https://openalex.org/I149899117","https://openalex.org/I4210111875","https://openalex.org/I4210148197","https://openalex.org/I4210159854","https://openalex.org/I78650965"]},{"id":"https://openalex.org/I78650965","display_name":"Technische Universit\u00e4t Dresden","ror":"https://ror.org/042aqky30","country_code":"DE","type":"education","lineage":["https://openalex.org/I78650965"]}],"countries":["DE"],"is_corresponding":false,"raw_author_name":"Jonas Hirsch","raw_affiliation_strings":["Biomedical Genomics, Biotechnology Center, Center for Molecular and Cellular Bioengineering, Technische Universit\u00e4t, Dresden, Germany"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Biomedical Genomics, Biotechnology Center, Center for Molecular and Cellular Bioengineering, Technische Universit\u00e4t, Dresden, Germany","institution_ids":["https://openalex.org/I4210148197","https://openalex.org/I78650965"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5009354542","display_name":"Pierre M. Joubert","orcid":"https://orcid.org/0000-0001-7018-5213"},"institutions":[{"id":"https://openalex.org/I2801798921","display_name":"Helmholtz-Zentrum Dresden-Rossendorf","ror":"https://ror.org/01zy2cs03","country_code":"DE","type":"facility","lineage":["https://openalex.org/I1305996414","https://openalex.org/I2801798921"]},{"id":"https://openalex.org/I4210133756","display_name":"Center for Advanced Systems Understanding","ror":"https://ror.org/042b69396","country_code":"DE","type":"facility","lineage":["https://openalex.org/I1305996414","https://openalex.org/I2801798921","https://openalex.org/I4210133756"]},{"id":"https://openalex.org/I4210148197","display_name":"Center for Systems Biology Dresden","ror":"https://ror.org/05hrn3e05","country_code":"DE","type":"facility","lineage":["https://openalex.org/I149899117","https://openalex.org/I149899117","https://openalex.org/I4210111875","https://openalex.org/I4210148197","https://openalex.org/I4210159854","https://openalex.org/I78650965"]},{"id":"https://openalex.org/I78650965","display_name":"Technische Universit\u00e4t Dresden","ror":"https://ror.org/042aqky30","country_code":"DE","type":"education","lineage":["https://openalex.org/I78650965"]}],"countries":["DE"],"is_corresponding":false,"raw_author_name":"Pierre M. Joubert","raw_affiliation_strings":["Biomedical Genomics, Biotechnology Center, Center for Molecular and Cellular Bioengineering, Technische Universit\u00e4t, Dresden, Germany","Center for Advanced Systems Understanding (CASUS), G\u00f6rlitz, Germany","Helmholtz Zentrum Dresden-Rossendorf (HZDR), Dresden, Germany"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Biomedical Genomics, Biotechnology Center, Center for Molecular and Cellular Bioengineering, Technische Universit\u00e4t, Dresden, Germany","institution_ids":["https://openalex.org/I4210148197","https://openalex.org/I78650965"]},{"raw_affiliation_string":"Center for Advanced Systems Understanding (CASUS), G\u00f6rlitz, Germany","institution_ids":["https://openalex.org/I4210133756"]},{"raw_affiliation_string":"Helmholtz Zentrum Dresden-Rossendorf (HZDR), Dresden, Germany","institution_ids":["https://openalex.org/I2801798921"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5029443962","display_name":"Anna R. Poetsch","orcid":"https://orcid.org/0000-0003-3056-4360"},"institutions":[{"id":"https://openalex.org/I4210111460","display_name":"National Center for Tumor Diseases","ror":"https://ror.org/01txwsw02","country_code":"DE","type":"funder","lineage":["https://openalex.org/I4210111460"]},{"id":"https://openalex.org/I4210148197","display_name":"Center for Systems Biology Dresden","ror":"https://ror.org/05hrn3e05","country_code":"DE","type":"facility","lineage":["https://openalex.org/I149899117","https://openalex.org/I149899117","https://openalex.org/I4210111875","https://openalex.org/I4210148197","https://openalex.org/I4210159854","https://openalex.org/I78650965"]},{"id":"https://openalex.org/I4387153526","display_name":"Nationales Centrum f\u00fcr Tumorerkrankungen Dresden","ror":"https://ror.org/01zy07c70","country_code":"DE","type":"healthcare","lineage":["https://openalex.org/I1305996414","https://openalex.org/I1305996414","https://openalex.org/I17937529","https://openalex.org/I2801798921","https://openalex.org/I4210162051","https://openalex.org/I4387153526","https://openalex.org/I78650965"]},{"id":"https://openalex.org/I78650965","display_name":"Technische Universit\u00e4t Dresden","ror":"https://ror.org/042aqky30","country_code":"DE","type":"education","lineage":["https://openalex.org/I78650965"]}],"countries":["DE"],"is_corresponding":false,"raw_author_name":"Anna R. Poetsch","raw_affiliation_strings":["Biomedical Genomics, Biotechnology Center, Center for Molecular and Cellular Bioengineering, Technische Universit\u00e4t, Dresden, Germany","National Center for Tumor Diseases (NCT) partner site Dresden, German Cancer Research Center (DKFZ), Dresden, Germany"],"raw_orcid":"https://orcid.org/0000-0003-3056-4360","affiliations":[{"raw_affiliation_string":"Biomedical Genomics, Biotechnology Center, Center for Molecular and Cellular Bioengineering, Technische Universit\u00e4t, Dresden, Germany","institution_ids":["https://openalex.org/I4210148197","https://openalex.org/I78650965"]},{"raw_affiliation_string":"National Center for Tumor Diseases (NCT) partner site Dresden, German Cancer Research Center (DKFZ), Dresden, Germany","institution_ids":["https://openalex.org/I4210111460","https://openalex.org/I4387153526"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5021041939"],"corresponding_institution_ids":["https://openalex.org/I4210148197","https://openalex.org/I78650965"],"apc_list":{"value":9750,"currency":"EUR","value_usd":11690},"apc_paid":{"value":9750,"currency":"EUR","value_usd":11690},"fwci":17.9438,"has_fulltext":false,"cited_by_count":85,"citation_normalized_percentile":{"value":0.99642783,"is_in_top_1_percent":true,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":99,"max":100},"biblio":{"volume":"6","issue":"8","first_page":"911","last_page":"923"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10521","display_name":"RNA and protein synthesis mechanisms","score":0.9995999932289124,"subfield":{"id":"https://openalex.org/subfields/1312","display_name":"Molecular Biology"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},"topics":[{"id":"https://openalex.org/T10521","display_name":"RNA and protein synthesis mechanisms","score":0.9995999932289124,"subfield":{"id":"https://openalex.org/subfields/1312","display_name":"Molecular Biology"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},{"id":"https://openalex.org/T10015","display_name":"Genomics and Phylogenetic Studies","score":0.9994999766349792,"subfield":{"id":"https://openalex.org/subfields/1312","display_name":"Molecular Biology"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},{"id":"https://openalex.org/T12254","display_name":"Machine Learning in Bioinformatics","score":0.9983999729156494,"subfield":{"id":"https://openalex.org/subfields/1312","display_name":"Molecular Biology"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/genome","display_name":"Genome","score":0.6506298780441284},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6484739780426025},{"id":"https://openalex.org/keywords/encode","display_name":"ENCODE","score":0.6030176281929016},{"id":"https://openalex.org/keywords/context","display_name":"Context (archaeology)","score":0.571023166179657},{"id":"https://openalex.org/keywords/sequence","display_name":"Sequence (biology)","score":0.48473355174064636},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.43340784311294556},{"id":"https://openalex.org/keywords/vocabulary","display_name":"Vocabulary","score":0.42623016238212585},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.4216078817844391},{"id":"https://openalex.org/keywords/annotation","display_name":"Annotation","score":0.41775181889533997},{"id":"https://openalex.org/keywords/computational-biology","display_name":"Computational biology","score":0.36381441354751587},{"id":"https://openalex.org/keywords/genetics","display_name":"Genetics","score":0.27506566047668457},{"id":"https://openalex.org/keywords/biology","display_name":"Biology","score":0.2608479857444763},{"id":"https://openalex.org/keywords/gene","display_name":"Gene","score":0.20124787092208862},{"id":"https://openalex.org/keywords/linguistics","display_name":"Linguistics","score":0.13571414351463318}],"concepts":[{"id":"https://openalex.org/C141231307","wikidata":"https://www.wikidata.org/wiki/Q7020","display_name":"Genome","level":3,"score":0.6506298780441284},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6484739780426025},{"id":"https://openalex.org/C66746571","wikidata":"https://www.wikidata.org/wiki/Q1134833","display_name":"ENCODE","level":3,"score":0.6030176281929016},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.571023166179657},{"id":"https://openalex.org/C2778112365","wikidata":"https://www.wikidata.org/wiki/Q3511065","display_name":"Sequence (biology)","level":2,"score":0.48473355174064636},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.43340784311294556},{"id":"https://openalex.org/C2777601683","wikidata":"https://www.wikidata.org/wiki/Q6499736","display_name":"Vocabulary","level":2,"score":0.42623016238212585},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.4216078817844391},{"id":"https://openalex.org/C2776321320","wikidata":"https://www.wikidata.org/wiki/Q857525","display_name":"Annotation","level":2,"score":0.41775181889533997},{"id":"https://openalex.org/C70721500","wikidata":"https://www.wikidata.org/wiki/Q177005","display_name":"Computational biology","level":1,"score":0.36381441354751587},{"id":"https://openalex.org/C54355233","wikidata":"https://www.wikidata.org/wiki/Q7162","display_name":"Genetics","level":1,"score":0.27506566047668457},{"id":"https://openalex.org/C86803240","wikidata":"https://www.wikidata.org/wiki/Q420","display_name":"Biology","level":0,"score":0.2608479857444763},{"id":"https://openalex.org/C104317684","wikidata":"https://www.wikidata.org/wiki/Q7187","display_name":"Gene","level":2,"score":0.20124787092208862},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.13571414351463318},{"id":"https://openalex.org/C151730666","wikidata":"https://www.wikidata.org/wiki/Q7205","display_name":"Paleontology","level":1,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1038/s42256-024-00872-0","is_oa":true,"landing_page_url":"https://doi.org/10.1038/s42256-024-00872-0","pdf_url":"https://www.nature.com/articles/s42256-024-00872-0.pdf","source":{"id":"https://openalex.org/S2912241403","display_name":"Nature Machine Intelligence","issn_l":"2522-5839","issn":["2522-5839"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319908","host_organization_name":"Nature Portfolio","host_organization_lineage":["https://openalex.org/P4310319908","https://openalex.org/P4310319965"],"host_organization_lineage_names":["Nature Portfolio","Springer Nature"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Nature Machine Intelligence","raw_type":"journal-article"}],"best_oa_location":{"id":"doi:10.1038/s42256-024-00872-0","is_oa":true,"landing_page_url":"https://doi.org/10.1038/s42256-024-00872-0","pdf_url":"https://www.nature.com/articles/s42256-024-00872-0.pdf","source":{"id":"https://openalex.org/S2912241403","display_name":"Nature Machine Intelligence","issn_l":"2522-5839","issn":["2522-5839"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319908","host_organization_name":"Nature Portfolio","host_organization_lineage":["https://openalex.org/P4310319908","https://openalex.org/P4310319965"],"host_organization_lineage_names":["Nature Portfolio","Springer Nature"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Nature Machine Intelligence","raw_type":"journal-article"},"sustainable_development_goals":[{"score":0.8199999928474426,"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4400921533.pdf"},"referenced_works_count":27,"referenced_works":["https://openalex.org/W96925928","https://openalex.org/W2000026908","https://openalex.org/W2008365039","https://openalex.org/W2036300762","https://openalex.org/W2093896423","https://openalex.org/W2107745473","https://openalex.org/W2164944827","https://openalex.org/W2168909179","https://openalex.org/W2259938310","https://openalex.org/W2896457183","https://openalex.org/W2928206577","https://openalex.org/W2962784628","https://openalex.org/W2988217457","https://openalex.org/W3018603462","https://openalex.org/W3127238141","https://openalex.org/W3203588026","https://openalex.org/W4280594315","https://openalex.org/W4316339774","https://openalex.org/W4382490702","https://openalex.org/W4382603228","https://openalex.org/W4384206353","https://openalex.org/W4393454277","https://openalex.org/W6739901393","https://openalex.org/W6778883912","https://openalex.org/W6912911859","https://openalex.org/W6931584888","https://openalex.org/W6950584518"],"related_works":["https://openalex.org/W2468279273","https://openalex.org/W2361861616","https://openalex.org/W2354198838","https://openalex.org/W1989130879","https://openalex.org/W2263699433","https://openalex.org/W2103419012","https://openalex.org/W2377979023","https://openalex.org/W2218034408","https://openalex.org/W2392921965","https://openalex.org/W2358755282"],"abstract_inverted_index":{"Abstract":[0],"Deep-learning":[1],"models":[2],"that":[3,96,168],"learn":[4],"a":[5,13,37,51,68,198,213],"sense":[6,199],"of":[7,16,36,39,78,135,147,155,161,182,219],"language":[8,29,53,203],"on":[9,18,45],"DNA":[10],"have":[11],"achieved":[12],"high":[14],"level":[15],"performance":[17],"genome":[19,48,83,178,183],"biological":[20],"tasks.":[21],"Genome":[22],"sequences":[23],"follow":[24],"rules":[25],"similar":[26],"to":[27,104,139,211],"natural":[28],"but":[30],"are":[31,112],"distinct":[32],"in":[33,80,115],"the":[34,46,64,81,86,118,123,152,159,166,217],"absence":[35],"concept":[38],"words.":[40],"We":[41],"established":[42],"byte-pair":[43],"encoding":[44],"human":[47,82],"and":[49,108,129,143,186,202],"trained":[50,97,133],"foundation":[52],"model":[54],"called":[55],"GROVER":[56,125,189,194],"(Genome":[57],"Rules":[58],"Obtained":[59],"Via":[60],"Extracted":[61],"Representations)":[62],"with":[63,180],"vocabulary":[65],"selected":[66],"via":[67],"custom":[69],"task,":[70],"next-":[71],"k":[72],"-mer":[73],"prediction.":[74],"The":[75],"defined":[76],"dictionary":[77],"tokens":[79,111],"carries":[84],"best":[85],"information":[87,102,162],"content":[88,107,163],"for":[89,200,216],"GROVER.":[90,173],"Analysing":[91],"learned":[92],"representations,":[93],"we":[94],"observed":[95],"token":[98],"embeddings":[99,134],"primarily":[100,113],"encode":[101],"related":[103],"frequency,":[105],"sequence":[106,167,196],"length.":[109],"Some":[110],"localized":[114],"repeats,":[116],"whereas":[117],"majority":[119],"widely":[120],"distribute":[121],"over":[122],"genome.":[124],"also":[126],"learns":[127,195],"context":[128],"lexical":[130],"ambiguity.":[131],"Average":[132],"genomic":[136],"regions":[137],"relate":[138],"functional":[140],"genomics":[141],"annotation":[142],"thus":[144],"indicate":[145],"learning":[146],"these":[148],"structures":[149],"purely":[150],"from":[151],"contextual":[153],"relationships":[154],"tokens.":[156],"This":[157],"highlights":[158],"extent":[160],"encoded":[164],"by":[165,172],"can":[169,208],"be":[170,209],"grasped":[171],"On":[174],"fine-tuning":[175],"tasks":[176],"addressing":[177],"biology":[179],"questions":[181],"element":[184],"identification":[185],"protein\u2013DNA":[187],"binding,":[188],"exceeds":[190],"other":[191],"models\u2019":[192],"performance.":[193],"context,":[197],"structure":[201],"rules.":[204],"Extracting":[205],"this":[206],"knowledge":[207],"used":[210],"compose":[212],"grammar":[214],"book":[215],"code":[218],"life.":[220]},"counts_by_year":[{"year":2026,"cited_by_count":22},{"year":2025,"cited_by_count":54},{"year":2024,"cited_by_count":9}],"updated_date":"2026-06-05T09:01:59.212387","created_date":"2025-10-10T00:00:00"}
