{"id":"https://openalex.org/W4402100089","doi":"https://doi.org/10.1093/bioinformatics/btae529","title":"Are genomic language models all you need? Exploring genomic language models on protein downstream tasks","display_name":"Are genomic language models all you need? Exploring genomic language models on protein downstream tasks","publication_year":2024,"publication_date":"2024-08-30","ids":{"openalex":"https://openalex.org/W4402100089","doi":"https://doi.org/10.1093/bioinformatics/btae529","pmid":"https://pubmed.ncbi.nlm.nih.gov/39212609"},"language":"en","primary_location":{"id":"doi:10.1093/bioinformatics/btae529","is_oa":true,"landing_page_url":"https://doi.org/10.1093/bioinformatics/btae529","pdf_url":"https://academic.oup.com/bioinformatics/advance-article-pdf/doi/10.1093/bioinformatics/btae529/58972224/btae529.pdf","source":{"id":"https://openalex.org/S52395412","display_name":"Bioinformatics","issn_l":"1367-4803","issn":["1367-4803","1367-4811"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310311648","host_organization_name":"Oxford University Press","host_organization_lineage":["https://openalex.org/P4310311648","https://openalex.org/P4310311647"],"host_organization_lineage_names":["Oxford University Press","University of Oxford"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Bioinformatics","raw_type":"journal-article"},"type":"article","indexed_in":["crossref","doaj","pubmed"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://academic.oup.com/bioinformatics/advance-article-pdf/doi/10.1093/bioinformatics/btae529/58972224/btae529.pdf","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5098751393","display_name":"Sam Boshar","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sam Boshar","raw_affiliation_strings":["InstaDeep , Cambridge, MA 02142, United States"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"InstaDeep , Cambridge, MA 02142, United States","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5023952414","display_name":"Evan Trop","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Evan Trop","raw_affiliation_strings":["InstaDeep , Cambridge, MA 02142, United States"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"InstaDeep , Cambridge, MA 02142, United States","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5084912778","display_name":"Bernardo P. de Almeida","orcid":"https://orcid.org/0000-0002-6084-6775"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bernardo P de Almeida","raw_affiliation_strings":["InstaDeep , Paris 75010, France"],"raw_orcid":"https://orcid.org/0000-0002-6084-6775","affiliations":[{"raw_affiliation_string":"InstaDeep , Paris 75010, France","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5003905510","display_name":"Liviu Copoiu","orcid":"https://orcid.org/0000-0001-9329-114X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liviu Copoiu","raw_affiliation_strings":["InstaDeep , London W2 1AY, United Kingdom"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"InstaDeep , London W2 1AY, United Kingdom","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5015141560","display_name":"Thomas Pierrot","orcid":"https://orcid.org/0000-0002-5227-6194"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Thomas Pierrot","raw_affiliation_strings":["InstaDeep , Cambridge, MA 02142, United States"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"InstaDeep , Cambridge, MA 02142, United States","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5015141560"],"corresponding_institution_ids":[],"apc_list":{"value":3618,"currency":"USD","value_usd":3618},"apc_paid":{"value":3618,"currency":"USD","value_usd":3618},"fwci":2.8221,"has_fulltext":true,"cited_by_count":14,"citation_normalized_percentile":{"value":0.91221937,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":90,"max":99},"biblio":{"volume":"40","issue":"9","first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12254","display_name":"Machine Learning in Bioinformatics","score":0.6013000011444092,"subfield":{"id":"https://openalex.org/subfields/1312","display_name":"Molecular Biology"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},"topics":[{"id":"https://openalex.org/T12254","display_name":"Machine Learning in Bioinformatics","score":0.6013000011444092,"subfield":{"id":"https://openalex.org/subfields/1312","display_name":"Molecular Biology"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},{"id":"https://openalex.org/T10521","display_name":"RNA and protein synthesis mechanisms","score":0.07490000128746033,"subfield":{"id":"https://openalex.org/subfields/1312","display_name":"Molecular Biology"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},{"id":"https://openalex.org/T11642","display_name":"Genomics and Rare Diseases","score":0.04859999939799309,"subfield":{"id":"https://openalex.org/subfields/1311","display_name":"Genetics"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7438284158706665},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.6955095529556274},{"id":"https://openalex.org/keywords/genomics","display_name":"Genomics","score":0.5528078079223633},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4589482545852661},{"id":"https://openalex.org/keywords/proteomics","display_name":"Proteomics","score":0.45829448103904724},{"id":"https://openalex.org/keywords/computational-biology","display_name":"Computational biology","score":0.4091053009033203},{"id":"https://openalex.org/keywords/machine-learning","display_name":"Machine learning","score":0.37122344970703125},{"id":"https://openalex.org/keywords/genome","display_name":"Genome","score":0.30689793825149536},{"id":"https://openalex.org/keywords/biology","display_name":"Biology","score":0.19563117623329163},{"id":"https://openalex.org/keywords/genetics","display_name":"Genetics","score":0.09707382321357727},{"id":"https://openalex.org/keywords/gene","display_name":"Gene","score":0.09472134709358215}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7438284158706665},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.6955095529556274},{"id":"https://openalex.org/C189206191","wikidata":"https://www.wikidata.org/wiki/Q222046","display_name":"Genomics","level":4,"score":0.5528078079223633},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4589482545852661},{"id":"https://openalex.org/C46111723","wikidata":"https://www.wikidata.org/wiki/Q471857","display_name":"Proteomics","level":3,"score":0.45829448103904724},{"id":"https://openalex.org/C70721500","wikidata":"https://www.wikidata.org/wiki/Q177005","display_name":"Computational biology","level":1,"score":0.4091053009033203},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.37122344970703125},{"id":"https://openalex.org/C141231307","wikidata":"https://www.wikidata.org/wiki/Q7020","display_name":"Genome","level":3,"score":0.30689793825149536},{"id":"https://openalex.org/C86803240","wikidata":"https://www.wikidata.org/wiki/Q420","display_name":"Biology","level":0,"score":0.19563117623329163},{"id":"https://openalex.org/C54355233","wikidata":"https://www.wikidata.org/wiki/Q7162","display_name":"Genetics","level":1,"score":0.09707382321357727},{"id":"https://openalex.org/C104317684","wikidata":"https://www.wikidata.org/wiki/Q7187","display_name":"Gene","level":2,"score":0.09472134709358215}],"mesh":[{"descriptor_ui":"D006801","descriptor_name":"Humans","qualifier_ui":null,"qualifier_name":null,"is_major_topic":false},{"descriptor_ui":"D006801","descriptor_name":"Humans","qualifier_ui":null,"qualifier_name":null,"is_major_topic":false},{"descriptor_ui":"D006801","descriptor_name":"Humans","qualifier_ui":null,"qualifier_name":null,"is_major_topic":false},{"descriptor_ui":"D006801","descriptor_name":"Humans","qualifier_ui":null,"qualifier_name":null,"is_major_topic":false},{"descriptor_ui":"D006801","descriptor_name":"Humans","qualifier_ui":null,"qualifier_name":null,"is_major_topic":false},{"descriptor_ui":"D011506","descriptor_name":"Proteins","qualifier_ui":"Q000378","qualifier_name":"metabolism","is_major_topic":false},{"descriptor_ui":"D011506","descriptor_name":"Proteins","qualifier_ui":"Q000378","qualifier_name":"metabolism","is_major_topic":false},{"descriptor_ui":"D011506","descriptor_name":"Proteins","qualifier_ui":"Q000378","qualifier_name":"metabolism","is_major_topic":false},{"descriptor_ui":"D011506","descriptor_name":"Proteins","qualifier_ui":"Q000378","qualifier_name":"metabolism","is_major_topic":false},{"descriptor_ui":"D011506","descriptor_name":"Proteins","qualifier_ui":"Q000378","qualifier_name":"metabolism","is_major_topic":false},{"descriptor_ui":"D011506","descriptor_name":"Proteins","qualifier_ui":"Q000737","qualifier_name":"chemistry","is_major_topic":false},{"descriptor_ui":"D011506","descriptor_name":"Proteins","qualifier_ui":"Q000737","qualifier_name":"chemistry","is_major_topic":false},{"descriptor_ui":"D011506","descriptor_name":"Proteins","qualifier_ui":"Q000737","qualifier_name":"chemistry","is_major_topic":false},{"descriptor_ui":"D011506","descriptor_name":"Proteins","qualifier_ui":"Q000737","qualifier_name":"chemistry","is_major_topic":false},{"descriptor_ui":"D011506","descriptor_name":"Proteins","qualifier_ui":"Q000737","qualifier_name":"chemistry","is_major_topic":false},{"descriptor_ui":"D023281","descriptor_name":"Genomics","qualifier_ui":"Q000379","qualifier_name":"methods","is_major_topic":true},{"descriptor_ui":"D023281","descriptor_name":"Genomics","qualifier_ui":"Q000379","qualifier_name":"methods","is_major_topic":true},{"descriptor_ui":"D023281","descriptor_name":"Genomics","qualifier_ui":"Q000379","qualifier_name":"methods","is_major_topic":true},{"descriptor_ui":"D023281","descriptor_name":"Genomics","qualifier_ui":"Q000379","qualifier_name":"methods","is_major_topic":true},{"descriptor_ui":"D023281","descriptor_name":"Genomics","qualifier_ui":"Q000379","qualifier_name":"methods","is_major_topic":true},{"descriptor_ui":"D040901","descriptor_name":"Proteomics","qualifier_ui":"Q000379","qualifier_name":"methods","is_major_topic":true},{"descriptor_ui":"D040901","descriptor_name":"Proteomics","qualifier_ui":"Q000379","qualifier_name":"methods","is_major_topic":true},{"descriptor_ui":"D040901","descriptor_name":"Proteomics","qualifier_ui":"Q000379","qualifier_name":"methods","is_major_topic":true},{"descriptor_ui":"D040901","descriptor_name":"Proteomics","qualifier_ui":"Q000379","qualifier_name":"methods","is_major_topic":true},{"descriptor_ui":"D040901","descriptor_name":"Proteomics","qualifier_ui":"Q000379","qualifier_name":"methods","is_major_topic":true}],"locations_count":3,"locations":[{"id":"doi:10.1093/bioinformatics/btae529","is_oa":true,"landing_page_url":"https://doi.org/10.1093/bioinformatics/btae529","pdf_url":"https://academic.oup.com/bioinformatics/advance-article-pdf/doi/10.1093/bioinformatics/btae529/58972224/btae529.pdf","source":{"id":"https://openalex.org/S52395412","display_name":"Bioinformatics","issn_l":"1367-4803","issn":["1367-4803","1367-4811"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310311648","host_organization_name":"Oxford University Press","host_organization_lineage":["https://openalex.org/P4310311648","https://openalex.org/P4310311647"],"host_organization_lineage_names":["Oxford University Press","University of Oxford"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Bioinformatics","raw_type":"journal-article"},{"id":"pmid:39212609","is_oa":false,"landing_page_url":"https://pubmed.ncbi.nlm.nih.gov/39212609","pdf_url":null,"source":{"id":"https://openalex.org/S4306525036","display_name":"PubMed","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I1299303238","host_organization_name":"National Institutes of Health","host_organization_lineage":["https://openalex.org/I1299303238"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Bioinformatics (Oxford, England)","raw_type":null},{"id":"pmh:oai:pubmedcentral.nih.gov:11399231","is_oa":true,"landing_page_url":"https://www.ncbi.nlm.nih.gov/pmc/articles/11399231","pdf_url":"https://pmc.ncbi.nlm.nih.gov/articles/PMC11399231/pdf/btae529.pdf","source":{"id":"https://openalex.org/S2764455111","display_name":"PubMed Central","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I1299303238","host_organization_name":"National Institutes of Health","host_organization_lineage":["https://openalex.org/I1299303238"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"Bioinformatics","raw_type":"Text"}],"best_oa_location":{"id":"doi:10.1093/bioinformatics/btae529","is_oa":true,"landing_page_url":"https://doi.org/10.1093/bioinformatics/btae529","pdf_url":"https://academic.oup.com/bioinformatics/advance-article-pdf/doi/10.1093/bioinformatics/btae529/58972224/btae529.pdf","source":{"id":"https://openalex.org/S52395412","display_name":"Bioinformatics","issn_l":"1367-4803","issn":["1367-4803","1367-4811"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310311648","host_organization_name":"Oxford University Press","host_organization_lineage":["https://openalex.org/P4310311648","https://openalex.org/P4310311647"],"host_organization_lineage_names":["Oxford University Press","University of Oxford"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Bioinformatics","raw_type":"journal-article"},"sustainable_development_goals":[{"display_name":"Quality Education","score":0.5600000023841858,"id":"https://metadata.un.org/sdg/4"}],"awards":[],"funders":[],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4402100089.pdf","grobid_xml":"https://content.openalex.org/works/W4402100089.grobid-xml"},"referenced_works_count":29,"referenced_works":["https://openalex.org/W2007807048","https://openalex.org/W2068628419","https://openalex.org/W2104972430","https://openalex.org/W2127435852","https://openalex.org/W2130479394","https://openalex.org/W2144893575","https://openalex.org/W2379594833","https://openalex.org/W2735621019","https://openalex.org/W2769882797","https://openalex.org/W2941112903","https://openalex.org/W2950374603","https://openalex.org/W3015921770","https://openalex.org/W3083874241","https://openalex.org/W3127238141","https://openalex.org/W3135682489","https://openalex.org/W3146944767","https://openalex.org/W3174395937","https://openalex.org/W3177500196","https://openalex.org/W3177828909","https://openalex.org/W3203588026","https://openalex.org/W3217031179","https://openalex.org/W4230640326","https://openalex.org/W4281993476","https://openalex.org/W4309506674","https://openalex.org/W4327550249","https://openalex.org/W4392095606","https://openalex.org/W6731867162","https://openalex.org/W6769627184","https://openalex.org/W6837789219"],"related_works":["https://openalex.org/W1488408180","https://openalex.org/W2360290178","https://openalex.org/W2035260723","https://openalex.org/W3005464483","https://openalex.org/W415785311","https://openalex.org/W2055243143","https://openalex.org/W3145527116","https://openalex.org/W2184120357","https://openalex.org/W2126088744","https://openalex.org/W2366576702"],"abstract_inverted_index":{"MOTIVATION:":[0],"Large":[1],"language":[2,30,97],"models,":[3],"trained":[4,171],"on":[5,54,112,187,193],"enormous":[6],"corpora":[7],"of":[8,52,93,155,198,214,220],"biological":[9],"sequences,":[10,44],"are":[11,104],"state-of-the-art":[12],"for":[13],"downstream":[14,38,167],"genomic":[15,29,162],"and":[16,86,95,106,210,223,228,242],"proteomic":[17,96],"tasks.":[18,114,195],"Since":[19],"the":[20,23,34,50,66,91,121,203,212,215,218],"genome":[21],"contains":[22],"information":[24],"to":[25,36,60,89,125,165,200,205,226],"encode":[26],"all":[27],"proteins,":[28],"models":[31,98],"(gLMs)":[32],"hold":[33],"potential":[35,204],"make":[37,234],"predictions":[39],"not":[40],"only":[41],"about":[42,47],"DNA":[43,68],"but":[45,145],"also":[46],"proteins.":[48],"However,":[49],"performance":[51,92,117,192],"gLMs":[53,94,103,199],"protein":[55,168,188],"tasks":[56,62,189],"remains":[57],"unknown,":[58],"due":[59],"few":[61],"pairing":[63],"proteins":[64],"with":[65,179],"coding":[67],"sequences":[69],"(CDS)":[70],"that":[71,102,130,141,182],"can":[72],"be":[73],"processed":[74],"by":[75],"gLMs.":[76],"RESULTS:":[77],"In":[78],"this":[79],"work,":[80],"we":[81,150,159],"curated":[82],"five":[83],"such":[84],"datasets":[85,243],"used":[87],"them":[88],"evaluate":[90],"(pLMs).":[99],"We":[100,128,170,233],"show":[101],"competitive":[105],"even":[107],"outperform":[108],"their":[109,156],"pLMs":[110],"counterparts":[111],"some":[113],"The":[115,196],"best":[116],"was":[118],"achieved":[119],"using":[120],"retrieved":[122],"CDS":[123,208],"compared":[124],"sampling":[126],"strategies.":[127],"found":[129],"training":[131],"a":[132,172,221],"joint":[133],"genomic-proteomic":[134],"model":[135,153,178,240],"outperforms":[136,183],"each":[137],"individual":[138],"approach,":[139],"showing":[140],"they":[142],"capture":[143],"different":[144,161],"complementary":[146],"sequence":[147],"representations,":[148],"as":[149],"demonstrate":[151],"through":[152],"interpretation":[154],"embeddings.":[157],"Lastly,":[158],"explored":[160],"tokenization":[163,181],"schemes":[164],"improve":[166],"performance.":[169],"new":[173],"Nucleotide":[174],"Transformer":[175],"(50M)":[176],"foundation":[177],"3mer":[180,238],"its":[184],"6mer":[185],"counterpart":[186],"while":[190],"maintaining":[191],"genomics":[194,227],"application":[197],"proteomics":[201],"offers":[202],"leverage":[206],"rich":[207],"data,":[209],"in":[211],"spirit":[213],"central":[216],"dogma,":[217],"possibility":[219],"unified":[222],"synergistic":[224],"approach":[225],"proteomics.":[229],"AVAILABILITY":[230],"AND":[231],"IMPLEMENTATION:":[232],"our":[235],"inference":[236],"code,":[237],"pre-trained":[239],"weights":[241],"available.":[244]},"counts_by_year":[{"year":2026,"cited_by_count":3},{"year":2025,"cited_by_count":10},{"year":2024,"cited_by_count":1}],"updated_date":"2026-06-16T07:32:37.131356","created_date":"2025-10-10T00:00:00"}
