{"id":"https://openalex.org/W4410589755","doi":"https://doi.org/10.1007/s44163-025-00304-x","title":"Protein sequence classification using natural language processing techniques","display_name":"Protein sequence classification using natural language processing techniques","publication_year":2025,"publication_date":"2025-05-22","ids":{"openalex":"https://openalex.org/W4410589755","doi":"https://doi.org/10.1007/s44163-025-00304-x"},"language":"en","primary_location":{"id":"doi:10.1007/s44163-025-00304-x","is_oa":true,"landing_page_url":"https://doi.org/10.1007/s44163-025-00304-x","pdf_url":"https://link.springer.com/content/pdf/10.1007/s44163-025-00304-x.pdf","source":{"id":"https://openalex.org/S4210220416","display_name":"Discover Artificial Intelligence","issn_l":"2731-0809","issn":["2731-0809"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310319965","host_organization_name":"Springer Nature","host_organization_lineage":["https://openalex.org/P4310319965"],"host_organization_lineage_names":["Springer Nature"],"type":"journal"},"license":"cc-by-nc-nd","license_id":"https://openalex.org/licenses/cc-by-nc-nd","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Discover Artificial Intelligence","raw_type":"journal-article"},"type":"article","indexed_in":["crossref","doaj"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://link.springer.com/content/pdf/10.1007/s44163-025-00304-x.pdf","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5106675346","display_name":"Huma Perveen","orcid":null},"institutions":[{"id":"https://openalex.org/I162608824","display_name":"University of Sussex","ror":"https://ror.org/00ayhx656","country_code":"GB","type":"education","lineage":["https://openalex.org/I162608824"]}],"countries":["GB"],"is_corresponding":true,"raw_author_name":"Huma Perveen","raw_affiliation_strings":["School of Mathematical and Physical Sciences, University of Sussex, Brighton, UK"],"affiliations":[{"raw_affiliation_string":"School of Mathematical and Physical Sciences, University of Sussex, Brighton, UK","institution_ids":["https://openalex.org/I162608824"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5016270907","display_name":"Julie Weeds","orcid":"https://orcid.org/0000-0002-3831-4019"},"institutions":[{"id":"https://openalex.org/I162608824","display_name":"University of Sussex","ror":"https://ror.org/00ayhx656","country_code":"GB","type":"education","lineage":["https://openalex.org/I162608824"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Julie Weeds","raw_affiliation_strings":["School of Engineering and Informatics, University of Sussex, Brighton, UK"],"affiliations":[{"raw_affiliation_string":"School of Engineering and Informatics, University of Sussex, Brighton, UK","institution_ids":["https://openalex.org/I162608824"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":2,"corresponding_author_ids":["https://openalex.org/A5106675346"],"corresponding_institution_ids":["https://openalex.org/I162608824"],"apc_list":{"value":990,"currency":"EUR","value_usd":1067},"apc_paid":{"value":990,"currency":"EUR","value_usd":1067},"fwci":1.5162,"has_fulltext":true,"cited_by_count":2,"citation_normalized_percentile":{"value":0.82001436,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":95,"max":96},"biblio":{"volume":"5","issue":"1","first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12254","display_name":"Machine Learning in Bioinformatics","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1312","display_name":"Molecular Biology"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},"topics":[{"id":"https://openalex.org/T12254","display_name":"Machine Learning in Bioinformatics","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1312","display_name":"Molecular Biology"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},{"id":"https://openalex.org/T10015","display_name":"Genomics and Phylogenetic Studies","score":0.9909999966621399,"subfield":{"id":"https://openalex.org/subfields/1312","display_name":"Molecular Biology"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},{"id":"https://openalex.org/T10521","display_name":"RNA and protein synthesis mechanisms","score":0.9819999933242798,"subfield":{"id":"https://openalex.org/subfields/1312","display_name":"Molecular Biology"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/sequence","display_name":"Sequence (biology)","score":0.5775091052055359},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.5638461112976074},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.5525522232055664},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4492175579071045},{"id":"https://openalex.org/keywords/biology","display_name":"Biology","score":0.13976013660430908},{"id":"https://openalex.org/keywords/genetics","display_name":"Genetics","score":0.05337300896644592}],"concepts":[{"id":"https://openalex.org/C2778112365","wikidata":"https://www.wikidata.org/wiki/Q3511065","display_name":"Sequence (biology)","level":2,"score":0.5775091052055359},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5638461112976074},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.5525522232055664},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4492175579071045},{"id":"https://openalex.org/C86803240","wikidata":"https://www.wikidata.org/wiki/Q420","display_name":"Biology","level":0,"score":0.13976013660430908},{"id":"https://openalex.org/C54355233","wikidata":"https://www.wikidata.org/wiki/Q7162","display_name":"Genetics","level":1,"score":0.05337300896644592}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1007/s44163-025-00304-x","is_oa":true,"landing_page_url":"https://doi.org/10.1007/s44163-025-00304-x","pdf_url":"https://link.springer.com/content/pdf/10.1007/s44163-025-00304-x.pdf","source":{"id":"https://openalex.org/S4210220416","display_name":"Discover Artificial Intelligence","issn_l":"2731-0809","issn":["2731-0809"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310319965","host_organization_name":"Springer Nature","host_organization_lineage":["https://openalex.org/P4310319965"],"host_organization_lineage_names":["Springer Nature"],"type":"journal"},"license":"cc-by-nc-nd","license_id":"https://openalex.org/licenses/cc-by-nc-nd","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Discover Artificial Intelligence","raw_type":"journal-article"},{"id":"pmh:oai:doaj.org/article:893722680ac5435584be8e314ffe6e81","is_oa":true,"landing_page_url":"https://doaj.org/article/893722680ac5435584be8e314ffe6e81","pdf_url":null,"source":{"id":"https://openalex.org/S112646816","display_name":"SHILAP Revista de lepidopterolog\u00eda","issn_l":"0300-5267","issn":["0300-5267","2340-4078"],"is_oa":true,"is_in_doaj":true,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"Discover Artificial Intelligence, Vol 5, Iss 1, Pp 1-25 (2025)","raw_type":"article"}],"best_oa_location":{"id":"doi:10.1007/s44163-025-00304-x","is_oa":true,"landing_page_url":"https://doi.org/10.1007/s44163-025-00304-x","pdf_url":"https://link.springer.com/content/pdf/10.1007/s44163-025-00304-x.pdf","source":{"id":"https://openalex.org/S4210220416","display_name":"Discover Artificial Intelligence","issn_l":"2731-0809","issn":["2731-0809"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310319965","host_organization_name":"Springer Nature","host_organization_lineage":["https://openalex.org/P4310319965"],"host_organization_lineage_names":["Springer Nature"],"type":"journal"},"license":"cc-by-nc-nd","license_id":"https://openalex.org/licenses/cc-by-nc-nd","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Discover Artificial Intelligence","raw_type":"journal-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4410589755.pdf","grobid_xml":"https://content.openalex.org/works/W4410589755.grobid-xml"},"referenced_works_count":38,"referenced_works":["https://openalex.org/W1965827393","https://openalex.org/W1981712987","https://openalex.org/W2010870240","https://openalex.org/W2016966401","https://openalex.org/W2026480276","https://openalex.org/W2045993561","https://openalex.org/W2056670722","https://openalex.org/W2070071798","https://openalex.org/W2087070363","https://openalex.org/W2095131443","https://openalex.org/W2111072639","https://openalex.org/W2127154291","https://openalex.org/W2134445963","https://openalex.org/W2140767933","https://openalex.org/W2256578114","https://openalex.org/W2283968319","https://openalex.org/W2507669447","https://openalex.org/W2752689404","https://openalex.org/W2781086737","https://openalex.org/W2807818025","https://openalex.org/W2891465508","https://openalex.org/W2943203634","https://openalex.org/W3005869346","https://openalex.org/W3011262577","https://openalex.org/W3087252502","https://openalex.org/W3102846393","https://openalex.org/W3146944767","https://openalex.org/W3177500196","https://openalex.org/W4205773061","https://openalex.org/W4379876549","https://openalex.org/W4386083759","https://openalex.org/W4387729411","https://openalex.org/W4392906754","https://openalex.org/W4394853710","https://openalex.org/W4401455507","https://openalex.org/W4402665960","https://openalex.org/W4403548533","https://openalex.org/W6763868836"],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2899084033","https://openalex.org/W2748952813","https://openalex.org/W2390279801","https://openalex.org/W4391913857","https://openalex.org/W2358668433","https://openalex.org/W4396701345","https://openalex.org/W2376932109","https://openalex.org/W2001405890","https://openalex.org/W3204019825"],"abstract_inverted_index":{"This":[0],"study":[1,52],"aimed":[2],"to":[3,223],"enhance":[4],"protein":[5,187],"sequence":[6,19,101,166,194],"classification":[7,169],"using":[8,95,159],"natural":[9],"language":[10],"processing":[11],"(NLP)":[12],"techniques":[13],"while":[14,134],"addressing":[15],"the":[16,116,163,203],"impact":[17,164],"of":[18,165,205],"similarity":[20,167,195],"on":[21,106,168],"model":[22],"performance.":[23,170,201],"We":[24],"compared":[25],"various":[26],"machine":[27],"learning":[28,31],"and":[29,40,74,85,90,100,126,143,180,193,221],"deep":[30],"models":[32,54,87,156,182],"under":[33,131],"two":[34],"different":[35,96],"data-splitting":[36],"strategies:":[37],"random":[38,132],"splitting":[39,208],"ECOD":[41,212],"family-based":[42,213],"splitting,":[43,133,161,214],"which":[44],"ensures":[45],"evolutionary-related":[46],"sequences":[47],"are":[48],"grouped":[49],"together.":[50],"The":[51,112],"evaluated":[53],"such":[55,210],"as":[56,211],"K-Nearest":[57],"Neighbors":[58],"(KNN),":[59],"Multinomial":[60],"Na\u00efve":[61],"Bayes,":[62],"Logistic":[63],"Regression,":[64],"Multi-Layer":[65],"Perceptron":[66],"(MLP),":[67],"Decision":[68],"Tree,":[69],"Random":[70],"Forest,":[71],"XGBoost,":[72],"Voting":[73,113,178],"Stacking":[75],"classifiers,":[76,179],"Convolutional":[77],"Neural":[78],"Network":[79],"(CNN),":[80],"Long":[81],"Short-Term":[82],"Memory":[83],"(LSTM),":[84],"transformer":[86,149,181],"(BertForSequenceClassification,":[88],"DistilBERT,":[89],"ProtBert).":[91],"Performance":[92],"was":[93],"tested":[94,158],"amino":[97],"acid":[98],"ranges":[99],"lengths":[102],"with":[103,119,189],"a":[104],"focus":[105],"generalization":[107,222],"across":[108,154],"unseen":[109,224],"evolutionary":[110,225],"families.":[111,226],"classifier":[114],"achieved":[115],"highest":[117],"performance":[118,152,219],"74%":[120,122],"accuracy,":[121,138],"weighted":[123,140],"F1":[124,129,141,146],"score,":[125,142],"65%":[127],"macro":[128,145],"score":[130,147],"ProtBERT":[135],"obtained":[136],"77%":[137],"76%":[139],"61%":[144],"among":[148],"models.":[150],"However,":[151,202],"declined":[153],"all":[155],"when":[157],"ECOD-based":[160],"revealing":[162],"Advanced":[171],"NLP":[172],"techniques,":[173],"particularly":[174],"ensemble":[175],"methods":[176],"like":[177],"show":[183],"significant":[184],"potential":[185],"in":[186],"classification,":[188],"sufficient":[190],"training":[191],"data":[192],"management":[196],"being":[197],"crucial":[198,216],"for":[199,217],"optimal":[200],"use":[204],"biologically":[206],"meaningful":[207],"methods,":[209],"is":[215],"realistic":[218],"evaluation":[220]},"counts_by_year":[{"year":2025,"cited_by_count":2}],"updated_date":"2026-04-11T08:14:18.477133","created_date":"2025-10-10T00:00:00"}
