{"id":"https://openalex.org/W7134190355","doi":"https://doi.org/10.1093/bib/bbag095","title":"<i>De novo</i> functional protein sequence generation: overcoming data scarcity through regeneration and large language models","display_name":"<i>De novo</i> functional protein sequence generation: overcoming data scarcity through regeneration and large language models","publication_year":2026,"publication_date":"2026-02-11","ids":{"openalex":"https://openalex.org/W7134190355","doi":"https://doi.org/10.1093/bib/bbag095","pmid":"https://pubmed.ncbi.nlm.nih.gov/41795657"},"language":"en","primary_location":{"id":"doi:10.1093/bib/bbag095","is_oa":true,"landing_page_url":"https://doi.org/10.1093/bib/bbag095","pdf_url":null,"source":{"id":"https://openalex.org/S91767247","display_name":"Briefings in Bioinformatics","issn_l":"1467-5463","issn":["1467-5463","1477-4054"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310311648","host_organization_name":"Oxford University Press","host_organization_lineage":["https://openalex.org/P4310311648","https://openalex.org/P4310311647"],"host_organization_lineage_names":["Oxford University Press","University of Oxford"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Briefings in Bioinformatics","raw_type":"journal-article"},"type":"article","indexed_in":["crossref","doaj","pubmed"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://doi.org/10.1093/bib/bbag095","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5122868270","display_name":"Chenyu Ren","orcid":null},"institutions":[{"id":"https://openalex.org/I14243506","display_name":"Hong Kong Polytechnic University","ror":"https://ror.org/0030zas98","country_code":"HK","type":"education","lineage":["https://openalex.org/I14243506"]}],"countries":["HK"],"is_corresponding":true,"raw_author_name":"Chenyu Ren","raw_affiliation_strings":["Department of Applied Mathematics , The Hong Kong Polytechnic University, Hung Hom, Kowloon, Hong Kong,"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Department of Applied Mathematics , The Hong Kong Polytechnic University, Hung Hom, Kowloon, Hong Kong,","institution_ids":["https://openalex.org/I14243506"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128337789","display_name":"Daihai He","orcid":null},"institutions":[{"id":"https://openalex.org/I14243506","display_name":"Hong Kong Polytechnic University","ror":"https://ror.org/0030zas98","country_code":"HK","type":"education","lineage":["https://openalex.org/I14243506"]}],"countries":["HK"],"is_corresponding":false,"raw_author_name":"Daihai He","raw_affiliation_strings":["Department of Applied Mathematics , The Hong Kong Polytechnic University, Hung Hom, Kowloon, Hong Kong,"],"raw_orcid":"https://orcid.org/0000-0003-3253-654X","affiliations":[{"raw_affiliation_string":"Department of Applied Mathematics , The Hong Kong Polytechnic University, Hung Hom, Kowloon, Hong Kong,","institution_ids":["https://openalex.org/I14243506"]}]},{"author_position":"last","author":{"id":null,"display_name":"Jian Huang","orcid":"https://orcid.org/0000-0002-5218-9269"},"institutions":[{"id":"https://openalex.org/I14243506","display_name":"Hong Kong Polytechnic University","ror":"https://ror.org/0030zas98","country_code":"HK","type":"education","lineage":["https://openalex.org/I14243506"]}],"countries":["HK"],"is_corresponding":false,"raw_author_name":"Jian Huang","raw_affiliation_strings":["Department of Applied Mathematics , The Hong Kong Polytechnic University, Hung Hom, Kowloon, Hong Kong,","Department of Data Science and AI , The Hong Kong Polytechnic University, Hung Hom, Kowloon, Hong Kong,"],"raw_orcid":"https://orcid.org/0000-0002-5218-9269","affiliations":[{"raw_affiliation_string":"Department of Applied Mathematics , The Hong Kong Polytechnic University, Hung Hom, Kowloon, Hong Kong,","institution_ids":["https://openalex.org/I14243506"]},{"raw_affiliation_string":"Department of Data Science and AI , The Hong Kong Polytechnic University, Hung Hom, Kowloon, Hong Kong,","institution_ids":["https://openalex.org/I14243506"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5122868270"],"corresponding_institution_ids":["https://openalex.org/I14243506"],"apc_list":{"value":4011,"currency":"USD","value_usd":4011},"apc_paid":{"value":4011,"currency":"USD","value_usd":4011},"fwci":0.0,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":{"value":0.50851037,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"27","issue":"2","first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12254","display_name":"Machine Learning in Bioinformatics","score":0.5494999885559082,"subfield":{"id":"https://openalex.org/subfields/1312","display_name":"Molecular Biology"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},"topics":[{"id":"https://openalex.org/T12254","display_name":"Machine Learning in Bioinformatics","score":0.5494999885559082,"subfield":{"id":"https://openalex.org/subfields/1312","display_name":"Molecular Biology"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},{"id":"https://openalex.org/T10044","display_name":"Protein Structure and Dynamics","score":0.15800000727176666,"subfield":{"id":"https://openalex.org/subfields/1312","display_name":"Molecular Biology"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},{"id":"https://openalex.org/T10521","display_name":"RNA and protein synthesis mechanisms","score":0.03480000048875809,"subfield":{"id":"https://openalex.org/subfields/1312","display_name":"Molecular Biology"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/sequence","display_name":"Sequence (biology)","score":0.722599983215332},{"id":"https://openalex.org/keywords/protein-sequencing","display_name":"Protein sequencing","score":0.6460999846458435},{"id":"https://openalex.org/keywords/consistency","display_name":"Consistency (knowledge bases)","score":0.5490000247955322},{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.4683000147342682},{"id":"https://openalex.org/keywords/generative-grammar","display_name":"Generative grammar","score":0.4675000011920929},{"id":"https://openalex.org/keywords/similarity","display_name":"Similarity (geometry)","score":0.38659998774528503},{"id":"https://openalex.org/keywords/sequence-logo","display_name":"Sequence logo","score":0.38449999690055847},{"id":"https://openalex.org/keywords/peptide-sequence","display_name":"Peptide sequence","score":0.3702999949455261},{"id":"https://openalex.org/keywords/protein-design","display_name":"Protein design","score":0.3637999892234802}],"concepts":[{"id":"https://openalex.org/C2778112365","wikidata":"https://www.wikidata.org/wiki/Q3511065","display_name":"Sequence (biology)","level":2,"score":0.722599983215332},{"id":"https://openalex.org/C10010492","wikidata":"https://www.wikidata.org/wiki/Q3142557","display_name":"Protein sequencing","level":4,"score":0.6460999846458435},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.569100022315979},{"id":"https://openalex.org/C70721500","wikidata":"https://www.wikidata.org/wiki/Q177005","display_name":"Computational biology","level":1,"score":0.5583999752998352},{"id":"https://openalex.org/C2776436953","wikidata":"https://www.wikidata.org/wiki/Q5163215","display_name":"Consistency (knowledge bases)","level":2,"score":0.5490000247955322},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.4683000147342682},{"id":"https://openalex.org/C39890363","wikidata":"https://www.wikidata.org/wiki/Q36108","display_name":"Generative grammar","level":2,"score":0.4675000011920929},{"id":"https://openalex.org/C103278499","wikidata":"https://www.wikidata.org/wiki/Q254465","display_name":"Similarity (geometry)","level":3,"score":0.38659998774528503},{"id":"https://openalex.org/C105082737","wikidata":"https://www.wikidata.org/wiki/Q7452470","display_name":"Sequence logo","level":5,"score":0.38449999690055847},{"id":"https://openalex.org/C86803240","wikidata":"https://www.wikidata.org/wiki/Q420","display_name":"Biology","level":0,"score":0.3797999918460846},{"id":"https://openalex.org/C167625842","wikidata":"https://www.wikidata.org/wiki/Q899763","display_name":"Peptide sequence","level":3,"score":0.3702999949455261},{"id":"https://openalex.org/C152769699","wikidata":"https://www.wikidata.org/wiki/Q410814","display_name":"Protein design","level":3,"score":0.3637999892234802},{"id":"https://openalex.org/C171897839","wikidata":"https://www.wikidata.org/wiki/Q417841","display_name":"Protein family","level":3,"score":0.3555000126361847},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3546000123023987},{"id":"https://openalex.org/C167966045","wikidata":"https://www.wikidata.org/wiki/Q5532625","display_name":"Generative model","level":3,"score":0.35370001196861267},{"id":"https://openalex.org/C40506919","wikidata":"https://www.wikidata.org/wiki/Q7452469","display_name":"Sequence learning","level":2,"score":0.3452000021934509},{"id":"https://openalex.org/C204323151","wikidata":"https://www.wikidata.org/wiki/Q905424","display_name":"Range (aeronautics)","level":2,"score":0.3450999855995178},{"id":"https://openalex.org/C45484198","wikidata":"https://www.wikidata.org/wiki/Q827246","display_name":"Sequence alignment","level":4,"score":0.3294999897480011},{"id":"https://openalex.org/C45475804","wikidata":"https://www.wikidata.org/wiki/Q6675846","display_name":"Loop modeling","level":4,"score":0.3151000142097473},{"id":"https://openalex.org/C47701112","wikidata":"https://www.wikidata.org/wiki/Q735188","display_name":"Protein structure","level":2,"score":0.30329999327659607},{"id":"https://openalex.org/C61053724","wikidata":"https://www.wikidata.org/wiki/Q1154615","display_name":"Sequence analysis","level":3,"score":0.2985999882221222},{"id":"https://openalex.org/C60644358","wikidata":"https://www.wikidata.org/wiki/Q128570","display_name":"Bioinformatics","level":1,"score":0.2743000090122223},{"id":"https://openalex.org/C30711495","wikidata":"https://www.wikidata.org/wiki/Q289411","display_name":"Sequence space","level":3,"score":0.262800008058548},{"id":"https://openalex.org/C136475424","wikidata":"https://www.wikidata.org/wiki/Q7251500","display_name":"Protein structure database","level":4,"score":0.26190000772476196},{"id":"https://openalex.org/C11804247","wikidata":"https://www.wikidata.org/wiki/Q896177","display_name":"Protein\u2013protein interaction","level":2,"score":0.2533000111579895},{"id":"https://openalex.org/C111364199","wikidata":"https://www.wikidata.org/wiki/Q2915896","display_name":"Protein methods","level":4,"score":0.2531999945640564},{"id":"https://openalex.org/C18051474","wikidata":"https://www.wikidata.org/wiki/Q899656","display_name":"Protein structure prediction","level":3,"score":0.25119999051094055}],"mesh":[{"descriptor_ui":"D000098342","descriptor_name":"Large Language Models","qualifier_ui":null,"qualifier_name":null,"is_major_topic":false},{"descriptor_ui":"D000098342","descriptor_name":"Large Language Models","qualifier_ui":null,"qualifier_name":null,"is_major_topic":false},{"descriptor_ui":"D000465","descriptor_name":"Algorithms","qualifier_ui":null,"qualifier_name":null,"is_major_topic":false},{"descriptor_ui":"D000465","descriptor_name":"Algorithms","qualifier_ui":null,"qualifier_name":null,"is_major_topic":false},{"descriptor_ui":"D000595","descriptor_name":"Amino Acid Sequence","qualifier_ui":null,"qualifier_name":null,"is_major_topic":false},{"descriptor_ui":"D000595","descriptor_name":"Amino Acid Sequence","qualifier_ui":null,"qualifier_name":null,"is_major_topic":false},{"descriptor_ui":"D006801","descriptor_name":"Humans","qualifier_ui":null,"qualifier_name":null,"is_major_topic":false},{"descriptor_ui":"D006801","descriptor_name":"Humans","qualifier_ui":null,"qualifier_name":null,"is_major_topic":false},{"descriptor_ui":"D011506","descriptor_name":"Proteins","qualifier_ui":"Q000235","qualifier_name":"genetics","is_major_topic":true},{"descriptor_ui":"D011506","descriptor_name":"Proteins","qualifier_ui":"Q000235","qualifier_name":"genetics","is_major_topic":true},{"descriptor_ui":"D011506","descriptor_name":"Proteins","qualifier_ui":"Q000737","qualifier_name":"chemistry","is_major_topic":true},{"descriptor_ui":"D011506","descriptor_name":"Proteins","qualifier_ui":"Q000737","qualifier_name":"chemistry","is_major_topic":true},{"descriptor_ui":"D016415","descriptor_name":"Sequence Alignment","qualifier_ui":null,"qualifier_name":null,"is_major_topic":false},{"descriptor_ui":"D016415","descriptor_name":"Sequence Alignment","qualifier_ui":null,"qualifier_name":null,"is_major_topic":false},{"descriptor_ui":"D019295","descriptor_name":"Computational Biology","qualifier_ui":"Q000379","qualifier_name":"methods","is_major_topic":true},{"descriptor_ui":"D019295","descriptor_name":"Computational Biology","qualifier_ui":"Q000379","qualifier_name":"methods","is_major_topic":true},{"descriptor_ui":"D020539","descriptor_name":"Sequence Analysis, Protein","qualifier_ui":"Q000379","qualifier_name":"methods","is_major_topic":true},{"descriptor_ui":"D020539","descriptor_name":"Sequence Analysis, Protein","qualifier_ui":"Q000379","qualifier_name":"methods","is_major_topic":true},{"descriptor_ui":"D030562","descriptor_name":"Databases, Protein","qualifier_ui":null,"qualifier_name":null,"is_major_topic":false},{"descriptor_ui":"D030562","descriptor_name":"Databases, Protein","qualifier_ui":null,"qualifier_name":null,"is_major_topic":false}],"locations_count":3,"locations":[{"id":"doi:10.1093/bib/bbag095","is_oa":true,"landing_page_url":"https://doi.org/10.1093/bib/bbag095","pdf_url":null,"source":{"id":"https://openalex.org/S91767247","display_name":"Briefings in Bioinformatics","issn_l":"1467-5463","issn":["1467-5463","1477-4054"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310311648","host_organization_name":"Oxford University Press","host_organization_lineage":["https://openalex.org/P4310311648","https://openalex.org/P4310311647"],"host_organization_lineage_names":["Oxford University Press","University of Oxford"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Briefings in Bioinformatics","raw_type":"journal-article"},{"id":"pmid:41795657","is_oa":false,"landing_page_url":"https://pubmed.ncbi.nlm.nih.gov/41795657","pdf_url":null,"source":{"id":"https://openalex.org/S4306525036","display_name":"PubMed","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I1299303238","host_organization_name":"National Institutes of Health","host_organization_lineage":["https://openalex.org/I1299303238"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Briefings in bioinformatics","raw_type":null},{"id":"pmh:oai:pubmedcentral.nih.gov:12967336","is_oa":true,"landing_page_url":"https://pmc.ncbi.nlm.nih.gov/articles/PMC12967336/","pdf_url":null,"source":{"id":"https://openalex.org/S2764455111","display_name":"PubMed Central","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I1299303238","host_organization_name":"National Institutes of Health","host_organization_lineage":["https://openalex.org/I1299303238"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"Brief Bioinform","raw_type":"Text"}],"best_oa_location":{"id":"doi:10.1093/bib/bbag095","is_oa":true,"landing_page_url":"https://doi.org/10.1093/bib/bbag095","pdf_url":null,"source":{"id":"https://openalex.org/S91767247","display_name":"Briefings in Bioinformatics","issn_l":"1467-5463","issn":["1467-5463","1477-4054"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310311648","host_organization_name":"Oxford University Press","host_organization_lineage":["https://openalex.org/P4310311648","https://openalex.org/P4310311647"],"host_organization_lineage_names":["Oxford University Press","University of Oxford"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Briefings in Bioinformatics","raw_type":"journal-article"},"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G1258512577","display_name":null,"funder_award_id":"P0046811","funder_id":"https://openalex.org/F4320322598","funder_display_name":"Hong Kong Polytechnic University"}],"funders":[{"id":"https://openalex.org/F4320322598","display_name":"Hong Kong Polytechnic University","ror":"https://ror.org/0030zas98"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":20,"referenced_works":["https://openalex.org/W1979762151","https://openalex.org/W2017784829","https://openalex.org/W2077834282","https://openalex.org/W2086806574","https://openalex.org/W2114850508","https://openalex.org/W2127322768","https://openalex.org/W2340987618","https://openalex.org/W2519539312","https://openalex.org/W2588983348","https://openalex.org/W2809879025","https://openalex.org/W3025792695","https://openalex.org/W3135130381","https://openalex.org/W3136918052","https://openalex.org/W3137550543","https://openalex.org/W3177828909","https://openalex.org/W4205773061","https://openalex.org/W4281898027","https://openalex.org/W4286500588","https://openalex.org/W4318071656","https://openalex.org/W4323350175"],"related_works":[],"abstract_inverted_index":{"Proteins":[0],"are":[1],"essential":[2],"components":[3],"of":[4,21,35,66,78,113],"all":[5],"living":[6],"organisms":[7],"and":[8,139,152,171],"play":[9],"a":[10,18,44,92,111,114,125],"critical":[11],"role":[12],"in":[13,47,52],"cellular":[14],"survival.":[15],"They":[16],"have":[17,56,130],"broad":[19],"range":[20],"applications,":[22],"from":[23,143],"clinical":[24],"treatments":[25],"to":[26,167],"material":[27],"engineering.":[28],"This":[29],"versatility":[30],"has":[31],"spurred":[32],"the":[33,48,64,76,141,168,174],"development":[34],"protein":[36,60,68,101,115,120,127,137,162,189],"design,":[37],"with":[38,173],"amino":[39],"acid":[40],"sequence":[41,61,69,121,147,190],"design":[42],"being":[43],"crucial":[45],"step":[46],"process.":[49],"Recent":[50],"advancements":[51],"deep":[53],"generative":[54,186],"models":[55,187],"shown":[57],"promise":[58],"for":[59,71,188],"design.":[62],"However,":[63],"scarcity":[65],"functional":[67,100,126,136],"data":[70],"certain":[72],"types":[73],"can":[74,98],"hinder":[75],"training":[77],"these":[79],"models,":[80,122],"which":[81],"often":[82],"require":[83],"large":[84,119],"datasets.":[85,106],"To":[86],"address":[87],"this":[88],"challenge,":[89],"we":[90],"propose":[91],"hierarchical":[93],"model":[94,133,179],"named":[95],"ProteinRG":[96,107],"that":[97,159],"generate":[99],"sequences":[102,138,163,170],"using":[103],"relatively":[104],"small":[105],"begins":[108],"by":[109],"generating":[110],"representation":[112],"sequence,":[116],"leveraging":[117],"existing":[118],"before":[123],"producing":[124],"sequence.":[128],"We":[129],"tested":[131],"our":[132,160,178],"on":[134],"various":[135],"evaluated":[140],"results":[142],"three":[144],"perspectives:":[145],"multiple":[146],"alignment,":[148],"t-SNE":[149],"distribution":[150],"analysis,":[151],"3D":[153],"structure":[154],"prediction.":[155],"The":[156],"findings":[157],"indicate":[158],"generated":[161],"maintain":[164],"both":[165],"similarity":[166],"original":[169],"consistency":[172],"desired":[175],"functions.":[176],"Moreover,":[177],"demonstrates":[180],"superior":[181],"performance":[182],"compared":[183],"twith":[184],"other":[185],"generation.":[191]},"counts_by_year":[],"updated_date":"2026-03-27T05:58:40.876381","created_date":"2026-03-09T00:00:00"}
