{"id":"https://openalex.org/W7135091315","doi":"https://doi.org/10.48550/arxiv.2603.10302","title":"How to make the most of your masked language model for protein engineering","display_name":"How to make the most of your masked language model for protein engineering","publication_year":2026,"publication_date":"2026-03-11","ids":{"openalex":"https://openalex.org/W7135091315","doi":"https://doi.org/10.48550/arxiv.2603.10302"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.10302","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.10302","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.10302","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5063847354","display_name":"Calvin McCarter","orcid":"https://orcid.org/0000-0002-7257-1350"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"McCarter, Calvin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128891415","display_name":"Nick Bhattacharya","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bhattacharya, Nick","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5057921753","display_name":"Sebastian W. Ober","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ober, Sebastian W.","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5128883386","display_name":"Hunter Elliott","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Elliott, Hunter","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":4,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11016","display_name":"Monoclonal and Polyclonal Antibodies Research","score":0.916100025177002,"subfield":{"id":"https://openalex.org/subfields/2741","display_name":"Radiology, Nuclear Medicine and Imaging"},"field":{"id":"https://openalex.org/fields/27","display_name":"Medicine"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}},"topics":[{"id":"https://openalex.org/T11016","display_name":"Monoclonal and Polyclonal Antibodies Research","score":0.916100025177002,"subfield":{"id":"https://openalex.org/subfields/2741","display_name":"Radiology, Nuclear Medicine and Imaging"},"field":{"id":"https://openalex.org/fields/27","display_name":"Medicine"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}},{"id":"https://openalex.org/T12576","display_name":"vaccines and immunoinformatics approaches","score":0.029500000178813934,"subfield":{"id":"https://openalex.org/subfields/1312","display_name":"Molecular Biology"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},{"id":"https://openalex.org/T11124","display_name":"Protein purification and stability","score":0.026900000870227814,"subfield":{"id":"https://openalex.org/subfields/1312","display_name":"Molecular Biology"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/cognitive-reframing","display_name":"Cognitive reframing","score":0.5949000120162964},{"id":"https://openalex.org/keywords/sampling","display_name":"Sampling (signal processing)","score":0.5864999890327454},{"id":"https://openalex.org/keywords/sample","display_name":"Sample (material)","score":0.5066999793052673},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.3982999920845032},{"id":"https://openalex.org/keywords/in-silico","display_name":"In silico","score":0.34709998965263367},{"id":"https://openalex.org/keywords/natural-language-generation","display_name":"Natural language generation","score":0.3190999925136566},{"id":"https://openalex.org/keywords/modeling-language","display_name":"Modeling language","score":0.29750001430511475},{"id":"https://openalex.org/keywords/statistical-model","display_name":"Statistical model","score":0.2874999940395355}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7337999939918518},{"id":"https://openalex.org/C187029079","wikidata":"https://www.wikidata.org/wiki/Q958679","display_name":"Cognitive reframing","level":2,"score":0.5949000120162964},{"id":"https://openalex.org/C140779682","wikidata":"https://www.wikidata.org/wiki/Q210868","display_name":"Sampling (signal processing)","level":3,"score":0.5864999890327454},{"id":"https://openalex.org/C198531522","wikidata":"https://www.wikidata.org/wiki/Q485146","display_name":"Sample (material)","level":2,"score":0.5066999793052673},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4348999857902527},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.41350001096725464},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.4052000045776367},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.3982999920845032},{"id":"https://openalex.org/C2775905019","wikidata":"https://www.wikidata.org/wiki/Q192572","display_name":"In silico","level":3,"score":0.34709998965263367},{"id":"https://openalex.org/C2776187449","wikidata":"https://www.wikidata.org/wiki/Q1513879","display_name":"Natural language generation","level":3,"score":0.3190999925136566},{"id":"https://openalex.org/C179603123","wikidata":"https://www.wikidata.org/wiki/Q1941921","display_name":"Modeling language","level":3,"score":0.29750001430511475},{"id":"https://openalex.org/C114289077","wikidata":"https://www.wikidata.org/wiki/Q3284399","display_name":"Statistical model","level":2,"score":0.2874999940395355},{"id":"https://openalex.org/C115903868","wikidata":"https://www.wikidata.org/wiki/Q80993","display_name":"Software engineering","level":1,"score":0.2759999930858612},{"id":"https://openalex.org/C136197465","wikidata":"https://www.wikidata.org/wiki/Q1729295","display_name":"Variety (cybernetics)","level":2,"score":0.27459999918937683},{"id":"https://openalex.org/C170130773","wikidata":"https://www.wikidata.org/wiki/Q216378","display_name":"Usability","level":2,"score":0.2694999873638153},{"id":"https://openalex.org/C141218545","wikidata":"https://www.wikidata.org/wiki/Q7521336","display_name":"Simulation language","level":2,"score":0.26910001039505005},{"id":"https://openalex.org/C2984391234","wikidata":"https://www.wikidata.org/wiki/Q195771","display_name":"Sequential sampling","level":3,"score":0.267300009727478},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.2637999951839447},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.262800008058548},{"id":"https://openalex.org/C18762648","wikidata":"https://www.wikidata.org/wiki/Q42213","display_name":"Work (physics)","level":2,"score":0.2558000087738037},{"id":"https://openalex.org/C67186912","wikidata":"https://www.wikidata.org/wiki/Q367664","display_name":"Data modeling","level":2,"score":0.2549999952316284},{"id":"https://openalex.org/C9652623","wikidata":"https://www.wikidata.org/wiki/Q190109","display_name":"Field (mathematics)","level":2,"score":0.25200000405311584}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.10302","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.10302","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.10302","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.10302","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"A":[0],"plethora":[1],"of":[2,83,88,95,125],"protein":[3],"language":[4,42],"models":[5,43,49],"have":[6],"been":[7],"released":[8],"in":[9,53,56,93,112],"recent":[10],"years.":[11],"Yet":[12],"comparatively":[13],"little":[14],"work":[15],"has":[16],"addressed":[17],"how":[18],"to":[19,24],"best":[20],"sample":[21],"from":[22,109],"them":[23],"optimize":[25],"desired":[26],"biological":[27],"properties.":[28],"We":[29],"fill":[30],"this":[31,141],"gap":[32],"by":[33,46],"proposing":[34],"a":[35,89],"flexible,":[36],"effective":[37],"sampling":[38,66,126],"method":[39,127],"for":[40,116],"masked":[41],"(MLMs),":[44],"and":[45,50,55],"systematically":[47],"evaluating":[48,80],"methods":[51],"both":[52],"silico":[54],"vitro":[57,113],"on":[58],"actual":[59],"antibody":[60,118],"therapeutics":[61],"campaigns.":[62],"Firstly,":[63],"we":[64,106],"propose":[65],"with":[67,101],"stochastic":[68],"beam":[69],"search,":[70],"exploiting":[71],"the":[72,81,84,117,134],"fact":[73],"that":[74,123],"MLMs":[75],"are":[76],"remarkably":[77],"efficient":[78],"at":[79,129],"pseudo-perplexity":[82],"entire":[85],"1-edit":[86],"neighborhood":[87],"sequence.":[90],"Reframing":[91],"generation":[92],"terms":[94],"entire-sequence":[96],"evaluation":[97,115],"enables":[98],"flexible":[99],"guidance":[100],"multiple":[102],"optimization":[103],"objectives.":[104],"Secondly,":[105],"report":[107],"results":[108],"our":[110],"extensive":[111],"head-to-head":[114],"engineering":[119],"setting.":[120],"This":[121],"reveals":[122],"choice":[124],"is":[128],"least":[130],"as":[131,133],"impactful":[132],"model":[135],"used,":[136],"motivating":[137],"future":[138],"research":[139],"into":[140],"under-explored":[142],"area.":[143]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-03-13T00:00:00"}
