{"id":"https://openalex.org/W7153990007","doi":"https://doi.org/10.48550/arxiv.2604.08698","title":"EvoLen: Evolution-Guided Tokenization for DNA Language Model","display_name":"EvoLen: Evolution-Guided Tokenization for DNA Language Model","publication_year":2026,"publication_date":"2026-04-09","ids":{"openalex":"https://openalex.org/W7153990007","doi":"https://doi.org/10.48550/arxiv.2604.08698"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.08698","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.08698","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.08698","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5133546474","display_name":"Nan Huang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Huang, Nan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5121113183","display_name":"Xiaoxiao Zhou","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhou, Xiaoxiao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133524734","display_name":"Junxia Cui","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cui, Junxia","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133528522","display_name":"Mario Tapia-Pacheco","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tapia-Pacheco, Mario","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5028223846","display_name":"Tiffany Amariuta","orcid":"https://orcid.org/0000-0003-0121-1726"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Amariuta, Tiffany","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133542292","display_name":"Yang Li","orcid":"https://orcid.org/0009-0008-3409-0936"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Yang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5039500313","display_name":"Jingbo Shang","orcid":"https://orcid.org/0000-0002-7249-4404"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shang, Jingbo","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":7,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12029","display_name":"DNA and Biological Computing","score":0.5415999889373779,"subfield":{"id":"https://openalex.org/subfields/1312","display_name":"Molecular Biology"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},"topics":[{"id":"https://openalex.org/T12029","display_name":"DNA and Biological Computing","score":0.5415999889373779,"subfield":{"id":"https://openalex.org/subfields/1312","display_name":"Molecular Biology"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},{"id":"https://openalex.org/T10222","display_name":"Genomics and Chromatin Dynamics","score":0.18639999628067017,"subfield":{"id":"https://openalex.org/subfields/1312","display_name":"Molecular Biology"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},{"id":"https://openalex.org/T10015","display_name":"Genomics and Phylogenetic Studies","score":0.03999999910593033,"subfield":{"id":"https://openalex.org/subfields/1312","display_name":"Molecular Biology"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/lexical-analysis","display_name":"Lexical analysis","score":0.7840999960899353},{"id":"https://openalex.org/keywords/security-token","display_name":"Security token","score":0.4814000129699707},{"id":"https://openalex.org/keywords/constraint","display_name":"Constraint (computer-aided design)","score":0.47850000858306885},{"id":"https://openalex.org/keywords/sequence","display_name":"Sequence (biology)","score":0.44699999690055847},{"id":"https://openalex.org/keywords/encoding","display_name":"Encoding (memory)","score":0.412200003862381},{"id":"https://openalex.org/keywords/natural-language","display_name":"Natural language","score":0.41110000014305115},{"id":"https://openalex.org/keywords/evolutionary-algorithm","display_name":"Evolutionary algorithm","score":0.3481999933719635},{"id":"https://openalex.org/keywords/decoding-methods","display_name":"Decoding methods","score":0.32899999618530273}],"concepts":[{"id":"https://openalex.org/C176982825","wikidata":"https://www.wikidata.org/wiki/Q835922","display_name":"Lexical analysis","level":2,"score":0.7840999960899353},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7199000120162964},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4927999973297119},{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.4814000129699707},{"id":"https://openalex.org/C2776036281","wikidata":"https://www.wikidata.org/wiki/Q48769818","display_name":"Constraint (computer-aided design)","level":2,"score":0.47850000858306885},{"id":"https://openalex.org/C2778112365","wikidata":"https://www.wikidata.org/wiki/Q3511065","display_name":"Sequence (biology)","level":2,"score":0.44699999690055847},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.435699999332428},{"id":"https://openalex.org/C125411270","wikidata":"https://www.wikidata.org/wiki/Q18653","display_name":"Encoding (memory)","level":2,"score":0.412200003862381},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.41110000014305115},{"id":"https://openalex.org/C159149176","wikidata":"https://www.wikidata.org/wiki/Q14489129","display_name":"Evolutionary algorithm","level":2,"score":0.3481999933719635},{"id":"https://openalex.org/C57273362","wikidata":"https://www.wikidata.org/wiki/Q576722","display_name":"Decoding methods","level":2,"score":0.32899999618530273},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.3179999887943268},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.30489999055862427},{"id":"https://openalex.org/C14036430","wikidata":"https://www.wikidata.org/wiki/Q3736076","display_name":"Function (biology)","level":2,"score":0.2921000123023987},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.28679999709129333},{"id":"https://openalex.org/C126042441","wikidata":"https://www.wikidata.org/wiki/Q1324888","display_name":"Frame (networking)","level":2,"score":0.2842999994754791},{"id":"https://openalex.org/C155092808","wikidata":"https://www.wikidata.org/wiki/Q182557","display_name":"Computational linguistics","level":2,"score":0.2773999869823456},{"id":"https://openalex.org/C121835503","wikidata":"https://www.wikidata.org/wiki/Q2596288","display_name":"Evolutionary programming","level":3,"score":0.26809999346733093},{"id":"https://openalex.org/C43126263","wikidata":"https://www.wikidata.org/wiki/Q128751","display_name":"Source code","level":2,"score":0.2587999999523163},{"id":"https://openalex.org/C105902424","wikidata":"https://www.wikidata.org/wiki/Q1197129","display_name":"Evolutionary computation","level":2,"score":0.2581999897956848},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.25690001249313354},{"id":"https://openalex.org/C51679486","wikidata":"https://www.wikidata.org/wiki/Q380546","display_name":"DNA sequencing","level":3,"score":0.25540000200271606}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.08698","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.08698","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.08698","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.08698","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Tokens":[0],"serve":[1],"as":[2],"the":[3,101,139,160],"basic":[4],"units":[5],"of":[6,162],"representation":[7],"in":[8],"DNA":[9,21,59,75,129],"language":[10],"models":[11],"(DNALMs),":[12],"yet":[13],"their":[14],"design":[15],"remains":[16],"underexplored.":[17],"Unlike":[18],"natural":[19],"language,":[20],"lacks":[22],"inherent":[23],"token":[24,52],"boundaries":[25],"or":[26,177],"predefined":[27],"compositional":[28],"rules,":[29],"making":[30],"tokenization":[31,76,102,189],"a":[32,38,106,143,191],"fundamental":[33],"modeling":[34],"decision":[35],"rather":[36,68],"than":[37,69],"naturally":[39],"specified":[40],"one.":[41],"While":[42],"existing":[43],"approaches":[44],"like":[45,82],"byte-pair":[46],"encoding":[47],"(BPE)":[48],"excel":[49],"at":[50],"capturing":[51],"structures":[53],"that":[54,74,108,188,196],"reflect":[55],"human-generated":[56],"linguistic":[57,70],"regularities,":[58],"is":[60],"organized":[61],"by":[62],"biological":[63],"function":[64],"and":[65,90,148,170,195,204],"evolutionary":[66,88,97,110,125,173,198],"constraint":[67,89],"convention.":[71],"We":[72,95],"argue":[73],"should":[77],"prioritize":[78],"functional":[79,119,163],"sequence":[80,120,164,206],"patterns":[81],"regulatory":[83],"motifs-short,":[84],"recurring":[85],"segments":[86],"under":[87],"typically":[91],"preserved":[92,146],"across":[93,167,181],"species.":[94],"incorporate":[96],"information":[98,199],"directly":[99],"into":[100],"process":[103],"through":[104],"EvoLen,":[105],"tokenizer":[107],"combines":[109],"stratification":[111],"with":[112,152,172],"length-aware":[113,150],"decoding":[114,151],"to":[115,127],"better":[116],"preserve":[117],"motif-scale":[118],"units.":[121],"EvoLen":[122,158],"uses":[123],"cross-species":[124],"signals":[126],"group":[128],"sequences,":[130],"trains":[131],"separate":[132],"BPE":[133,180],"tokenizers":[134],"on":[135],"each":[136],"group,":[137],"merges":[138],"resulting":[140],"vocabularies":[141],"via":[142],"rule":[144],"prioritizing":[145],"patterns,":[147,165],"applies":[149],"dynamic":[153],"programming.":[154],"Through":[155],"controlled":[156],"experiments,":[157],"improves":[159],"preservation":[161],"differentiation":[166],"genomic":[168],"contexts,":[169],"alignment":[171],"constraint,":[174],"while":[175],"matching":[176],"outperforming":[178],"standard":[179],"diverse":[182],"DNALM":[183],"benchmarks.":[184],"These":[185],"results":[186],"demonstrate":[187],"introduces":[190],"critical":[192],"inductive":[193],"bias":[194],"incorporating":[197],"yields":[200],"more":[201],"biologically":[202],"meaningful":[203],"interpretable":[205],"representations.":[207]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-04-14T00:00:00"}
