{"id":"https://openalex.org/W7106685747","doi":"https://doi.org/10.48550/arxiv.2511.17573","title":"Binary BPE: A Family of Cross-Platform Tokenizers for Binary Analysis","display_name":"Binary BPE: A Family of Cross-Platform Tokenizers for Binary Analysis","publication_year":2025,"publication_date":"2025-11-14","ids":{"openalex":"https://openalex.org/W7106685747","doi":"https://doi.org/10.48550/arxiv.2511.17573"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2511.17573","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2511.17573","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2511.17573","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Bommarito, Michael J.","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Bommarito, Michael J.","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":1,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11241","display_name":"Advanced Malware Detection Techniques","score":0.8996999859809875,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11241","display_name":"Advanced Malware Detection Techniques","score":0.8996999859809875,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11424","display_name":"Security and Verification in Computing","score":0.04340000078082085,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12034","display_name":"Digital and Cyber Forensics","score":0.008799999952316284,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/executable","display_name":"Executable","score":0.6636000275611877},{"id":"https://openalex.org/keywords/binary-number","display_name":"Binary number","score":0.661899983882904},{"id":"https://openalex.org/keywords/byte","display_name":"Byte","score":0.5335000157356262},{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.46950000524520874},{"id":"https://openalex.org/keywords/context","display_name":"Context (archaeology)","score":0.44449999928474426},{"id":"https://openalex.org/keywords/software-deployment","display_name":"Software deployment","score":0.4027000069618225},{"id":"https://openalex.org/keywords/binary-code","display_name":"Binary code","score":0.39419999718666077},{"id":"https://openalex.org/keywords/malware","display_name":"Malware","score":0.3643999993801117}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7462000250816345},{"id":"https://openalex.org/C160145156","wikidata":"https://www.wikidata.org/wiki/Q778586","display_name":"Executable","level":2,"score":0.6636000275611877},{"id":"https://openalex.org/C48372109","wikidata":"https://www.wikidata.org/wiki/Q3913","display_name":"Binary number","level":2,"score":0.661899983882904},{"id":"https://openalex.org/C43364308","wikidata":"https://www.wikidata.org/wiki/Q8799","display_name":"Byte","level":2,"score":0.5335000157356262},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.46950000524520874},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.44449999928474426},{"id":"https://openalex.org/C105339364","wikidata":"https://www.wikidata.org/wiki/Q2297740","display_name":"Software deployment","level":2,"score":0.4027000069618225},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3962000012397766},{"id":"https://openalex.org/C63435697","wikidata":"https://www.wikidata.org/wiki/Q864135","display_name":"Binary code","level":3,"score":0.39419999718666077},{"id":"https://openalex.org/C541664917","wikidata":"https://www.wikidata.org/wiki/Q14001","display_name":"Malware","level":2,"score":0.3643999993801117},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.35409998893737793},{"id":"https://openalex.org/C162478608","wikidata":"https://www.wikidata.org/wiki/Q4011369","display_name":"Uncompressed video","level":4,"score":0.34119999408721924},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.33390000462532043},{"id":"https://openalex.org/C500551929","wikidata":"https://www.wikidata.org/wiki/Q8819","display_name":"Unicode","level":2,"score":0.289000004529953},{"id":"https://openalex.org/C2778751112","wikidata":"https://www.wikidata.org/wiki/Q835016","display_name":"Window (computing)","level":2,"score":0.2856000065803528},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.27549999952316284},{"id":"https://openalex.org/C125411270","wikidata":"https://www.wikidata.org/wiki/Q18653","display_name":"Encoding (memory)","level":2,"score":0.27459999918937683},{"id":"https://openalex.org/C2779190172","wikidata":"https://www.wikidata.org/wiki/Q4913888","display_name":"Binary data","level":3,"score":0.27129998803138733},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.2669000029563904},{"id":"https://openalex.org/C102392041","wikidata":"https://www.wikidata.org/wiki/Q592860","display_name":"Sliding window protocol","level":3,"score":0.26649999618530273},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.2621999979019165},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.25929999351501465},{"id":"https://openalex.org/C19407854","wikidata":"https://www.wikidata.org/wiki/Q485","display_name":"Computer virus","level":2,"score":0.25589999556541443},{"id":"https://openalex.org/C128040838","wikidata":"https://www.wikidata.org/wiki/Q1810628","display_name":"Pseudorandom binary sequence","level":3,"score":0.25040000677108765}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2511.17573","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2511.17573","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2511.17573","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2511.17573","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Sequence":[0],"models":[1,187],"for":[2,17,54,140,161,184],"binary":[3,144],"analysis":[4],"are":[5],"bottlenecked":[6],"by":[7],"byte-level":[8],"tokenization:":[9],"raw":[10,152],"bytes":[11],"waste":[12],"precious":[13],"context":[14,149],"window":[15,150],"capacity":[16],"transformers":[18],"and":[19,24,67,75,89,97,158,168,188],"other":[20],"neural":[21],"network":[22],"architectures,":[23,66],"many":[25],"existing":[26],"text-oriented":[27],"tokenizers":[28,53,81,108,137,176],"fail":[29],"on":[30,57,177],"arbitrary":[31],"0x00--0xFF":[32],"sequences.":[33],"To":[34],"address":[35],"this":[36],"issue,":[37],"we":[38],"introduce":[39],"the":[40,134,172],"Binary":[41,135,174],"BPE":[42,136,175],"tokenizer":[43],"family,":[44],"a":[45,58,180],"set":[46],"of":[47,61,84],"cross-platform":[48,116],"Byte":[49],"Pair":[50],"Encoding":[51],"(BPE)":[52],"executables":[55,127],"trained":[56,80,173],"large":[59],"corpus":[60],"binaries":[62],"spanning":[63],"multiple":[64],"platforms,":[65],"operating":[68],"systems,":[69],"including":[70],"Linux,":[71],"Windows,":[72],"macOS,":[73],"Android,":[74],"malware":[76,164],"sources.":[77],"We":[78,170],"release":[79,171],"with":[82],"vocabularies":[83],"4K,":[85],"8K,":[86],"16K,":[87],"32K,":[88],"64K":[90],"tokens,":[91],"enabling":[92,154],"both":[93],"systematic":[94],"scaling":[95],"studies":[96],"practical":[98,159],"deployment":[99,160],"from":[100],"resource-constrained":[101],"edge":[102],"devices":[103],"to":[104],"high-throughput":[105],"datacenters.":[106],"These":[107],"discover":[109],"interpretable":[110],"patterns":[111],"(ELF/PE":[112],"headers,":[113],"instruction":[114],"sequences,":[115],"strings)":[117],"while":[118],"yielding":[119],"multi-byte":[120],"compression":[121],"per":[122,146],"token.":[123],"On":[124],"representative":[125],"uncompressed":[126],"(e.g.,":[128],"ELF/PE/Mach-O":[129],"rather":[130],"than":[131,151],"compressed":[132],"APKs),":[133],"typically":[138],"allow":[139],"roughly":[141],"2-3x":[142],"more":[143,155],"content":[145,162],"fixed-length":[147],"transformer":[148],"bytes,":[153],"efficient":[156],"research":[157],"identification,":[163],"detection,":[165],"reverse":[166],"engineering,":[167],"optimization.":[169],"HuggingFace,":[178],"providing":[179],"drop-in,":[181],"open-source":[182],"foundation":[183],"binary-focused":[185],"language":[186],"context-efficient":[189],"agentic":[190],"tools.":[191]},"counts_by_year":[],"updated_date":"2025-11-27T01:16:37.896743","created_date":"2025-11-27T00:00:00"}
