{"id":"https://openalex.org/W4416016512","doi":"https://doi.org/10.1145/3746252.3761551","title":"Zipf-Gramming: Scaling Byte N-Grams Up to Production Sized Malware Corpora","display_name":"Zipf-Gramming: Scaling Byte N-Grams Up to Production Sized Malware Corpora","publication_year":2025,"publication_date":"2025-11-08","ids":{"openalex":"https://openalex.org/W4416016512","doi":"https://doi.org/10.1145/3746252.3761551"},"language":null,"primary_location":{"id":"doi:10.1145/3746252.3761551","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3746252.3761551","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 34th ACM International Conference on Information and Knowledge Management","raw_type":"proceedings-article"},"type":"article","indexed_in":["arxiv","crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://doi.org/10.1145/3746252.3761551","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5068036546","display_name":"Edward Raff","orcid":"https://orcid.org/0000-0002-9900-1972"},"institutions":[{"id":"https://openalex.org/I2799992204","display_name":"Crowder College","ror":"https://ror.org/04t99en88","country_code":"US","type":"education","lineage":["https://openalex.org/I2799992204"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Edward Raff","raw_affiliation_strings":["CrowdStrike, Austin, TX, USA"],"raw_orcid":"https://orcid.org/0000-0002-9900-1972","affiliations":[{"raw_affiliation_string":"CrowdStrike, Austin, TX, USA","institution_ids":["https://openalex.org/I2799992204"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5026833192","display_name":"Ryan R. Curtin","orcid":"https://orcid.org/0000-0002-9903-8214"},"institutions":[{"id":"https://openalex.org/I1322124587","display_name":"Booz Allen Hamilton (United States)","ror":"https://ror.org/051rcp357","country_code":"US","type":"company","lineage":["https://openalex.org/I1322124587"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Ryan R. Curtin","raw_affiliation_strings":["Booz Allen Hamilton, McLean, VA, USA"],"raw_orcid":"https://orcid.org/0000-0002-9903-8214","affiliations":[{"raw_affiliation_string":"Booz Allen Hamilton, McLean, VA, USA","institution_ids":["https://openalex.org/I1322124587"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5080234803","display_name":"D. Everett","orcid":"https://orcid.org/0000-0003-3593-5255"},"institutions":[{"id":"https://openalex.org/I1322124587","display_name":"Booz Allen Hamilton (United States)","ror":"https://ror.org/051rcp357","country_code":"US","type":"company","lineage":["https://openalex.org/I1322124587"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Derek Everett","raw_affiliation_strings":["Booz Allen Hamilton, McLean, VA, USA"],"raw_orcid":"https://orcid.org/0000-0003-3593-5255","affiliations":[{"raw_affiliation_string":"Booz Allen Hamilton, McLean, VA, USA","institution_ids":["https://openalex.org/I1322124587"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5008388322","display_name":"Robert J. Joyce","orcid":"https://orcid.org/0009-0003-7168-1237"},"institutions":[{"id":"https://openalex.org/I1322124587","display_name":"Booz Allen Hamilton (United States)","ror":"https://ror.org/051rcp357","country_code":"US","type":"company","lineage":["https://openalex.org/I1322124587"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Robert J. Joyce","raw_affiliation_strings":["Booz Allen Hamilton, McLean, VA, USA"],"raw_orcid":"https://orcid.org/0009-0003-7168-1237","affiliations":[{"raw_affiliation_string":"Booz Allen Hamilton, McLean, VA, USA","institution_ids":["https://openalex.org/I1322124587"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5102861683","display_name":"James Holt","orcid":"https://orcid.org/0000-0002-6368-8696"},"institutions":[{"id":"https://openalex.org/I4210113003","display_name":"Physical Sciences (United States)","ror":"https://ror.org/021qvjc46","country_code":"US","type":"company","lineage":["https://openalex.org/I4210113003"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"James Holt","raw_affiliation_strings":["Laboratory for Physical Sciences, College Park, MD, USA"],"raw_orcid":"https://orcid.org/0000-0002-6368-8696","affiliations":[{"raw_affiliation_string":"Laboratory for Physical Sciences, College Park, MD, USA","institution_ids":["https://openalex.org/I4210113003"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.35977366,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"5988","last_page":"5996"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11241","display_name":"Advanced Malware Detection Techniques","score":0.8920999765396118,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11241","display_name":"Advanced Malware Detection Techniques","score":0.8920999765396118,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10743","display_name":"Software Testing and Debugging Techniques","score":0.040699999779462814,"subfield":{"id":"https://openalex.org/subfields/1712","display_name":"Software"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10260","display_name":"Software Engineering Research","score":0.03579999879002571,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/terabyte","display_name":"Terabyte","score":0.7396000027656555},{"id":"https://openalex.org/keywords/exploit","display_name":"Exploit","score":0.7128999829292297},{"id":"https://openalex.org/keywords/byte","display_name":"Byte","score":0.644599974155426},{"id":"https://openalex.org/keywords/malware","display_name":"Malware","score":0.5788999795913696},{"id":"https://openalex.org/keywords/executable","display_name":"Executable","score":0.5123999714851379},{"id":"https://openalex.org/keywords/software-deployment","display_name":"Software deployment","score":0.4925999939441681},{"id":"https://openalex.org/keywords/bigram","display_name":"Bigram","score":0.4715000092983246},{"id":"https://openalex.org/keywords/classifier","display_name":"Classifier (UML)","score":0.45410001277923584}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8677999973297119},{"id":"https://openalex.org/C199683683","wikidata":"https://www.wikidata.org/wiki/Q8799","display_name":"Terabyte","level":2,"score":0.7396000027656555},{"id":"https://openalex.org/C165696696","wikidata":"https://www.wikidata.org/wiki/Q11287","display_name":"Exploit","level":2,"score":0.7128999829292297},{"id":"https://openalex.org/C43364308","wikidata":"https://www.wikidata.org/wiki/Q8799","display_name":"Byte","level":2,"score":0.644599974155426},{"id":"https://openalex.org/C541664917","wikidata":"https://www.wikidata.org/wiki/Q14001","display_name":"Malware","level":2,"score":0.5788999795913696},{"id":"https://openalex.org/C160145156","wikidata":"https://www.wikidata.org/wiki/Q778586","display_name":"Executable","level":2,"score":0.5123999714851379},{"id":"https://openalex.org/C105339364","wikidata":"https://www.wikidata.org/wiki/Q2297740","display_name":"Software deployment","level":2,"score":0.4925999939441681},{"id":"https://openalex.org/C108757681","wikidata":"https://www.wikidata.org/wiki/Q2773912","display_name":"Bigram","level":3,"score":0.4715000092983246},{"id":"https://openalex.org/C95623464","wikidata":"https://www.wikidata.org/wiki/Q1096149","display_name":"Classifier (UML)","level":2,"score":0.45410001277923584},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.45170000195503235},{"id":"https://openalex.org/C82876162","wikidata":"https://www.wikidata.org/wiki/Q17096504","display_name":"Latency (audio)","level":2,"score":0.41290000081062317},{"id":"https://openalex.org/C125932096","wikidata":"https://www.wikidata.org/wiki/Q205472","display_name":"Zipf's law","level":2,"score":0.39570000767707825},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3765000104904175},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.3594000041484833},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.32690000534057617},{"id":"https://openalex.org/C46686674","wikidata":"https://www.wikidata.org/wiki/Q466303","display_name":"Boosting (machine learning)","level":2,"score":0.30570000410079956},{"id":"https://openalex.org/C68339613","wikidata":"https://www.wikidata.org/wiki/Q1549489","display_name":"Speedup","level":2,"score":0.29580000042915344},{"id":"https://openalex.org/C117978034","wikidata":"https://www.wikidata.org/wiki/Q5422192","display_name":"Extractor","level":2,"score":0.2849999964237213},{"id":"https://openalex.org/C99844830","wikidata":"https://www.wikidata.org/wiki/Q102441924","display_name":"Scaling","level":2,"score":0.2825999855995178},{"id":"https://openalex.org/C2778755073","wikidata":"https://www.wikidata.org/wiki/Q10858537","display_name":"Scale (ratio)","level":2,"score":0.2775999903678894},{"id":"https://openalex.org/C2778348673","wikidata":"https://www.wikidata.org/wiki/Q739302","display_name":"Production (economics)","level":2,"score":0.2741999924182892},{"id":"https://openalex.org/C81669768","wikidata":"https://www.wikidata.org/wiki/Q2359161","display_name":"Precision and recall","level":2,"score":0.2502000033855438}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1145/3746252.3761551","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3746252.3761551","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 34th ACM International Conference on Information and Knowledge Management","raw_type":"proceedings-article"},{"id":"pmh:oai:arXiv.org:2511.13808","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2511.13808","pdf_url":"https://arxiv.org/pdf/2511.13808","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"}],"best_oa_location":{"id":"doi:10.1145/3746252.3761551","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3746252.3761551","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 34th ACM International Conference on Information and Knowledge Management","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":37,"referenced_works":["https://openalex.org/W1893133781","https://openalex.org/W1986157381","https://openalex.org/W1989015215","https://openalex.org/W2000042664","https://openalex.org/W2011998344","https://openalex.org/W2018175892","https://openalex.org/W2057253402","https://openalex.org/W2062021443","https://openalex.org/W2064274762","https://openalex.org/W2114515438","https://openalex.org/W2170529403","https://openalex.org/W2220253305","https://openalex.org/W2518866423","https://openalex.org/W2792663539","https://openalex.org/W2806076636","https://openalex.org/W2894779647","https://openalex.org/W2900633536","https://openalex.org/W2911716162","https://openalex.org/W2913229995","https://openalex.org/W2946595319","https://openalex.org/W2950627632","https://openalex.org/W2964090036","https://openalex.org/W3007809473","https://openalex.org/W3084078945","https://openalex.org/W3123017167","https://openalex.org/W3157781335","https://openalex.org/W3200455702","https://openalex.org/W3206660056","https://openalex.org/W4281388232","https://openalex.org/W4385877224","https://openalex.org/W4386831175","https://openalex.org/W4388867297","https://openalex.org/W4388886856","https://openalex.org/W4400518663","https://openalex.org/W4406459472","https://openalex.org/W4407209053","https://openalex.org/W4412876938"],"related_works":[],"abstract_inverted_index":{"A":[0],"classifier":[1],"using":[2],"byte":[3],"n-grams":[4,74],"as":[5],"features":[6],"is":[7,102],"the":[8,47,65,70,81,86,108,149,156],"only":[9],"approach":[10,146],"we":[11,90,117],"have":[12,55],"found":[13,42],"fast":[14],"enough":[15],"to":[16,58,64,94,104,120,130,163],"meet":[17],"requirements":[18],"in":[19,34,133],"size":[20],"(sub":[21,29],"2":[22],"MB),":[23],"speed":[24],"(multiple":[25],"GB/s),":[26],"and":[27,127,142,155,160],"latency":[28],"10":[30],"ms)":[31],"for":[32],"deployment":[33],"numerous":[35],"malware":[36],"detection":[37],"scenarios.":[38],"However,":[39],"we've":[40],"consistently":[41],"that":[43,101,144],"6-8":[44],"grams":[45],"achieve":[46,164],"best":[48,110],"accuracy":[49],"on":[50],"our":[51,113,123,145],"production":[52,124],"deployments":[53],"but":[54],"been":[56],"unable":[57],"deploy":[59],"regularly":[60],"updated":[61],"models":[62,85],"due":[63],"high":[66],"cost":[67],"of":[68,77,88],"finding":[69],"top-k":[71,98,150],"most":[72],"frequent":[73],"over":[75],"terabytes":[76],"executable":[78],"programs.":[79],"Because":[80],"Zipfian":[82],"distribution":[83,87],"well":[84],"n-grams,":[89],"exploit":[91],"its":[92],"properties":[93],"develop":[95],"a":[96],"new":[97,114,137],"n-gram":[99],"extractor":[100],"up":[103,122,129],"35\u00d7":[105],"faster":[106],"than":[107],"previous":[109],"alternative.":[111],"Using":[112],"Zipf-Gramming":[115],"algorithm,":[116],"are":[118],"able":[119],"scale":[121],"training":[125],"set":[126],"obtain":[128],"30%":[131],"improvement":[132],"AUC":[134],"at":[135],"detecting":[136],"malware.":[138],"We":[139],"show":[140],"theoretically":[141],"empirically":[143],"will":[147],"select":[148],"items":[151],"with":[152],"little":[153],"error":[154],"interplay":[157],"between":[158],"theory":[159],"engineering":[161],"required":[162],"these":[165],"results.":[166]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-11-08T00:00:00"}
