{"id":"https://openalex.org/W4386831175","doi":"https://doi.org/10.1145/3624567","title":"Semi-Supervised Classification of Malware Families Under Extreme Class Imbalance via Hierarchical Non-Negative Matrix Factorization with Automatic Model Selection","display_name":"Semi-Supervised Classification of Malware Families Under Extreme Class Imbalance via Hierarchical Non-Negative Matrix Factorization with Automatic Model Selection","publication_year":2023,"publication_date":"2023-09-18","ids":{"openalex":"https://openalex.org/W4386831175","doi":"https://doi.org/10.1145/3624567"},"language":"en","primary_location":{"id":"doi:10.1145/3624567","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3624567","pdf_url":null,"source":{"id":"https://openalex.org/S4210174050","display_name":"ACM Transactions on Privacy and Security","issn_l":"2471-2566","issn":["2471-2566","2471-2574"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ACM Transactions on Privacy and Security","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5066261993","display_name":"Maksim E. Eren","orcid":"https://orcid.org/0000-0002-4362-0256"},"institutions":[{"id":"https://openalex.org/I1343871089","display_name":"Los Alamos National Laboratory","ror":"https://ror.org/01e41cf67","country_code":"US","type":"facility","lineage":["https://openalex.org/I1330989302","https://openalex.org/I1343871089","https://openalex.org/I198811213","https://openalex.org/I4210120050"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Maksim E. Eren","raw_affiliation_strings":["Advanced Research in Cyber Systems, Los Alamos National Laboratory, USA"],"raw_orcid":"https://orcid.org/0000-0002-4362-0256","affiliations":[{"raw_affiliation_string":"Advanced Research in Cyber Systems, Los Alamos National Laboratory, USA","institution_ids":["https://openalex.org/I1343871089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5012614331","display_name":"Manish Bhattarai","orcid":"https://orcid.org/0000-0002-1421-3643"},"institutions":[{"id":"https://openalex.org/I1343871089","display_name":"Los Alamos National Laboratory","ror":"https://ror.org/01e41cf67","country_code":"US","type":"facility","lineage":["https://openalex.org/I1330989302","https://openalex.org/I1343871089","https://openalex.org/I198811213","https://openalex.org/I4210120050"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Manish Bhattarai","raw_affiliation_strings":["Theoretical Division, Los Alamos National Laboratory, USA"],"raw_orcid":"https://orcid.org/0000-0002-1421-3643","affiliations":[{"raw_affiliation_string":"Theoretical Division, Los Alamos National Laboratory, USA","institution_ids":["https://openalex.org/I1343871089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5008388322","display_name":"Robert J. Joyce","orcid":"https://orcid.org/0009-0003-7168-1237"},"institutions":[{"id":"https://openalex.org/I1322124587","display_name":"Booz Allen Hamilton (United States)","ror":"https://ror.org/051rcp357","country_code":"US","type":"company","lineage":["https://openalex.org/I1322124587"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Robert J. Joyce","raw_affiliation_strings":["Machine Learning Research Group, Booz Allen Hamilton, USA"],"raw_orcid":"https://orcid.org/0009-0003-7168-1237","affiliations":[{"raw_affiliation_string":"Machine Learning Research Group, Booz Allen Hamilton, USA","institution_ids":["https://openalex.org/I1322124587"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5068036546","display_name":"Edward Raff","orcid":"https://orcid.org/0000-0002-9900-1972"},"institutions":[{"id":"https://openalex.org/I1322124587","display_name":"Booz Allen Hamilton (United States)","ror":"https://ror.org/051rcp357","country_code":"US","type":"company","lineage":["https://openalex.org/I1322124587"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Edward Raff","raw_affiliation_strings":["Machine Learning Research Group, Booz Allen Hamilton, USA"],"raw_orcid":"https://orcid.org/0000-0002-9900-1972","affiliations":[{"raw_affiliation_string":"Machine Learning Research Group, Booz Allen Hamilton, USA","institution_ids":["https://openalex.org/I1322124587"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5025012064","display_name":"Charles Nicholas","orcid":"https://orcid.org/0000-0001-9494-7139"},"institutions":[{"id":"https://openalex.org/I79272384","display_name":"University of Maryland, Baltimore County","ror":"https://ror.org/02qskvh78","country_code":"US","type":"education","lineage":["https://openalex.org/I79272384"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Charles Nicholas","raw_affiliation_strings":["Department of Computer Science and Electrical Engineering, University of Maryland Baltimore County, USA"],"raw_orcid":"https://orcid.org/0000-0001-9494-7139","affiliations":[{"raw_affiliation_string":"Department of Computer Science and Electrical Engineering, University of Maryland Baltimore County, USA","institution_ids":["https://openalex.org/I79272384"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5079029372","display_name":"Boian S. Alexandrov","orcid":"https://orcid.org/0000-0001-8636-4603"},"institutions":[{"id":"https://openalex.org/I1343871089","display_name":"Los Alamos National Laboratory","ror":"https://ror.org/01e41cf67","country_code":"US","type":"facility","lineage":["https://openalex.org/I1330989302","https://openalex.org/I1343871089","https://openalex.org/I198811213","https://openalex.org/I4210120050"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Boian S. Alexandrov","raw_affiliation_strings":["Theoretical Division, Los Alamos National Laboratory, USA"],"raw_orcid":"https://orcid.org/0000-0001-8636-4603","affiliations":[{"raw_affiliation_string":"Theoretical Division, Los Alamos National Laboratory, USA","institution_ids":["https://openalex.org/I1343871089"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":6,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":2.2171,"has_fulltext":false,"cited_by_count":12,"citation_normalized_percentile":{"value":0.88943915,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":90,"max":99},"biblio":{"volume":"26","issue":"4","first_page":"1","last_page":"27"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11241","display_name":"Advanced Malware Detection Techniques","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11241","display_name":"Advanced Malware Detection Techniques","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11512","display_name":"Anomaly Detection Techniques and Applications","score":0.9973000288009644,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10167","display_name":"Influenza Virus Research Studies","score":0.9896000027656555,"subfield":{"id":"https://openalex.org/subfields/2713","display_name":"Epidemiology"},"field":{"id":"https://openalex.org/fields/27","display_name":"Medicine"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/malware","display_name":"Malware","score":0.865829586982727},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7578817009925842},{"id":"https://openalex.org/keywords/classifier","display_name":"Classifier (UML)","score":0.6259301900863647},{"id":"https://openalex.org/keywords/machine-learning","display_name":"Machine learning","score":0.6074148416519165},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.6054293513298035},{"id":"https://openalex.org/keywords/exploit","display_name":"Exploit","score":0.5325943827629089},{"id":"https://openalex.org/keywords/identification","display_name":"Identification (biology)","score":0.5278663039207458},{"id":"https://openalex.org/keywords/matrix-decomposition","display_name":"Matrix decomposition","score":0.5102601051330566},{"id":"https://openalex.org/keywords/factorization","display_name":"Factorization","score":0.44640350341796875},{"id":"https://openalex.org/keywords/class","display_name":"Class (philosophy)","score":0.4321499764919281},{"id":"https://openalex.org/keywords/data-mining","display_name":"Data mining","score":0.4297848343849182},{"id":"https://openalex.org/keywords/algorithm","display_name":"Algorithm","score":0.13571658730506897},{"id":"https://openalex.org/keywords/computer-security","display_name":"Computer security","score":0.11560991406440735}],"concepts":[{"id":"https://openalex.org/C541664917","wikidata":"https://www.wikidata.org/wiki/Q14001","display_name":"Malware","level":2,"score":0.865829586982727},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7578817009925842},{"id":"https://openalex.org/C95623464","wikidata":"https://www.wikidata.org/wiki/Q1096149","display_name":"Classifier (UML)","level":2,"score":0.6259301900863647},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.6074148416519165},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6054293513298035},{"id":"https://openalex.org/C165696696","wikidata":"https://www.wikidata.org/wiki/Q11287","display_name":"Exploit","level":2,"score":0.5325943827629089},{"id":"https://openalex.org/C116834253","wikidata":"https://www.wikidata.org/wiki/Q2039217","display_name":"Identification (biology)","level":2,"score":0.5278663039207458},{"id":"https://openalex.org/C42355184","wikidata":"https://www.wikidata.org/wiki/Q1361088","display_name":"Matrix decomposition","level":3,"score":0.5102601051330566},{"id":"https://openalex.org/C187834632","wikidata":"https://www.wikidata.org/wiki/Q188804","display_name":"Factorization","level":2,"score":0.44640350341796875},{"id":"https://openalex.org/C2777212361","wikidata":"https://www.wikidata.org/wiki/Q5127848","display_name":"Class (philosophy)","level":2,"score":0.4321499764919281},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.4297848343849182},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.13571658730506897},{"id":"https://openalex.org/C38652104","wikidata":"https://www.wikidata.org/wiki/Q3510521","display_name":"Computer security","level":1,"score":0.11560991406440735},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0},{"id":"https://openalex.org/C86803240","wikidata":"https://www.wikidata.org/wiki/Q420","display_name":"Biology","level":0,"score":0.0},{"id":"https://openalex.org/C59822182","wikidata":"https://www.wikidata.org/wiki/Q441","display_name":"Botany","level":1,"score":0.0},{"id":"https://openalex.org/C158693339","wikidata":"https://www.wikidata.org/wiki/Q190524","display_name":"Eigenvalues and eigenvectors","level":2,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3624567","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3624567","pdf_url":null,"source":{"id":"https://openalex.org/S4210174050","display_name":"ACM Transactions on Privacy and Security","issn_l":"2471-2566","issn":["2471-2566","2471-2574"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ACM Transactions on Privacy and Security","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Partnerships for the goals","id":"https://metadata.un.org/sdg/17","score":0.49000000953674316}],"awards":[{"id":"https://openalex.org/G2356915424","display_name":null,"funder_award_id":"89233218CNA000001","funder_id":"https://openalex.org/F4320337547","funder_display_name":"Laboratory Directed Research and Development"},{"id":"https://openalex.org/G2776190829","display_name":null,"funder_award_id":"Contract No. 89233218CNA000001","funder_id":"https://openalex.org/F4320306084","funder_display_name":"U.S. Department of Energy"},{"id":"https://openalex.org/G3083956167","display_name":null,"funder_award_id":"20190020DR","funder_id":"https://openalex.org/F4320337547","funder_display_name":"Laboratory Directed Research and Development"},{"id":"https://openalex.org/G3495016435","display_name":null,"funder_award_id":"20190020DR","funder_id":"https://openalex.org/F4320338304","funder_display_name":"Los Alamos National Laboratory"},{"id":"https://openalex.org/G4748082740","display_name":null,"funder_award_id":"Contract No. 89233218CNA000001","funder_id":"https://openalex.org/F4320338304","funder_display_name":"Los Alamos National Laboratory"},{"id":"https://openalex.org/G6995210142","display_name":null,"funder_award_id":"89233218CNA000001","funder_id":"https://openalex.org/F4320332369","funder_display_name":"National Nuclear Security Administration"},{"id":"https://openalex.org/G7500863821","display_name":null,"funder_award_id":"89233218CNA000001","funder_id":"https://openalex.org/F4320338304","funder_display_name":"Los Alamos National Laboratory"},{"id":"https://openalex.org/G7977534188","display_name":null,"funder_award_id":"No. 89233218CNA000001","funder_id":"https://openalex.org/F4320332369","funder_display_name":"National Nuclear Security Administration"},{"id":"https://openalex.org/G8659807574","display_name":null,"funder_award_id":"89233218CNA000001","funder_id":"https://openalex.org/F4320306084","funder_display_name":"U.S. Department of Energy"},{"id":"https://openalex.org/G8770029225","display_name":null,"funder_award_id":"No. 89233218CNA000001","funder_id":"https://openalex.org/F4320338304","funder_display_name":"Los Alamos National Laboratory"}],"funders":[{"id":"https://openalex.org/F4320306084","display_name":"U.S. Department of Energy","ror":"https://ror.org/01bj3aw27"},{"id":"https://openalex.org/F4320332369","display_name":"National Nuclear Security Administration","ror":"https://ror.org/03sk1we31"},{"id":"https://openalex.org/F4320337547","display_name":"Laboratory Directed Research and Development","ror":"https://ror.org/01e41cf67"},{"id":"https://openalex.org/F4320338304","display_name":"Los Alamos National Laboratory","ror":"https://ror.org/01e41cf67"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":58,"referenced_works":["https://openalex.org/W191494004","https://openalex.org/W1902027874","https://openalex.org/W1965026065","https://openalex.org/W1981221397","https://openalex.org/W1986966428","https://openalex.org/W1987971958","https://openalex.org/W2002011878","https://openalex.org/W2008255210","https://openalex.org/W2010065958","https://openalex.org/W2013029404","https://openalex.org/W2069870183","https://openalex.org/W2077583079","https://openalex.org/W2095088894","https://openalex.org/W2101210369","https://openalex.org/W2116542188","https://openalex.org/W2118375674","https://openalex.org/W2120379285","https://openalex.org/W2136787567","https://openalex.org/W2152061559","https://openalex.org/W2187089797","https://openalex.org/W2307930854","https://openalex.org/W2556522401","https://openalex.org/W2742475488","https://openalex.org/W2747969544","https://openalex.org/W2754585330","https://openalex.org/W2791541601","https://openalex.org/W2795063185","https://openalex.org/W2914874661","https://openalex.org/W2931858311","https://openalex.org/W2941306455","https://openalex.org/W2949676527","https://openalex.org/W2963815651","https://openalex.org/W2980212355","https://openalex.org/W2980750320","https://openalex.org/W3035578002","https://openalex.org/W3036886045","https://openalex.org/W3087125040","https://openalex.org/W3098166657","https://openalex.org/W3102476541","https://openalex.org/W3130402426","https://openalex.org/W3172901711","https://openalex.org/W3191250776","https://openalex.org/W3193658714","https://openalex.org/W4221146353","https://openalex.org/W4221146360","https://openalex.org/W4241184565","https://openalex.org/W4247773991","https://openalex.org/W4251841127","https://openalex.org/W4287121745","https://openalex.org/W4287666927","https://openalex.org/W4287671087","https://openalex.org/W4287753870","https://openalex.org/W4296819410","https://openalex.org/W4360765064","https://openalex.org/W4367839941","https://openalex.org/W4386556553","https://openalex.org/W4390187512","https://openalex.org/W6888810450"],"related_works":["https://openalex.org/W17155033","https://openalex.org/W3207760230","https://openalex.org/W1496222301","https://openalex.org/W1590307681","https://openalex.org/W4312814274","https://openalex.org/W4285370786","https://openalex.org/W2296488620","https://openalex.org/W1966145327","https://openalex.org/W1973739845","https://openalex.org/W119752240"],"abstract_inverted_index":{"Identification":[0],"of":[1,16,37,57,80,120,143,146,157,175,195,205,212,221,256],"the":[2,14,17,35,48,55,73,108,117,121,144,154,158,193,203,206,238],"family":[3,123],"to":[4,34,50,169],"which":[5,105,166,188],"a":[6,77,86,100,163,209],"malware":[7,18,70,83,122,159,171,197,228],"specimen":[8],"belongs":[9],"is":[10,128,215],"essential":[11],"in":[12,116,192],"understanding":[13],"behavior":[15],"and":[19,54,68,98,199,226,248],"developing":[20],"mitigation":[21],"strategies.":[22],"Solutions":[23],"proposed":[24],"by":[25],"prior":[26],"work,":[27],"however,":[28],"are":[29],"often":[30],"not":[31],"practicable":[32],"due":[33],"lack":[36],"realistic":[38],"evaluation":[39],"factors.":[40],"These":[41],"factors":[42],"include":[43],"learning":[44],"under":[45,173],"class":[46,177],"imbalance,":[47],"ability":[49],"identify":[51],"new":[52,69],"malware,":[53],"cost":[56],"production-quality":[58],"labeled":[59,82,213],"data.":[60],"In":[61,91,241],"practice,":[62],"deployed":[63],"models":[64,251],"face":[65],"prominent,":[66],"rare,":[67],"families.":[71],"At":[72],"same":[74],"time,":[75],"obtaining":[76],"large":[78],"quantity":[79,211],"up-to-date":[81],"for":[84],"training":[85],"model":[87,136,207],"can":[88,113,181],"be":[89,114],"expensive.":[90],"this":[92],"article,":[93],"we":[94,106,152,244],"address":[95],"these":[96],"problems":[97],"propose":[99],"novel":[101,196],"hierarchical":[102,155],"semi-supervised":[103,164,249],"algorithm,":[104],"call":[107],"HNMFk":[109,149],"Classifier":[110,150],",":[111,151],"that":[112,138],"used":[115],"early":[118],"stages":[119],"labeling":[124],"process.":[125],"Our":[126,179],"method":[127],"based":[129],"on":[130],"non-negative":[131],"matrix":[132],"factorization":[133],"with":[134,140,162,201,252],"automatic":[135],"selection,":[137],"is,":[139],"an":[141,253],"estimation":[142],"number":[145],"clusters.":[147],"With":[148],"exploit":[153],"structure":[156],"data":[160,214],"together":[161],"setup,":[165],"enables":[167],"us":[168],"classify":[170],"families":[172,198],"conditions":[174],"extreme":[176],"imbalance.":[178],"solution":[180],"perform":[182,218],"abstaining":[183],"predictions,":[184],"or":[185],"rejection":[186],"option,":[187],"yields":[189],"promising":[190],"results":[191],"identification":[194],"helps":[200],"maintaining":[202],"performance":[204],"when":[208],"low":[210],"used.":[216],"We":[217],"bulk":[219],"classification":[220],"nearly":[222,234],"2,900":[223],"both":[224,246],"rare":[225],"prominent":[227],"families,":[229],"through":[230],"static":[231],"analysis,":[232],"using":[233],"388,000":[235],"samples":[236],"from":[237],"EMBER-2018":[239],"corpus.":[240],"our":[242],"experiments,":[243],"surpass":[245],"supervised":[247],"baseline":[250],"F1":[254],"score":[255],"0.80.":[257]},"counts_by_year":[{"year":2026,"cited_by_count":2},{"year":2025,"cited_by_count":4},{"year":2024,"cited_by_count":5},{"year":2023,"cited_by_count":1}],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-10T00:00:00"}
