{"id":"https://openalex.org/W3215448314","doi":"https://doi.org/10.1109/isi53945.2021.9624765","title":"Identifying and Categorizing Malicious Content on Paste Sites: A Neural Topic Modeling Approach","display_name":"Identifying and Categorizing Malicious Content on Paste Sites: A Neural Topic Modeling Approach","publication_year":2021,"publication_date":"2021-11-02","ids":{"openalex":"https://openalex.org/W3215448314","doi":"https://doi.org/10.1109/isi53945.2021.9624765","mag":"3215448314"},"language":"en","primary_location":{"id":"doi:10.1109/isi53945.2021.9624765","is_oa":false,"landing_page_url":"https://doi.org/10.1109/isi53945.2021.9624765","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2021 IEEE International Conference on Intelligence and Security Informatics (ISI)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5010067332","display_name":"Tala Vahedi","orcid":"https://orcid.org/0000-0002-2353-7244"},"institutions":[{"id":"https://openalex.org/I138006243","display_name":"University of Arizona","ror":"https://ror.org/03m2x1q45","country_code":"US","type":"education","lineage":["https://openalex.org/I138006243"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Tala Vahedi","raw_affiliation_strings":["University of Arizona, Tucson, AZ, USA"],"affiliations":[{"raw_affiliation_string":"University of Arizona, Tucson, AZ, USA","institution_ids":["https://openalex.org/I138006243"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5075447553","display_name":"Benjamin Ampel","orcid":"https://orcid.org/0000-0003-0603-0270"},"institutions":[{"id":"https://openalex.org/I138006243","display_name":"University of Arizona","ror":"https://ror.org/03m2x1q45","country_code":"US","type":"education","lineage":["https://openalex.org/I138006243"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Benjamin Ampel","raw_affiliation_strings":["University of Arizona, Tucson, AZ, USA"],"affiliations":[{"raw_affiliation_string":"University of Arizona, Tucson, AZ, USA","institution_ids":["https://openalex.org/I138006243"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5038811607","display_name":"Sagar Samtani","orcid":"https://orcid.org/0000-0002-4513-805X"},"institutions":[{"id":"https://openalex.org/I4210119109","display_name":"Indiana University Bloomington","ror":"https://ror.org/02k40bc56","country_code":"US","type":"education","lineage":["https://openalex.org/I4210119109","https://openalex.org/I592451"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Sagar Samtani","raw_affiliation_strings":["Indiana University, Bloomington, IN, USA"],"affiliations":[{"raw_affiliation_string":"Indiana University, Bloomington, IN, USA","institution_ids":["https://openalex.org/I4210119109"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5017102020","display_name":"Hsinchun Chen","orcid":"https://orcid.org/0000-0003-3251-2433"},"institutions":[{"id":"https://openalex.org/I138006243","display_name":"University of Arizona","ror":"https://ror.org/03m2x1q45","country_code":"US","type":"education","lineage":["https://openalex.org/I138006243"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Hsinchun Chen","raw_affiliation_strings":["University of Arizona, Tucson, AZ, USA"],"affiliations":[{"raw_affiliation_string":"University of Arizona, Tucson, AZ, USA","institution_ids":["https://openalex.org/I138006243"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5010067332"],"corresponding_institution_ids":["https://openalex.org/I138006243"],"apc_list":null,"apc_paid":null,"fwci":1.9279,"has_fulltext":false,"cited_by_count":12,"citation_normalized_percentile":{"value":0.89029063,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":89,"max":98},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"6"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12519","display_name":"Cybercrime and Law Enforcement Studies","score":0.998199999332428,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12519","display_name":"Cybercrime and Law Enforcement Studies","score":0.998199999332428,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11644","display_name":"Spam and Phishing Detection","score":0.9959999918937683,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9890999794006348,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/latent-dirichlet-allocation","display_name":"Latent Dirichlet allocation","score":0.8609733581542969},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7502206563949585},{"id":"https://openalex.org/keywords/topic-model","display_name":"Topic model","score":0.5824291110038757},{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.48917892575263977},{"id":"https://openalex.org/keywords/social-media","display_name":"Social media","score":0.45369815826416016},{"id":"https://openalex.org/keywords/encoder","display_name":"Encoder","score":0.4353906214237213},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4299841821193695},{"id":"https://openalex.org/keywords/malware","display_name":"Malware","score":0.41383013129234314},{"id":"https://openalex.org/keywords/machine-learning","display_name":"Machine learning","score":0.34969213604927063},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.3410201668739319},{"id":"https://openalex.org/keywords/computer-security","display_name":"Computer security","score":0.28725308179855347},{"id":"https://openalex.org/keywords/world-wide-web","display_name":"World Wide Web","score":0.25583213567733765},{"id":"https://openalex.org/keywords/engineering","display_name":"Engineering","score":0.09720295667648315}],"concepts":[{"id":"https://openalex.org/C500882744","wikidata":"https://www.wikidata.org/wiki/Q269236","display_name":"Latent Dirichlet allocation","level":3,"score":0.8609733581542969},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7502206563949585},{"id":"https://openalex.org/C171686336","wikidata":"https://www.wikidata.org/wiki/Q3532085","display_name":"Topic model","level":2,"score":0.5824291110038757},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.48917892575263977},{"id":"https://openalex.org/C518677369","wikidata":"https://www.wikidata.org/wiki/Q202833","display_name":"Social media","level":2,"score":0.45369815826416016},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.4353906214237213},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4299841821193695},{"id":"https://openalex.org/C541664917","wikidata":"https://www.wikidata.org/wiki/Q14001","display_name":"Malware","level":2,"score":0.41383013129234314},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.34969213604927063},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3410201668739319},{"id":"https://openalex.org/C38652104","wikidata":"https://www.wikidata.org/wiki/Q3510521","display_name":"Computer security","level":1,"score":0.28725308179855347},{"id":"https://openalex.org/C136764020","wikidata":"https://www.wikidata.org/wiki/Q466","display_name":"World Wide Web","level":1,"score":0.25583213567733765},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.09720295667648315},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0},{"id":"https://openalex.org/C119599485","wikidata":"https://www.wikidata.org/wiki/Q43035","display_name":"Electrical engineering","level":1,"score":0.0},{"id":"https://openalex.org/C165801399","wikidata":"https://www.wikidata.org/wiki/Q25428","display_name":"Voltage","level":2,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/isi53945.2021.9624765","is_oa":false,"landing_page_url":"https://doi.org/10.1109/isi53945.2021.9624765","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2021 IEEE International Conference on Intelligence and Security Informatics (ISI)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Industry, innovation and infrastructure","score":0.550000011920929,"id":"https://metadata.un.org/sdg/9"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":30,"referenced_works":["https://openalex.org/W1168850612","https://openalex.org/W2013676140","https://openalex.org/W2554587771","https://openalex.org/W2556888587","https://openalex.org/W2587739609","https://openalex.org/W2787803546","https://openalex.org/W2889903231","https://openalex.org/W2896457183","https://openalex.org/W2917807496","https://openalex.org/W2952478253","https://openalex.org/W2955328590","https://openalex.org/W2970047711","https://openalex.org/W2978017171","https://openalex.org/W2990874597","https://openalex.org/W3011574394","https://openalex.org/W3016147497","https://openalex.org/W3035332461","https://openalex.org/W3035390927","https://openalex.org/W3086302916","https://openalex.org/W3088409176","https://openalex.org/W3106373739","https://openalex.org/W3109078334","https://openalex.org/W3113358185","https://openalex.org/W6729752019","https://openalex.org/W6734688555","https://openalex.org/W6747981893","https://openalex.org/W6755207826","https://openalex.org/W6768851824","https://openalex.org/W6770691796","https://openalex.org/W6786188428"],"related_works":["https://openalex.org/W2769501189","https://openalex.org/W4315588616","https://openalex.org/W4312773271","https://openalex.org/W2888805565","https://openalex.org/W2962686197","https://openalex.org/W3005513013","https://openalex.org/W2207653751","https://openalex.org/W2611137333","https://openalex.org/W4389543811","https://openalex.org/W4291700620"],"abstract_inverted_index":{"Malicious":[0],"cyber":[1],"activities":[2],"impose":[3],"substantial":[4],"costs":[5],"on":[6,148,174,203,222],"the":[7,111,116,128,133,136,140,159,163],"U.S.":[8],"economy":[9],"and":[10,46,62,143,166,196,199],"global":[11],"markets.":[12],"Cyber-criminals":[13],"often":[14],"use":[15],"information-sharing":[16],"social":[17],"media":[18],"platforms":[19],"such":[20],"as":[21],"paste":[22,153,176,204],"sites":[23,51],"(e.g.,":[24,146],"Pastebin)":[25],"to":[26,35,84,98,189,217],"share":[27],"vast":[28],"amounts":[29],"of":[30,135,172,179],"plain":[31],"text":[32],"content":[33,187],"related":[34],"Personally":[36],"Identifiable":[37],"Information":[38],"(PII),":[39],"credit":[40],"card":[41],"numbers,":[42],"exploit":[43],"code,":[44,194],"malware,":[45],"other":[47],"sensitive":[48],"content.":[49],"Paste":[50],"can":[52],"provide":[53],"targeted":[54],"Cyber":[55],"Threat":[56],"Intelligence":[57],"(CTI)":[58],"about":[59],"potential":[60,220],"threats":[61],"prior":[63],"breaches.":[64],"In":[65],"this":[66,210],"research,":[67],"we":[68],"propose":[69],"a":[70,93,107,120],"novel":[71],"Bidirectional":[72],"Encoder":[73],"Representation":[74],"from":[75,151],"Transformers":[76],"(BERT)":[77],"with":[78,119],"Latent":[79],"Dirichlet":[80],"Allocation":[81],"(LDA)":[82],"model":[83,91],"categorize":[85],"pastes":[86,150],"automatically.":[87],"Our":[88],"proposed":[89,137,160],"BERT-LDA":[90,109,138,144,161,168,181],"leverages":[92],"neural":[94],"network":[95,195],"transformer":[96],"architecture":[97],"capture":[99],"sequential":[100],"dependencies":[101],"when":[102],"representing":[103],"each":[104,167,175],"sentence":[105],"in":[106,115,170],"paste.":[108],"replaces":[110],"Bag-of-Words":[112],"(BoW)":[113],"approach":[114],"conventional":[117,141],"LDA":[118,142,165],"Bag-of-Labels":[121],"(BoL)":[122],"that":[123,158,185],"encompasses":[124],"class":[125],"labels":[126],"at":[127],"sequence":[129],"level.":[130],"We":[131],"compared":[132],"performance":[134],"against":[139],"variants":[145],"GPT2-LDA)":[147],"4,254,453":[149],"three":[152],"sites.":[154,205],"Experiment":[155],"results":[156],"indicate":[157],"outperformed":[162],"standard":[164],"variant":[169],"terms":[171],"perplexity":[173],"site.":[177],"Results":[178],"our":[180],"case":[182],"study":[183,211],"suggest":[184],"significant":[186],"relating":[188],"hacker":[190],"community":[191],"activities,":[192],"malicious":[193],"website":[197],"vulnerabilities,":[198],"PII":[200],"are":[201],"shared":[202],"The":[206],"insights":[207],"provided":[208],"by":[209,215],"could":[212],"be":[213],"used":[214],"organizations":[216],"proactively":[218],"mitigate":[219],"damage":[221],"their":[223],"infrastructure.":[224]},"counts_by_year":[{"year":2025,"cited_by_count":5},{"year":2024,"cited_by_count":4},{"year":2023,"cited_by_count":1},{"year":2022,"cited_by_count":1},{"year":2021,"cited_by_count":1}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
