{"id":"https://openalex.org/W4403536772","doi":"https://doi.org/10.1145/3691620.3695018","title":"Efficient Detection of Toxic Prompts in Large Language Models","display_name":"Efficient Detection of Toxic Prompts in Large Language Models","publication_year":2024,"publication_date":"2024-10-18","ids":{"openalex":"https://openalex.org/W4403536772","doi":"https://doi.org/10.1145/3691620.3695018"},"language":"en","primary_location":{"id":"doi:10.1145/3691620.3695018","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3691620.3695018","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 39th IEEE/ACM International Conference on Automated Software Engineering","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://doi.org/10.1145/3691620.3695018","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100330541","display_name":"Yi Liu","orcid":"https://orcid.org/0000-0002-4978-127X"},"institutions":[{"id":"https://openalex.org/I172675005","display_name":"Nanyang Technological University","ror":"https://ror.org/02e7b5302","country_code":"SG","type":"education","lineage":["https://openalex.org/I172675005"]}],"countries":["SG"],"is_corresponding":true,"raw_author_name":"Yi Liu","raw_affiliation_strings":["Nanyang Technological University, Singapore, Singapore"],"affiliations":[{"raw_affiliation_string":"Nanyang Technological University, Singapore, Singapore","institution_ids":["https://openalex.org/I172675005"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5110699457","display_name":"J. Yu","orcid":null},"institutions":[{"id":"https://openalex.org/I30809798","display_name":"ShanghaiTech University","ror":"https://ror.org/030bhh786","country_code":"CN","type":"education","lineage":["https://openalex.org/I30809798"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Junzhe Yu","raw_affiliation_strings":["ShanghaiTech University, Shanghai, China"],"affiliations":[{"raw_affiliation_string":"ShanghaiTech University, Shanghai, China","institution_ids":["https://openalex.org/I30809798"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5095377076","display_name":"Huijia Sun","orcid":"https://orcid.org/0009-0000-8504-2733"},"institutions":[{"id":"https://openalex.org/I30809798","display_name":"ShanghaiTech University","ror":"https://ror.org/030bhh786","country_code":"CN","type":"education","lineage":["https://openalex.org/I30809798"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Huijia Sun","raw_affiliation_strings":["ShanghaiTech University, Shanghai, China"],"affiliations":[{"raw_affiliation_string":"ShanghaiTech University, Shanghai, China","institution_ids":["https://openalex.org/I30809798"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5085767410","display_name":"Ling Shi","orcid":"https://orcid.org/0000-0002-2023-0247"},"institutions":[{"id":"https://openalex.org/I172675005","display_name":"Nanyang Technological University","ror":"https://ror.org/02e7b5302","country_code":"SG","type":"education","lineage":["https://openalex.org/I172675005"]}],"countries":["SG"],"is_corresponding":false,"raw_author_name":"Ling Shi","raw_affiliation_strings":["Nanyang Technological University, Singapore, Singapore"],"affiliations":[{"raw_affiliation_string":"Nanyang Technological University, Singapore, Singapore","institution_ids":["https://openalex.org/I172675005"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5029243457","display_name":"Gelei Deng","orcid":"https://orcid.org/0000-0002-0046-6674"},"institutions":[{"id":"https://openalex.org/I172675005","display_name":"Nanyang Technological University","ror":"https://ror.org/02e7b5302","country_code":"SG","type":"education","lineage":["https://openalex.org/I172675005"]}],"countries":["SG"],"is_corresponding":false,"raw_author_name":"Gelei Deng","raw_affiliation_strings":["Nanyang Technological University, Singapore, Singapore"],"affiliations":[{"raw_affiliation_string":"Nanyang Technological University, Singapore, Singapore","institution_ids":["https://openalex.org/I172675005"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5026181425","display_name":"Yuqi Chen","orcid":"https://orcid.org/0000-0003-2988-6012"},"institutions":[{"id":"https://openalex.org/I30809798","display_name":"ShanghaiTech University","ror":"https://ror.org/030bhh786","country_code":"CN","type":"education","lineage":["https://openalex.org/I30809798"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yuqi Chen","raw_affiliation_strings":["ShanghaiTech University, Shanghai, China"],"affiliations":[{"raw_affiliation_string":"ShanghaiTech University, Shanghai, China","institution_ids":["https://openalex.org/I30809798"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100355692","display_name":"Yang Liu","orcid":"https://orcid.org/0000-0001-7300-9215"},"institutions":[{"id":"https://openalex.org/I172675005","display_name":"Nanyang Technological University","ror":"https://ror.org/02e7b5302","country_code":"SG","type":"education","lineage":["https://openalex.org/I172675005"]}],"countries":["SG"],"is_corresponding":false,"raw_author_name":"Yang Liu","raw_affiliation_strings":["Nanyang Technological University, Singapore, Singapore"],"affiliations":[{"raw_affiliation_string":"Nanyang Technological University, Singapore, Singapore","institution_ids":["https://openalex.org/I172675005"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5100330541"],"corresponding_institution_ids":["https://openalex.org/I172675005"],"apc_list":null,"apc_paid":null,"fwci":1.7376,"has_fulltext":false,"cited_by_count":5,"citation_normalized_percentile":{"value":0.87255456,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":96,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"455","last_page":"467"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.9868000149726868,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.9868000149726868,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10260","display_name":"Software Engineering Research","score":0.9855999946594238,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9848999977111816,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6992861032485962},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.4766809940338135},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.39078229665756226}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6992861032485962},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.4766809940338135},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.39078229665756226}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3691620.3695018","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3691620.3695018","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 39th IEEE/ACM International Conference on Automated Software Engineering","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/3691620.3695018","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3691620.3695018","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 39th IEEE/ACM International Conference on Automated Software Engineering","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":27,"referenced_works":["https://openalex.org/W2535617737","https://openalex.org/W2766540688","https://openalex.org/W2786672974","https://openalex.org/W4220964139","https://openalex.org/W4290943938","https://openalex.org/W4308391526","https://openalex.org/W4381586841","https://openalex.org/W4385452929","https://openalex.org/W4386794464","https://openalex.org/W4387355345","https://openalex.org/W4391591467","https://openalex.org/W4391632220","https://openalex.org/W4391833847","https://openalex.org/W4391940625","https://openalex.org/W4391988042","https://openalex.org/W4392677961","https://openalex.org/W4393146542","https://openalex.org/W4396914181","https://openalex.org/W4398794941","https://openalex.org/W4399205773","https://openalex.org/W4399795012","https://openalex.org/W4402670895","https://openalex.org/W4402683786","https://openalex.org/W4404781977","https://openalex.org/W6600234944","https://openalex.org/W6600424091","https://openalex.org/W6826255127"],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2899084033","https://openalex.org/W2748952813","https://openalex.org/W2390279801","https://openalex.org/W4391913857","https://openalex.org/W2358668433","https://openalex.org/W4396701345","https://openalex.org/W2376932109","https://openalex.org/W2001405890","https://openalex.org/W3204019825"],"abstract_inverted_index":{"Large":[0],"language":[1,12],"models":[2,26],"(LLMs)":[3],"like":[4],"ChatGPT":[5],"and":[6,20,67,79,114,134,146,178],"Gemini":[7],"have":[8],"significantly":[9],"advanced":[10],"natural":[11],"processing,":[13],"enabling":[14],"various":[15,127],"applications":[16],"such":[17],"as":[18],"chatbots":[19],"automated":[21],"content":[22],"generation.":[23],"However,":[24],"these":[25],"can":[27],"be":[28],"exploited":[29],"by":[30],"malicious":[31],"individuals":[32,44],"who":[33],"craft":[34],"toxic":[35,58,76,95,104,186],"prompts":[36,96],"to":[37,49,72,92,102,110],"elicit":[38],"harmful":[39],"or":[40],"unethical":[41],"responses.":[42],"These":[43],"often":[45],"employ":[46],"jailbreaking":[47],"techniques":[48],"bypass":[50],"safety":[51],"mechanisms,":[52],"highlighting":[53],"the":[54,73,130],"need":[55],"for":[56,121,170,185],"robust":[57],"prompt":[59,122,165,187],"detection":[60,63,188],"methods.":[61,156],"Existing":[62],"techniques,":[64],"both":[65],"blackbox":[66],"whitebox,":[68],"face":[69],"challenges":[70],"related":[71],"diversity":[74],"of":[75,129,144,152,161],"prompts,":[77,106],"scalability,":[78,179],"computational":[80],"efficiency.":[81],"In":[82],"response,":[83],"we":[84],"propose":[85],"ToxicDetector,":[86],"a":[87,116,141,147,182],"lightweight":[88],"greybox":[89],"method":[90,184],"designed":[91],"efficiently":[93],"detect":[94],"in":[97,189],"LLMs.":[98,190],"ToxicDetector":[99,139,173],"leverages":[100],"LLMs":[101],"create":[103],"concept":[105],"uses":[107],"embedding":[108],"vectors":[109],"form":[111],"feature":[112],"vectors,":[113],"employs":[115],"Multi-Layer":[117],"Perceptron":[118],"(MLP)":[119],"classifier":[120],"classification.":[123],"Our":[124],"evaluation":[125],"on":[126],"versions":[128],"LLama":[131],"models,":[132],"Gemma-2,":[133],"multiple":[135],"datasets":[136],"demonstrates":[137],"that":[138],"achieves":[140,174],"high":[142,175],"accuracy":[143],"96.39%":[145],"low":[148],"false":[149],"positive":[150],"rate":[151],"2.00%,":[153],"outperforming":[154],"state-of-the-art":[155],"Additionally,":[157],"ToxicDetector's":[158],"processing":[159],"time":[160],"0.0780":[162],"seconds":[163],"per":[164],"makes":[166],"it":[167,181],"highly":[168],"suitable":[169],"real-time":[171],"applications.":[172],"accuracy,":[176],"efficiency,":[177],"making":[180],"practical":[183]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":4}],"updated_date":"2026-03-27T05:58:40.876381","created_date":"2025-10-10T00:00:00"}
