{"id":"https://openalex.org/W4389713766","doi":"https://doi.org/10.48550/arxiv.2312.06674","title":"Llama Guard: LLM-based Input-Output Safeguard for Human-AI Conversations","display_name":"Llama Guard: LLM-based Input-Output Safeguard for Human-AI Conversations","publication_year":2023,"publication_date":"2023-12-07","ids":{"openalex":"https://openalex.org/W4389713766","doi":"https://doi.org/10.48550/arxiv.2312.06674"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2312.06674","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2312.06674","pdf_url":"https://arxiv.org/pdf/2312.06674","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2312.06674","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5071606441","display_name":"Hakan Inan","orcid":"https://orcid.org/0000-0003-3505-8838"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Inan, Hakan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5019445300","display_name":"Kartikeya Upasani","orcid":"https://orcid.org/0000-0001-8623-4243"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Upasani, Kartikeya","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5111524272","display_name":"Jianfeng Chi","orcid":"https://orcid.org/0009-0002-1138-5009"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chi, Jianfeng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5035660188","display_name":"Rashi Rungta","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Rungta, Rashi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5079492567","display_name":"Krithika Iyer","orcid":"https://orcid.org/0000-0003-2295-8618"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Iyer, Krithika","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5012370787","display_name":"Yuning Mao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Mao, Yuning","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5093487556","display_name":"Michael Tontchev","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tontchev, Michael","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101778507","display_name":"Qing Hu","orcid":"https://orcid.org/0000-0001-8067-6569"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hu, Qing","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5036265899","display_name":"Brian Fuller","orcid":"https://orcid.org/0000-0001-5865-2528"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Fuller, Brian","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5018125347","display_name":"Davide Testuggine","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Testuggine, Davide","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5054253075","display_name":"Madian Khabsa","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Khabsa, Madian","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":11,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":true,"cited_by_count":48,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12262","display_name":"Hate Speech and Cyberbullying Detection","score":0.9706000089645386,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12262","display_name":"Hate Speech and Cyberbullying Detection","score":0.9706000089645386,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12151","display_name":"Interpreting and Communication in Healthcare","score":0.9320999979972839,"subfield":{"id":"https://openalex.org/subfields/3600","display_name":"General Health Professions"},"field":{"id":"https://openalex.org/fields/36","display_name":"Health Professions"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7105655670166016},{"id":"https://openalex.org/keywords/conversation","display_name":"Conversation","score":0.6023306846618652},{"id":"https://openalex.org/keywords/guard","display_name":"Guard (computer science)","score":0.5048578381538391},{"id":"https://openalex.org/keywords/moderation","display_name":"Moderation","score":0.4568851590156555},{"id":"https://openalex.org/keywords/taxonomy","display_name":"Taxonomy (biology)","score":0.42304155230522156},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.3864237666130066},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.32288432121276855},{"id":"https://openalex.org/keywords/machine-learning","display_name":"Machine learning","score":0.23883318901062012},{"id":"https://openalex.org/keywords/psychology","display_name":"Psychology","score":0.08237966895103455}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7105655670166016},{"id":"https://openalex.org/C2777200299","wikidata":"https://www.wikidata.org/wiki/Q52943","display_name":"Conversation","level":2,"score":0.6023306846618652},{"id":"https://openalex.org/C141141315","wikidata":"https://www.wikidata.org/wiki/Q2379942","display_name":"Guard (computer science)","level":2,"score":0.5048578381538391},{"id":"https://openalex.org/C93225998","wikidata":"https://www.wikidata.org/wiki/Q1941972","display_name":"Moderation","level":2,"score":0.4568851590156555},{"id":"https://openalex.org/C58642233","wikidata":"https://www.wikidata.org/wiki/Q8269924","display_name":"Taxonomy (biology)","level":2,"score":0.42304155230522156},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3864237666130066},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.32288432121276855},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.23883318901062012},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.08237966895103455},{"id":"https://openalex.org/C46312422","wikidata":"https://www.wikidata.org/wiki/Q11024","display_name":"Communication","level":1,"score":0.0},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.0},{"id":"https://openalex.org/C59822182","wikidata":"https://www.wikidata.org/wiki/Q441","display_name":"Botany","level":1,"score":0.0},{"id":"https://openalex.org/C86803240","wikidata":"https://www.wikidata.org/wiki/Q420","display_name":"Biology","level":0,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2312.06674","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2312.06674","pdf_url":"https://arxiv.org/pdf/2312.06674","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},{"id":"doi:10.48550/arxiv.2312.06674","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2312.06674","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2312.06674","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2312.06674","pdf_url":"https://arxiv.org/pdf/2312.06674","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"sustainable_development_goals":[{"score":0.7099999785423279,"id":"https://metadata.un.org/sdg/16","display_name":"Peace, Justice and strong institutions"}],"awards":[],"funders":[],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4389713766.pdf","grobid_xml":"https://content.openalex.org/works/W4389713766.grobid-xml"},"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W1968552888","https://openalex.org/W2374116601","https://openalex.org/W3093134843","https://openalex.org/W2772323916","https://openalex.org/W1511346092","https://openalex.org/W1527532029","https://openalex.org/W2378167147","https://openalex.org/W3210777354","https://openalex.org/W2281307425","https://openalex.org/W2464405057"],"abstract_inverted_index":{"We":[0,192],"introduce":[1],"Llama":[2,81,125,146,195],"Guard,":[3,82],"an":[4],"LLM-based":[5],"input-output":[6],"safeguard":[7],"model":[8,16,85,197],"geared":[9],"towards":[10],"Human-AI":[11],"conversation":[12],"use":[13,178],"cases.":[14],"Our":[15],"incorporates":[17],"a":[18,22,27,55,76,83,129],"safety":[19,31],"risk":[20],"taxonomy,":[21],"valuable":[23],"tool":[24],"for":[25,149,218],"categorizing":[26],"specific":[28,177],"set":[29],"of":[30,66,78,119,145,152,157,171,215],"risks":[32],"found":[33],"in":[34,45,95],"LLM":[35],"prompts":[36],"(i.e.,":[37],"prompt":[38,68],"classification).":[39],"This":[40,160],"taxonomy":[41,172],"is":[42,87],"also":[43],"instrumental":[44],"classifying":[46],"the":[47,64,105,142,150,155,163,169,190,212,216],"responses":[48],"generated":[49],"by":[50],"LLMs":[51],"to":[52,59,174,204,210],"these":[53],"prompts,":[54],"process":[56],"we":[57,72,201],"refer":[58],"as":[60,104,128,167],"response":[61,70],"classification.":[62],"For":[63],"purpose":[65],"both":[67],"and":[69,110,136,154,180,200,207],"classification,":[71],"have":[73],"meticulously":[74],"gathered":[75],"dataset":[77,109],"high":[79],"quality.":[80],"Llama2-7b":[84],"that":[86,118],"instruction-tuned":[88],"on":[89,100],"our":[90],"collected":[91],"dataset,":[92],"albeit":[93],"low":[94],"volume,":[96],"demonstrates":[97],"strong":[98],"performance":[99,114],"existing":[101],"benchmarks":[102],"such":[103,166],"OpenAI":[106],"Moderation":[107],"Evaluation":[108],"ToxicChat,":[111],"where":[112],"its":[113],"matches":[115],"or":[116,183],"exceeds":[117],"currently":[120],"available":[121,199],"content":[122],"moderation":[123],"tools.":[124],"Guard":[126,147,196],"functions":[127],"language":[130],"model,":[131],"carrying":[132],"out":[133],"multi-class":[134],"classification":[135],"generating":[137],"binary":[138],"decision":[139],"scores.":[140],"Furthermore,":[141],"instruction":[143],"fine-tuning":[144],"allows":[148],"customization":[151],"tasks":[153],"adaptation":[156],"output":[158],"formats.":[159],"feature":[161],"enhances":[162],"model's":[164],"capabilities,":[165],"enabling":[168],"adjustment":[170],"categories":[173],"align":[175],"with":[176,186],"cases,":[179],"facilitating":[181],"zero-shot":[182],"few-shot":[184],"prompting":[185],"diverse":[187],"taxonomies":[188],"at":[189],"input.":[191],"are":[193],"making":[194],"weights":[198],"encourage":[202],"researchers":[203],"further":[205],"develop":[206],"adapt":[208],"them":[209],"meet":[211],"evolving":[213],"needs":[214],"community":[217],"AI":[219],"safety.":[220]},"counts_by_year":[{"year":2026,"cited_by_count":6},{"year":2025,"cited_by_count":28},{"year":2024,"cited_by_count":14}],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-10T00:00:00"}
