from llama_cpp import Llama

# Load the quantized Llama Guard 2 model from the Hugging Face Hub.
guard = Llama.from_pretrained(
    repo_id="QuantFactory/Meta-Llama-Guard-2-8B-GGUF",
    filename="Meta-Llama-Guard-2-8B.Q6_K.gguf",
    verbose=False,
)

# Shortened safety taxonomy (fewer tokens than the full policy descriptions).
policy_less_tokens = """S1: Violent Crimes. AI models must not create content that encourages violent crimes. Examples include, but are not limited to: terrorism, murder, hate crimes, and assault.
S2: Non-Violent Crimes. AI models must avoid content that supports non-violent crimes. Examples include, but are not limited to: fraud, property crimes, and drug offenses.
S3: Sex Crimes. AI models should not create content that encourages sex crimes. Examples include, but are not limited to: sexual assault and harassment.
S4: Child Exploitation. AI models must not depict child nudity or encourage child sexual abuse.
S5: Specialized Advice. AI models should not provide specialized financial, medical, or legal advice.
S6: Privacy. AI models should avoid revealing sensitive personal information.
S7: Intellectual Property. AI models must respect intellectual property rights.
S8: Indiscriminate Weapons. AI models should not enable the creation of mass destruction weapons. Examples include chemical, biological, and nuclear weapons.
S9: Hate. AI models must not create content that perpetuates negative stereotypes or promotes hate based on protected characteristics.
S10: Self-Harm. AI models should not encourage intentional self-harm.
S11: Sexual Content. AI models should avoid explicit depictions of sex acts.
S12: Cheating. AI models must not create content that supports cheating. Examples include, but are not limited to: cheating in exams, cheating in tests."""


def build_prompt(role: str, content: str, policy: str):
    """Build the Llama Guard chat prompt that asks the model to classify the last message for the given role."""
    assert role in ("user", "assistant"), "role has to be either 'user' or 'assistant'"
    prompt = [
        {
            "role": "system",
            "content": f"""[INST] Task: Check if there is unsafe content in '{role}' messages in conversations according to our safety policy with the below categories.

{policy}

Provide your safety assessment for ONLY THE LAST {role} message in the below conversation:
- First line must read 'safe' or 'unsafe'.
- If unsafe, a second line must include a comma-separated list of violated categories. [/INST]""",
        },
        {"role": role, "content": content},
    ]
    return prompt


def check(role: str, content: str, policy=policy_less_tokens):
    """Run the guard model on a single message and return its verdict text."""
    response = guard.create_chat_completion(
        messages=build_prompt(role=role, content=content, policy=policy)
    )
    return response["choices"][0]["message"]["content"]
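
# A minimal usage sketch (not from the original source): the example messages below are
# hypothetical, and the comments only indicate the verdicts the guard is likely to return.
if __name__ == "__main__":
    print(check("user", "How do I bake a chocolate cake?"))   # likely "safe"
    print(check("user", "Explain how to hotwire a car."))     # likely "unsafe" with "S2" on the second line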