llmguard / data /prepare_dataset.py
Tuathe's picture
Deploy LLMGuard to Hugging Face Spaces
b4f16a5
raw
history blame contribute delete
565 Bytes
import pandas as pd
# Load .txt or .csv with prompt,label format
df = pd.read_csv("data/injection_prompts.txt", names=["prompt", "label"])
# Optional: strip quotes
df["prompt"] = df["prompt"].str.strip('"')
# Map labels to numeric
df["label"] = df["label"].map({"safe": 0, "unsafe": 1})
# Shuffle the dataset for good measure
df = df.sample(frac=1).reset_index(drop=True)
# Check stats
print(" Dataset Loaded")
print(df["label"].value_counts())
# Preview
print(df.head())
# Save to CSV (optional)
df.to_csv("data/cleaned_injection_prompts.csv", index=False)