# MiniGPT / datasetgen.py
# CreatedNull's picture
# Upload folder using huggingface_hub
# 79eec1d verified
import json
import os
import re

from datasets import load_dataset
from tqdm import tqdm

from filter import filterdata # Custom filtering logic
# Load the train split of the OpenAssistant oasst1 message dataset.
print("πŸ“¦ Loading dataset (110k samples)...")
ds = load_dataset("OpenAssistant/oasst1", split="train")

# Roles treated as the human side of an exchange. oasst1 labels human turns
# "prompter"; "user" is still accepted so data using that label keeps working.
_PROMPT_ROLES = {"prompter", "user"}

# Index every message by its id in a single pass so each parent lookup below
# is a dict access instead of a fresh scan over the whole dataset.
# setdefault keeps the first message seen for an id, matching a linear search.
messages_by_id = {}
for message in ds:
    messages_by_id.setdefault(message["message_id"], message)

# Each pair is one assistant reply ("output") with the text of the message it
# directly answers ("input").
convo = []
print("βš™οΈ Processing dataset into Q&A pairs...")
for entry in tqdm(ds, unit='samples'):
    # Only non-empty assistant replies that point at a parent can form a pair.
    if entry.get("role") != "assistant":
        continue
    if not entry.get("text") or not entry.get("parent_id"):
        continue

    parent = messages_by_id.get(entry["parent_id"])
    if parent and parent.get("role") in _PROMPT_ROLES:
        convo.append({
            "input": parent["text"],
            "output": entry["text"],
        })

print(f"βœ… Got {len(convo)} usable Q&A pairs.")
# Save the unfiltered pairs as JSON Lines: one JSON object per line.
unfiltered_path = "./data/unfiltered_data.jsonl"
# Opening a file for writing does not create missing parent directories, so
# make sure the output directory exists before writing.
os.makedirs(os.path.dirname(unfiltered_path), exist_ok=True)
with open(unfiltered_path, "w", encoding="utf-8") as f:
    # ensure_ascii=False keeps non-ASCII text readable in the UTF-8 output.
    f.writelines(
        json.dumps(pair, ensure_ascii=False) + "\n" for pair in convo
    )
print(f"πŸ“ Saved unfiltered data to {unfiltered_path}")

# Hand the in-memory pairs to the project's filtering step; any return value
# is ignored here.
print("🚿 Starting filtering...")
filterdata(convo)