File size: 1,366 Bytes
4de3b20
 
 
 
79eec1d
4de3b20
79eec1d
 
 
4de3b20
 
 
79eec1d
 
 
 
 
 
 
 
 
4de3b20
79eec1d
 
 
 
 
 
 
4de3b20
79eec1d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
import json
import os
import re

from datasets import load_dataset
from tqdm import tqdm

from filter import filterdata  # Custom filtering logic

# Load the OpenAssistant Conversations dataset (oasst1) train split.
# NOTE: the old comment said "110k samples from OpenWebText" — the code has
# always loaded oasst1; the comment was simply wrong.
print("📦 Loading dataset (oasst1, train split)...")
ds = load_dataset("OpenAssistant/oasst1", split="train")

convo = []

print("⚙️ Processing dataset into Q&A pairs...")

# Index every message by its id in ONE pass. The previous code did
#   next(x for x in ds if x["message_id"] == entry["parent_id"])
# inside the loop — a full dataset scan per assistant message, i.e. O(n^2)
# over ~85k rows. A dict lookup makes the whole pass O(n).
messages_by_id = {row["message_id"]: row for row in ds}

# Pair each assistant reply with its direct user parent -> {"input", "output"}.
for entry in tqdm(ds, unit='samples'):
    if entry.get("role") == "assistant" and entry.get("text") and entry.get("parent_id"):
        parent = messages_by_id.get(entry["parent_id"])
        if parent and parent.get("role") == "user":
            convo.append({
                "input": parent["text"],
                "output": entry["text"]
            })

print(f"✅ Got {len(convo)} usable Q&A pairs.")

# Save unfiltered data as JSON Lines (one object per line).
unfiltered_path = "./data/unfiltered_data.jsonl"
# Create the output directory if missing — open(..., "w") does not create
# parent directories and would raise FileNotFoundError otherwise.
os.makedirs(os.path.dirname(unfiltered_path), exist_ok=True)
with open(unfiltered_path, "w", encoding="utf-8") as f:
    for line in convo:
        # ensure_ascii=False keeps non-ASCII text readable in the file.
        f.write(json.dumps(line, ensure_ascii=False) + "\n")

print(f"📝 Saved unfiltered data to {unfiltered_path}")

# Run the project's custom filtering over the extracted pairs.
print("🚿 Starting filtering...")
filterdata(convo)