"""Build user->assistant Q&A pairs from OpenAssistant/oasst1 and filter them.

Loads the oasst1 train split, pairs each assistant reply with the user
message it answers, saves the raw pairs to JSONL, then runs the project's
custom filtering over them.
"""
from datasets import load_dataset
import json
import re
from tqdm import tqdm
from filter import filterdata  # Custom filtering logic

# Load the OpenAssistant/oasst1 train split.
# NOTE(review): the original comment claimed "110k samples from OpenWebText",
# which did not match the dataset actually loaded here.
print("Loading dataset (OpenAssistant/oasst1, train split)...")
ds = load_dataset("OpenAssistant/oasst1", split="train")

# Index messages by id once so each parent lookup is O(1); the original
# re-scanned the entire dataset for every assistant message (O(n^2) overall).
messages_by_id = {m["message_id"]: m for m in ds}

convo = []
print("Processing dataset into Q&A pairs...")
for entry in tqdm(ds, unit="samples"):
    # Keep only assistant replies that carry text and reference a parent.
    if entry.get("role") != "assistant" or not entry.get("text") or not entry.get("parent_id"):
        continue
    parent = messages_by_id.get(entry["parent_id"])
    # Pair the reply with its prompt only when the parent is a user turn.
    if parent and parent.get("role") == "user":
        convo.append({
            "input": parent["text"],
            "output": entry["text"]
        })

print(f"Got {len(convo)} usable Q&A pairs.")

# Persist the raw pairs before filtering so the unfiltered set is recoverable.
unfiltered_path = "./data/unfiltered_data.jsonl"
with open(unfiltered_path, "w", encoding="utf-8") as f:
    for pair in convo:
        f.write(json.dumps(pair, ensure_ascii=False) + "\n")
print(f"Saved unfiltered data to {unfiltered_path}")

# Run the project's custom filtering over the collected pairs.
print("Starting filtering...")
filterdata(convo)