from datasets import load_dataset
import json
import re
from tqdm import tqdm
from filter import filterdata # Custom filtering logic
# Build instruction-tuning Q&A pairs from the OpenAssistant OASST1 dataset:
# pair each assistant reply with its parent user prompt, dump the raw pairs
# to JSONL, then hand them to the project's custom filter.
print("📦 Loading OASST1 dataset (train split)...")
ds = load_dataset("OpenAssistant/oasst1", split="train")

# Index every message by id ONCE so parent lookups below are O(1).
# (The original scanned the whole dataset per assistant message — O(n^2).)
messages_by_id = {entry["message_id"]: entry for entry in ds}

convo = []
print("⚙️ Processing dataset into Q&A pairs...")
for entry in tqdm(ds, unit='samples'):
    # Keep only assistant turns that have text and a parent to pair with.
    if entry.get("role") == "assistant" and entry.get("text") and entry.get("parent_id"):
        parent = messages_by_id.get(entry["parent_id"])
        # Only emit a pair when the parent is actually a user prompt.
        if parent and parent.get("role") == "user":
            convo.append({
                "input": parent["text"],
                "output": entry["text"],
            })

print(f"✅ Got {len(convo)} usable Q&A pairs.")

# Save unfiltered data as JSON Lines (one pair per line, UTF-8, emoji intact).
unfiltered_path = "./data/unfiltered_data.jsonl"
with open(unfiltered_path, "w", encoding="utf-8") as f:
    for line in convo:
        f.write(json.dumps(line, ensure_ascii=False) + "\n")
print(f"💾 Saved unfiltered data to {unfiltered_path}")

# Run the project's custom filtering pass over the collected pairs.
print("🌿 Starting filtering...")
filterdata(convo)