File size: 1,366 Bytes
4de3b20
 
 
 
79eec1d
4de3b20
79eec1d
 
 
4de3b20
 
 
79eec1d
 
 
 
 
 
 
 
 
4de3b20
79eec1d
 
 
 
 
 
 
4de3b20
79eec1d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
import json
import os
import re

from datasets import load_dataset
from tqdm import tqdm

from filter import filterdata  # Custom filtering logic

# Load the OpenAssistant Conversations dataset (oasst1) train split.
# NOTE: the old comment said "110k samples from OpenWebText" — the code has
# always loaded oasst1; the comment was simply wrong.
print("📦 Loading dataset (oasst1, train split)...")
ds = load_dataset("OpenAssistant/oasst1", split="train")

convo = []

print("⚙️ Processing dataset into Q&A pairs...")

# Index every message by its id in ONE pass. The previous code did
#   next(x for x in ds if x["message_id"] == entry["parent_id"])
# inside the loop — a full dataset scan per assistant message, i.e. O(n^2)
# over ~85k rows. A dict lookup makes the whole pass O(n).
messages_by_id = {row["message_id"]: row for row in ds}

# Pair each assistant reply with its direct user parent -> {"input", "output"}.
for entry in tqdm(ds, unit='samples'):
    if entry.get("role") == "assistant" and entry.get("text") and entry.get("parent_id"):
        parent = messages_by_id.get(entry["parent_id"])
        if parent and parent.get("role") == "user":
            convo.append({
                "input": parent["text"],
                "output": entry["text"]
            })

print(f"✅ Got {len(convo)} usable Q&A pairs.")

# Save unfiltered data as JSON Lines (one object per line).
unfiltered_path = "./data/unfiltered_data.jsonl"
# Create the output directory if missing — open(..., "w") does not create
# parent directories and would raise FileNotFoundError otherwise.
os.makedirs(os.path.dirname(unfiltered_path), exist_ok=True)
with open(unfiltered_path, "w", encoding="utf-8") as f:
    for line in convo:
        # ensure_ascii=False keeps non-ASCII text readable in the file.
        f.write(json.dumps(line, ensure_ascii=False) + "\n")

print(f"📝 Saved unfiltered data to {unfiltered_path}")

# Run the project's custom filtering over the extracted pairs.
print("🚿 Starting filtering...")
filterdata(convo)