import os
import json
import random
from collections import defaultdict
# smart_open transparently streams both local paths and URLs; `open` replaces
# the deprecated `smart_open.smart_open` helper.
from smart_open import open as smart_open

# URL to the dataset we're using.
dataset_url = "https://huggingface.co/datasets/jondurbin/airoboros-2.1/resolve/main/instructions.jsonl"
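# Each line of the dataset is one JSON record; the fields this script relies on
# look like the sketch below (shape inferred from the code that follows; other
# fields may be present):
#   {"category": "coding", "system": "A chat.", "instruction": "..."}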

# Select the subset of data for each of our experts.
experts = {
  "qa": [
    "quiz",
    "multiple_choice",
    "contextual",
    "counterfactual_contextual"
  ],
  "creative": [
    "card",
    "writing",
    "experience",
    "song",
    "roleplay",
    "gtkm",
    "rp",
    "detailed_writing",
    "joke"
  ],
  "code": [
    "coding"
  ],
  "reasoning": [
    "cot",
    "theory_of_mind",
    "riddle",
    "orca"
  ],
  "function": [
    "agent",
    "plan"
  ],
  "general": [
    "wordgame",
    "trivia",
    "general"
  ]
}

# Bucket every training example by its category.
categories = defaultdict(list)
with smart_open(dataset_url, "r") as infile:
    for line in infile:
        item = json.loads(line)
        if not item.get("category"):
            continue
        categories[item["category"]].append(item)
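
# Optional sanity check (not part of the original flow): warn about any
# expected category that never appears in the data, so a typo in the `experts`
# mapping doesn't silently produce an empty split.
expected = {cat for cats in experts.values() for cat in cats}
missing = expected - set(categories)
if missing:
    print(f"Warning: no examples for categories: {sorted(missing)}")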

# Draw a random ~15% sample from each expert's categories; these samples are
# mixed into every other expert's training data below, so no expert loses
# general coverage entirely. Empty categories are skipped (random.sample would
# otherwise raise ValueError).
samples = {}
for expert, expert_cats in experts.items():
    samples[expert] = []
    for category in expert_cats:
        items = categories[category]
        if not items:
            continue
        samples[expert] += random.sample(items, max(1, int(len(items) * 0.15)))

# Save the split datasets.
os.makedirs("training_data", exist_ok=True)
os.makedirs("routing_data", exist_ok=True)
for expert, expert_cats in experts.items():
    with open(f"training_data/expert_{expert}.jsonl", "w") as outfile:
        # Include the stylized responses in every expert's data so each expert
        # adapts well to custom system prompts.
        for category in expert_cats + ["stylized_response"]:
            for item in categories[category]:
                outfile.write(json.dumps(item) + "\n")
        # Mix in the cross-expert samples drawn above.
        for other, other_items in samples.items():
            if other == expert:
                continue
            for item in other_items:
                outfile.write(json.dumps(item) + "\n")
    # Routing data: one record per example, combining the system prompt
    # (defaulting to "A chat." when absent or empty) with the instruction.
    with open(f"routing_data/expert_{expert}.jsonl", "w") as outfile:
        for category in expert_cats:
            for item in categories[category]:
                system = item.get("system") or "A chat."
                outfile.write(json.dumps({"instruction": f"{system} {item['instruction']}"}) + "\n")
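
# Optional sanity check (a minimal sketch, not part of the split logic itself):
# count the examples written to each file to eyeball the per-expert balance.
for expert in experts:
    for split in ("training_data", "routing_data"):
        path = f"{split}/expert_{expert}.jsonl"
        with open(path) as f:
            print(f"{path}: {sum(1 for _ in f)} examples")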