File size: 1,979 Bytes
79eec1d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 |
# datasetgen.py
import json
import random
from faker import Faker
from tqdm import tqdm
import os
fake = Faker()
OUTPUT_PATH = "data/filtered_data.jsonl"
#os.makedirs("datasets", exist_ok=True)
def generate_example():
    """Generate one synthetic QA pair in the MiniGPT chat format.

    Returns:
        dict: ``{"text": "^User: <question>\nMiniGPT: <answer> <END>"}``
        where the answer is faker lorem text standing in for a real
        completion.
    """
    q_templates = [
        "What is {}?",
        "How do you {}?",
        "Why is {} important?",
        "Give me an example of {}.",
        "Explain {} in simple terms.",
        "Compare {} and {}.",
        "What happens if {}?",
        "Can you summarize {}?"
    ]
    concepts = [
        "machine learning", "quantum physics", "natural selection",
        "photosynthesis", "neural networks", "global warming",
        "black holes", "economic inflation", "probability", "blockchain"
    ]
    actions = [
        "train a neural network", "reduce carbon emissions", "make bread",
        "calculate probability", "grow tomatoes", "optimize code",
        "write a resume", "design a logo", "encrypt data", "learn Python"
    ]
    concept = random.choice(concepts)
    action = random.choice(actions)
    template = random.choice(q_templates)
    # count('{}') == 1 already implies '{}' is present, so a separate
    # membership test is redundant.
    if template.count('{}') == 1:
        # Single-slot templates accept either a concept or an action.
        question = template.format(random.choice([concept, action]))
    else:
        # Two-slot template ("Compare {} and {}."): sample two DISTINCT
        # concepts so we never emit "Compare X and X."
        question = template.format(*random.sample(concepts, 2))
    # Simulate an answer (in real GPT training you'd use real completions)
    answer = fake.paragraph(nb_sentences=4)
    return {
        "text": "^User: " + question + "\nMiniGPT: " + answer + " <END>",
    }
def generate_dataset(n=5000):
    """Generate *n* examples and write them to OUTPUT_PATH as JSON Lines.

    Args:
        n (int): number of examples to generate. Defaults to 5000.
    """
    # Create the output directory if needed; without this a fresh checkout
    # raises FileNotFoundError because "data/" does not exist (the makedirs
    # call at the top of the file is commented out and names "datasets/").
    out_dir = os.path.dirname(OUTPUT_PATH)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
        for _ in tqdm(range(n), desc="Generating Examples"):
            example = generate_example()
            # One JSON object per line; ensure_ascii=False keeps text readable.
            f.write(json.dumps(example, ensure_ascii=False) + "\n")
    print(f"\n✅ Dataset saved to: {OUTPUT_PATH}")
if __name__ == "__main__":
    # Script entry point: build the default 5,000-example dataset.
    generate_dataset(n=5000)
|