File size: 1,979 Bytes
79eec1d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 |
# datasetgen.py
import json
import random
from faker import Faker
from tqdm import tqdm
import os
fake = Faker()
OUTPUT_PATH = "data/filtered_data.jsonl"
#os.makedirs("datasets", exist_ok=True)
def generate_example():
    """Generate one synthetic QA pair in the MiniGPT chat format.

    Returns:
        dict: ``{"text": "^User: <question>\nMiniGPT: <answer> <END>"}``
        where the answer is faker lorem text standing in for a real
        completion.
    """
    q_templates = [
        "What is {}?",
        "How do you {}?",
        "Why is {} important?",
        "Give me an example of {}.",
        "Explain {} in simple terms.",
        "Compare {} and {}.",
        "What happens if {}?",
        "Can you summarize {}?"
    ]
    concepts = [
        "machine learning", "quantum physics", "natural selection",
        "photosynthesis", "neural networks", "global warming",
        "black holes", "economic inflation", "probability", "blockchain"
    ]
    actions = [
        "train a neural network", "reduce carbon emissions", "make bread",
        "calculate probability", "grow tomatoes", "optimize code",
        "write a resume", "design a logo", "encrypt data", "learn Python"
    ]
    concept = random.choice(concepts)
    action = random.choice(actions)
    template = random.choice(q_templates)
    # count('{}') == 1 already implies '{}' is present, so a separate
    # membership test is redundant.
    if template.count('{}') == 1:
        # Single-slot templates accept either a concept or an action.
        question = template.format(random.choice([concept, action]))
    else:
        # Two-slot template ("Compare {} and {}."): sample two DISTINCT
        # concepts so we never emit "Compare X and X."
        question = template.format(*random.sample(concepts, 2))
    # Simulate an answer (in real GPT training you'd use real completions)
    answer = fake.paragraph(nb_sentences=4)
    return {
        "text": "^User: " + question + "\nMiniGPT: " + answer + " <END>",
    }
def generate_dataset(n=5000):
    """Generate *n* examples and write them to OUTPUT_PATH as JSON Lines.

    Args:
        n (int): number of examples to generate. Defaults to 5000.
    """
    # Create the output directory if needed; without this a fresh checkout
    # raises FileNotFoundError because "data/" does not exist (the makedirs
    # call at the top of the file is commented out and names "datasets/").
    out_dir = os.path.dirname(OUTPUT_PATH)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
        for _ in tqdm(range(n), desc="Generating Examples"):
            example = generate_example()
            # One JSON object per line; ensure_ascii=False keeps text readable.
            f.write(json.dumps(example, ensure_ascii=False) + "\n")
    print(f"\n✅ Dataset saved to: {OUTPUT_PATH}")
if __name__ == "__main__":
    # Script entry point: build the default 5,000-example dataset.
    generate_dataset(n=5000)
|