# MiniGPT / datasetgen2.py
# Author: CreatedNull
# Uploaded via huggingface_hub (commit 79eec1d, verified)
# datasetgen.py
import json
import random
from faker import Faker
from tqdm import tqdm
import os
fake = Faker()
OUTPUT_PATH = "data/filtered_data.jsonl"
#os.makedirs("datasets", exist_ok=True)
def generate_example():
    """Generate one synthetic QA pair formatted for MiniGPT training.

    A random question template is filled with a random concept or action,
    and the "answer" is a fake paragraph (a stand-in for real completions).

    Returns:
        dict: {"text": "^User: <question>\\nMiniGPT: <answer> <END>"}
    """
    q_templates = [
        "What is {}?",
        "How do you {}?",
        "Why is {} important?",
        "Give me an example of {}.",
        "Explain {} in simple terms.",
        "Compare {} and {}.",
        "What happens if {}?",
        "Can you summarize {}?"
    ]
    concepts = [
        "machine learning", "quantum physics", "natural selection",
        "photosynthesis", "neural networks", "global warming",
        "black holes", "economic inflation", "probability", "blockchain"
    ]
    actions = [
        "train a neural network", "reduce carbon emissions", "make bread",
        "calculate probability", "grow tomatoes", "optimize code",
        "write a resume", "design a logo", "encrypt data", "learn Python"
    ]
    concept = random.choice(concepts)
    action = random.choice(actions)
    template = random.choice(q_templates)
    # Every template contains at least one "{}", so counting is sufficient
    # (the original redundantly also checked `'{}' in template`).
    if template.count('{}') == 1:
        question = template.format(random.choice([concept, action]))
    else:
        # Two-placeholder case ("Compare {} and {}."): pick a *different*
        # second concept — the original could compare a concept to itself.
        other = random.choice([c for c in concepts if c != concept])
        question = template.format(concept, other)
    # Simulate an answer (in real GPT training you'd use real completions).
    answer = fake.paragraph(nb_sentences=4)
    return {
        "text": "^User: " + question + "\nMiniGPT: " + answer + " <END>",
    }
def generate_dataset(n=5000):
    """Write *n* generated QA examples to OUTPUT_PATH as JSON Lines.

    Args:
        n: Number of examples to generate (default 5000).
    """
    # Bug fix: OUTPUT_PATH lives under "data/", but the directory was never
    # created (the only makedirs call at the top of the file is commented out
    # and targets "datasets" anyway), so open() raised FileNotFoundError on a
    # fresh checkout. Create the parent directory here, idempotently.
    out_dir = os.path.dirname(OUTPUT_PATH)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
        for _ in tqdm(range(n), desc="Generating Examples"):
            example = generate_example()
            f.write(json.dumps(example, ensure_ascii=False) + "\n")
    print(f"\n✅ Dataset saved to: {OUTPUT_PATH}")
if __name__ == "__main__":
    # Script entry point: build the default-sized (5000-example) dataset.
    generate_dataset(n=5000)