|
|
|
import json
|
|
import random
|
|
from faker import Faker
|
|
from tqdm import tqdm
|
|
import os
|
|
|
|
# Shared Faker instance; generate_example() calls fake.paragraph() on it to
# produce the filler answer text.
fake = Faker()
|
|
|
|
# Destination file for the generated dataset; generate_dataset() writes one
# JSON object per line (JSON Lines) to this path.
OUTPUT_PATH = "data/filtered_data.jsonl"
|
|
|
|
|
|
def generate_example():
    """Generate a single synthetic GPT-like question/answer record.

    A question template is picked at random and filled with a phrase of the
    kind that fits it grammatically (a concept, an action, or two distinct
    concepts for the comparison template). The answer is a four-sentence
    paragraph produced by the module-level ``fake`` Faker instance.

    Returns:
        dict: A one-key mapping ``{"text": ...}`` whose value has the form
        ``"^User: <question>\\nMiniGPT: <answer> <END>"``.
    """
    # Each template is paired with the kind of phrase that fills its slot(s),
    # so that e.g. "How do you {}?" is never completed with a noun phrase.
    question_templates = [
        ("What is {}?", "concept"),
        ("How do you {}?", "action"),
        ("Why is {} important?", "concept"),
        ("Give me an example of {}.", "concept"),
        ("Explain {} in simple terms.", "concept"),
        ("Compare {} and {}.", "pair"),
        # Reads awkwardly with either kind; keep the original behaviour of
        # accepting a concept or an action with equal probability.
        ("What happens if {}?", "either"),
        ("Can you summarize {}?", "concept"),
    ]

    concepts = [
        "machine learning", "quantum physics", "natural selection",
        "photosynthesis", "neural networks", "global warming",
        "black holes", "economic inflation", "probability", "blockchain",
    ]

    actions = [
        "train a neural network", "reduce carbon emissions", "make bread",
        "calculate probability", "grow tomatoes", "optimize code",
        "write a resume", "design a logo", "encrypt data", "learn Python",
    ]

    template, slot = random.choice(question_templates)

    if slot == "pair":
        # random.sample draws without replacement, so a concept is never
        # compared with itself.
        first, second = random.sample(concepts, 2)
        question = template.format(first, second)
    elif slot == "action":
        question = template.format(random.choice(actions))
    elif slot == "concept":
        question = template.format(random.choice(concepts))
    else:
        # Both lists have the same length, so this is a 50/50 split between
        # a concept and an action.
        question = template.format(random.choice(concepts + actions))

    answer = fake.paragraph(nb_sentences=4)

    return {
        "text": f"^User: {question}\nMiniGPT: {answer} <END>",
    }
|
|
|
|
def generate_dataset(n=5000):
    """Generate ``n`` synthetic examples and write them to ``OUTPUT_PATH``.

    The output file is overwritten and contains one JSON object per line
    (JSON Lines), UTF-8 encoded with non-ASCII characters left unescaped.
    A progress bar is shown while generating, and a confirmation message is
    printed once the file has been written.

    Args:
        n: Number of examples to generate. Defaults to 5000.
    """
    # open(..., "w") does not create missing directories, so make sure the
    # parent of OUTPUT_PATH exists before writing.
    output_dir = os.path.dirname(OUTPUT_PATH)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
        for _ in tqdm(range(n), desc="Generating Examples"):
            example = generate_example()
            f.write(json.dumps(example, ensure_ascii=False) + "\n")

    print(f"\n✅ Dataset saved to: {OUTPUT_PATH}")
|
|
|
|
# Script entry point: build the default 5000-example dataset when this file
# is executed directly rather than imported.
if __name__ == "__main__":
    generate_dataset(n=5000)
|
|
|