# MiniGPT / datasetgen2.py
# Author: CreatedNull
# Uploaded via huggingface_hub (commit 79eec1d, verified)
# datasetgen.py
import json
import random
from faker import Faker
from tqdm import tqdm
import os
fake = Faker()
OUTPUT_PATH = "data/filtered_data.jsonl"
#os.makedirs("datasets", exist_ok=True)
def generate_example():
    """Generate one synthetic QA pair formatted for MiniGPT training.

    A random question template is filled with a random concept or action,
    and the "answer" is a fake paragraph (a stand-in for real completions).

    Returns:
        dict: {"text": "^User: <question>\\nMiniGPT: <answer> <END>"}
    """
    q_templates = [
        "What is {}?",
        "How do you {}?",
        "Why is {} important?",
        "Give me an example of {}.",
        "Explain {} in simple terms.",
        "Compare {} and {}.",
        "What happens if {}?",
        "Can you summarize {}?"
    ]
    concepts = [
        "machine learning", "quantum physics", "natural selection",
        "photosynthesis", "neural networks", "global warming",
        "black holes", "economic inflation", "probability", "blockchain"
    ]
    actions = [
        "train a neural network", "reduce carbon emissions", "make bread",
        "calculate probability", "grow tomatoes", "optimize code",
        "write a resume", "design a logo", "encrypt data", "learn Python"
    ]
    concept = random.choice(concepts)
    action = random.choice(actions)
    template = random.choice(q_templates)
    # Every template contains at least one "{}", so counting is sufficient
    # (the original redundantly also checked `'{}' in template`).
    if template.count('{}') == 1:
        question = template.format(random.choice([concept, action]))
    else:
        # Two-placeholder case ("Compare {} and {}."): pick a *different*
        # second concept — the original could compare a concept to itself.
        other = random.choice([c for c in concepts if c != concept])
        question = template.format(concept, other)
    # Simulate an answer (in real GPT training you'd use real completions).
    answer = fake.paragraph(nb_sentences=4)
    return {
        "text": "^User: " + question + "\nMiniGPT: " + answer + " <END>",
    }
def generate_dataset(n=5000):
    """Write *n* generated QA examples to OUTPUT_PATH as JSON Lines.

    Args:
        n: Number of examples to generate (default 5000).
    """
    # Bug fix: OUTPUT_PATH lives under "data/", but the directory was never
    # created (the only makedirs call at the top of the file is commented out
    # and targets "datasets" anyway), so open() raised FileNotFoundError on a
    # fresh checkout. Create the parent directory here, idempotently.
    out_dir = os.path.dirname(OUTPUT_PATH)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
        for _ in tqdm(range(n), desc="Generating Examples"):
            example = generate_example()
            f.write(json.dumps(example, ensure_ascii=False) + "\n")
    print(f"\n✅ Dataset saved to: {OUTPUT_PATH}")
if __name__ == "__main__":
    # Script entry point: build the default-sized (5000-example) dataset.
    generate_dataset(n=5000)