# MiniGPT / customgen.py
# Uploaded by CreatedNull via huggingface_hub (commit 79eec1d, verified)
import json
import os
import random

from tqdm import tqdm
from transformers import AutoTokenizer
# CONFIG
tokenizer = AutoTokenizer.from_pretrained("gpt2")  # GPT-2 BPE tokenizer, used only to filter samples by length
MAX_TOKENS = 27  # discard any "Q: ... A: ..." sample longer than this many GPT-2 tokens
NUM_SAMPLES = 50000  # total number of Q&A records to generate
SAVE_PATH = "./customgens/mini_qna_dataset.jsonl"  # output path, one JSON record per line
# Extended Templates with Paraphrasing
# Each entry is a (question_template, answer_template) pair. The {placeholder}
# slots are filled by fill_template() with random picks from a topic's word
# bank in DATA; the same pick is substituted into both question and answer.
TEMPLATES = [
    # WHY
    ("Why do {subject} {action}?", "Because {reason}."),
    ("What makes {subject} {action}?", "It's because {reason}."),
    ("Explain why {subject} {action}.", "{reason} is the reason."),
    # WHAT IS
    ("What is {thing}?", "{thing} is {definition}."),
    ("Define {thing}.", "{thing} refers to {definition}."),
    ("Can you tell me what {thing} means?", "Sure! It's {definition}."),
    # HOW
    ("How does {thing} work?", "It works by {mechanism}."),
    ("What's the mechanism behind {thing}?", "It involves {mechanism}."),
    ("Explain how {thing} functions.", "{mechanism} is how it works."),
    # WHEN / CONDITION
    ("What happens when {condition}?", "{result}."),
    ("Describe what occurs if {condition}.", "Usually, {result}."),
    ("When {condition}, what takes place?", "The result is {result}."),
    # IMPORTANCE
    ("Why is {thing} important?", "Because {importance}."),
    ("What makes {thing} important?", "{importance} is why."),
    ("Is {thing} important? Why?", "Yes, because {importance}."),
]
# Knowledge Bank
# topic -> category -> candidate phrases. Not every topic defines every
# category; fill_template() falls back to generic defaults for missing ones.
DATA = {
    "animals": {
        "subjects": ["cats", "dogs", "birds", "fish"],
        "actions": ["sleep a lot", "bark", "fly", "swim"],
        "reasons": [
            "they conserve energy",
            "they are nocturnal",
            "it's in their nature",
            "they communicate that way"
        ]
    },
    "science": {
        "things": ["gravity", "photosynthesis", "a star", "an atom"],
        "definitions": [
            "a force that pulls objects together",
            "the process plants use to make food",
            "a burning ball of gas",
            "the smallest unit of matter"
        ],
        "mechanisms": [
            "converting sunlight into energy",
            "attracting objects with mass",
            "splitting light into colors",
            "colliding particles"
        ],
        "conditions": ["you heat ice", "a star dies"],
        "results": ["it melts", "it becomes a black hole"],
        "importance": [
            "it keeps us on Earth",
            "it enables life on Earth"
        ]
    },
    "food": {
        "things": ["a waffle", "chocolate", "rice", "milk"],
        "definitions": [
            "a sweet, crispy batter cake",
            "a sweet made from cocoa",
            "a grain eaten daily in Asia",
            "a white liquid from cows"
        ],
        "importance": [
            "it provides energy",
            "it’s part of daily nutrition"
        ]
    }
}
# Per-topic acceptance counters; sample_topic() stops offering a topic once
# it reaches its quota.
TOPIC_COUNT = {k: 0 for k in DATA}
# Even split of the sample budget across topics (integer division, so the
# final total may fall slightly short of NUM_SAMPLES).
MAX_PER_TOPIC = NUM_SAMPLES // len(DATA)
def sample_topic():
    """Return a random topic still under its quota, or None when all are full."""
    remaining = [name for name in DATA if TOPIC_COUNT[name] < MAX_PER_TOPIC]
    if not remaining:
        return None
    return random.choice(remaining)
def fill_template(template_pair, topic_data):
    """Fill every {placeholder} in a (question, answer) template pair.

    One value is drawn per placeholder and substituted into both the question
    and the answer, so the two sides always agree (the same {thing} appears
    in both). Categories absent from topic_data fall back to generic text.
    Returns a (question, answer) tuple, stripped of surrounding whitespace.
    """
    question, answer = template_pair
    # (placeholder, candidate pool) — kept in the original order so seeded
    # runs draw the same random sequence.
    slots = [
        ("{subject}", topic_data.get("subjects", topic_data.get("things", ["something"]))),
        ("{action}", topic_data.get("actions", ["do things"])),
        ("{reason}", topic_data.get("reasons", ["that’s how they survive"])),
        ("{thing}", topic_data.get("things", ["a thing"])),
        ("{definition}", topic_data.get("definitions", ["an object used every day"])),
        ("{mechanism}", topic_data.get("mechanisms", ["processing energy"])),
        ("{condition}", topic_data.get("conditions", ["a change occurs"])),
        ("{result}", topic_data.get("results", ["it transforms"])),
        ("{importance}", topic_data.get("importance", ["it is vital to survival"])),
    ]
    for placeholder, pool in slots:
        value = random.choice(pool)
        question = question.replace(placeholder, value)
        answer = answer.replace(placeholder, value)
    return question.strip(), answer.strip()
def maybe_add_noise(q, a):
rand = random.random()
if rand < 0.05:
a = "I'm not sure."
elif rand < 0.10:
q += " Just wondering."
a = "Well, " + a
return q, a
def token_count(text):
    """Number of GPT-2 tokens in *text* (used for the MAX_TOKENS length filter)."""
    token_ids = tokenizer.encode(text)
    return len(token_ids)
def main():
    """Generate NUM_SAMPLES templated Q&A pairs and write them as JSONL.

    Topics are drawn in a balanced way (at most MAX_PER_TOPIC accepted samples
    per topic) and any pair whose combined text exceeds MAX_TOKENS GPT-2
    tokens is discarded and re-drawn. Writes one JSON object per line to
    SAVE_PATH with keys "question", "answer", and "text".
    """
    # Fix: the output directory may not exist yet; open(..., "w") would raise
    # FileNotFoundError on a fresh checkout.
    os.makedirs(os.path.dirname(SAVE_PATH) or ".", exist_ok=True)
    with open(SAVE_PATH, "w", encoding="utf-8") as f:
        total = 0
        pbar = tqdm(total=NUM_SAMPLES)
        while total < NUM_SAMPLES:
            topic = sample_topic()
            if topic is None:
                # Every topic has hit its quota; stop early.
                break
            template = random.choice(TEMPLATES)
            question, answer = fill_template(template, DATA[topic])
            question, answer = maybe_add_noise(question, answer)
            combined = f"Q: {question} A: {answer}"
            # Length filter: rejected samples are simply re-drawn on the
            # next loop iteration without touching any counters.
            if token_count(combined) <= MAX_TOKENS:
                record = {
                    "question": question,
                    "answer": answer,
                    "text": combined
                }
                f.write(json.dumps(record, ensure_ascii=False) + "\n")
                total += 1
                TOPIC_COUNT[topic] += 1
                pbar.update(1)
                if total % 5000 == 0:
                    # pbar.write keeps periodic sample logs from mangling
                    # the progress bar (plain print interleaves with it).
                    pbar.write(f"\n[Sample {total}]")
                    pbar.write(f"Q: {question}")
                    pbar.write(f"A: {answer}")
                    pbar.write(f"Tokens: {token_count(combined)}")
        pbar.close()
    print(f"\n✅ Saved {total} samples to {SAVE_PATH}")


if __name__ == "__main__":
    main()