# Source: forecast-extractor / dataset_builder.py
# Uploaded by philippotiger with huggingface_hub (commit e69482a, verified).
"""
Dataset Builder v3 — Football Prediction Extractor
- Always outputs JSON array (even single tip)
- 70% single-tip / 30% multi-tip (2-4 events)
- Noise: random emojis, typos, missing fields, varied separators
- Varied date formats, bookmakers, times, headers
- Pure stdlib — no pip installs needed
"""
import csv
import json
import random
from pathlib import Path
from collections import defaultdict
# ─────────────────────────────────────────────
# CONFIG
# ─────────────────────────────────────────────
# Input file passed to load_teams(); it is read for Country, League and Team columns.
TEAMS_CSV = "teams_tier1_tier2.csv"
# Output files written by build_dataset(), one JSON object per line.
OUTPUT_TRAIN = "train_dataset.jsonl"
OUTPUT_VAL = "val_dataset.jsonl"
# Total number of examples requested (70% single-tip, the rest multi-tip).
EXAMPLES_COUNT = 300
# Fraction of the shuffled examples that goes to OUTPUT_VAL.
VAL_SPLIT = 0.1
# ─────────────────────────────────────────────
# SYSTEM PROMPT — always array
# ─────────────────────────────────────────────
# Inserted verbatim as the "system" message of every training example
# (see make_training_example). The key list here matches the keys of the
# fixture dicts produced by random_fixture().
SYSTEM_PROMPT = (
"You are a football data extraction assistant. "
"Extract structured data from the message and return ONLY a valid JSON array. "
"Each object in the array must have exactly these keys: "
"league, team_1, team_2, prediction, date, odds. "
"If a field is missing, use null. No extra text, no markdown."
)
# ─────────────────────────────────────────────
# VOCABULARY
# ─────────────────────────────────────────────
# Bet labels; random_fixture() picks one uniformly, and the same string is
# used both in the rendered message and in the expected JSON output.
PREDICTIONS = [
"Over 1.5", "Over 2.5", "Over 3.5",
"Under 2.5", "Under 3.5",
"1X", "X2", "12",
"Home Win", "Away Win", "Draw",
"Both Teams to Score",
"Home Win or Draw",
"Away Win or Draw",
"GG", "NG",
]
# Month lookup tables, indexed by (month - 1).
_MONTH_ABBREVIATIONS = (
    "Jan", "Feb", "Mar", "Apr", "May", "Jun",
    "Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
)
_MONTH_NAMES = (
    "January", "February", "March", "April", "May", "June",
    "July", "August", "September", "October", "November", "December",
)


def _numeric_date(delimiter):
    """Build a formatter giving zero-padded day and month plus year, joined by *delimiter*."""
    pattern = delimiter.join(("{:02d}", "{:02d}", "{}"))
    return lambda d, m, y: pattern.format(d, m, y)


# Date renderers, each called as fmt(day, month, year) and returning a string:
#   DD/MM/YYYY, DD-MM-YYYY, DD.MM.YYYY, "Mon D, YYYY", "D Month YYYY".
DATE_FORMATS = [
    _numeric_date("/"),
    _numeric_date("-"),
    _numeric_date("."),
    lambda d, m, y: "{} {}, {}".format(_MONTH_ABBREVIATIONS[m - 1], d, y),
    lambda d, m, y: "{} {} {}".format(d, _MONTH_NAMES[m - 1], y),
]
# Kick-off times substituted for {time}; they appear only in the message text.
TIMES = ["13:00", "15:00", "16:00", "17:00", "18:00", "19:00", "19:45", "20:00", "20:45", "21:00", "21:45"]
# Bookmaker names substituted for {book}; they appear only in the message text.
BOOKS = ["BETANO", "Bet365", "William Hill", "Unibet", "1xBet", "Betway", "Bwin", "Pinnacle"]
# Headline variants substituted for {header} in single-tip templates.
HEADERS = ["Prediction of the Day", "Football Tip", "Best Bet Today", "Daily Pick", "Top Prediction", "Sure Tip", "VIP Prediction"]
# Strings placed between the two team names ({sep}).
SEPARATORS = [" - ", " vs ", " v ", " – ", " VS ", " x "]
# Pool that inject_emojis() samples from when decorating lines.
EXTRA_EMOJIS = ["🔥","💥","🎯","👀","💰","🏅","⚡️","🙌","👇","✨","📊","💎","🤑","🚨","✅","❇️","🆕","📌","👑","🃏"]
# First line of a multi-tip message; make_multi_example() picks one at random.
MULTI_HEADERS = [
"⚽️ 𝐏𝐫𝐞𝐝𝐢𝐜𝐭𝐢𝐨𝐧𝐬 𝐨𝐟 𝐭𝐡𝐞 𝐃𝐚𝐲 ⚽️",
"🔥 TODAY'S FOOTBALL TIPS 🔥",
"💰 Daily Predictions 💰",
"⚡️ Best Bets Today ⚡️",
"📊 Football Tips",
"🎯 Today's Picks",
]
# Closing line of a multi-tip message; make_multi_example() picks one at
# random and leaves the footer out entirely when the empty string is drawn.
MULTI_FOOTERS = [
"For more predictions visit www.eaglepredict.com",
"Follow us for daily tips! 🙌",
"Good luck everyone! 🍀",
"Join our VIP channel for more! 💎",
"Win big today! 🤑",
"", # no footer sometimes
]
# ─────────────────────────────────────────────
# SINGLE TIP TEMPLATES
# placeholders: {league} {team_1} {team_2} {prediction}
# {date} {odds} {time} {header} {book} {sep}
# templates 10, 11 and 12 intentionally omit odds, date and league respectively
# ─────────────────────────────────────────────
# str.format templates for single-tip messages. make_single_example() picks
# one at random and checks whether "{odds}", "{date}" and "{league}" occur in
# it; a placeholder that is absent makes the matching output field null.
SINGLE_TEMPLATES = [
# 1 structured Telegram bold style
"⚽️ {header} ⚽️\n𝐃𝐚𝐭𝐞: {date}\n𝐋𝐞𝐚𝐠𝐮𝐞: {league}\n𝐌𝐚𝐭𝐜𝐡: {team_1}{sep}{team_2}\n𝐊𝐢𝐜𝐤 𝐨𝐟𝐟: {time} WAT\n✅{prediction}\n✅Odds @{odds} on {book}",
# 2 plain structured
"⚽️ {header} ⚽️\nDate: {date}\nLeague: {league}\nMatch: {team_1}{sep}{team_2}\nKick off: {time} WAT\n✅{prediction}\n✅Odds @{odds} on {book}",
# 3 emoji compact
"🏆 {league}\n{team_1}{sep}{team_2}\n📅 {date} | ⏰ {time}\n🎯 Tip: {prediction}\n💰 Odds: {odds}",
# 4 casual noisy
"wow predictions present\nINCREDIBLE MATCH BETWEEN {team_1}{sep}{team_2}\nTime: {time}\nwe forecast {prediction}\nOdds {odds}",
# 5 one-liner
"{team_1}{sep}{team_2} | {league} | {date} | {prediction} @ {odds}",
# 6 verbose channel
"🔥 Today's football tip 🔥\nCompetition: {league}\nGame: {team_1}{sep}{team_2}\nDate: {date}, KO {time}\nOur pick: {prediction}\nBest odds: {odds} ({book})\nGood luck! ⚽",
# 7 minimal no emojis
"Match: {team_1}{sep}{team_2}\nLeague: {league}\nDate: {date}\nPrediction: {prediction}\nOdds: {odds}",
# 8 different field order
"📆 {date} | {time}\n⚽ {league}: {team_1}{sep}{team_2}\n✔️ {prediction} | @{odds}",
# 9 ALL CAPS noisy
"MATCH: {team_1}{sep}{team_2}\nLEAGUE: {league}\nDATE: {date}\nPICK: {prediction}\nODDS: {odds}",
# 10 missing odds intentionally
"⚽️ {header}\n{league}\n{team_1}{sep}{team_2}\n{date}\nPrediction: {prediction}",
# 11 missing date intentionally
"🏟️ {league}\n{team_1}{sep}{team_2}\nTip: {prediction}\nOdds: {odds} on {book}",
# 12 missing league intentionally
"{team_1}{sep}{team_2}\n📅 {date}\n✅ {prediction} @ {odds}",
# 13 telegram minimal
"📌 {league}\n{team_1}{sep}{team_2} — {date}\n{prediction} | {odds}",
# 14 with extra commentary noise
"Today I really like this match 👇\n{team_1}{sep}{team_2} ({league})\nDate: {date}\nMy pick: {prediction}\nOdds: {odds} on {book}",
]
# ─────────────────────────────────────────────
# MULTI-TIP BLOCK TEMPLATES (per tip)
# extra placeholder: {n} = tip number
# ─────────────────────────────────────────────
# str.format templates for one tip inside a multi-tip message.
# make_multi_example() picks a single template per message and reuses it for
# every tip, numbering the tips from 1 via {n}. Every template listed here
# contains {league}, {date} and {odds}.
MULTI_BLOCK_TEMPLATES = [
# Telegram numbered bold
"⚽️ 𝗙𝗼𝗼𝘁𝗯𝗮𝗹𝗹 𝗧𝗶𝗽 {n} ⚽️\n𝐃𝐚𝐭𝐞: {date}\n𝐋𝐞𝐚𝐠𝐮𝐞: {league}\n𝐌𝐚𝐭𝐜𝐡: {team_1}{sep}{team_2}\n𝐊𝐢𝐜𝐤 𝐨𝐟𝐟: {time} WAT\n✅{prediction}\n✅Odds @{odds} on {book}",
# plain numbered
"Tip {n}:\nLeague: {league}\nMatch: {team_1}{sep}{team_2}\nDate: {date} | KO: {time}\nPrediction: {prediction} @ {odds}",
# compact numbered
"#{n} {league} | {team_1}{sep}{team_2} | {date}\n→ {prediction} @ {odds}",
# emoji numbered
"🎯 Pick #{n}\n{team_1}{sep}{team_2} ({league})\n📅 {date} ⏰ {time}\n✅ {prediction} | odds: {odds}",
# minimal numbered
"{n}. {team_1}{sep}{team_2} — {league} — {prediction} @ {odds} ({date})",
]
# ─────────────────────────────────────────────
# LOAD TEAMS FROM CSV
# ─────────────────────────────────────────────
def load_teams(csv_path: str) -> dict:
    """Load teams from a delimited file and group them by (country, league).

    The file needs a header row with ``Country``, ``League`` and ``Team``
    columns. It is parsed as tab-separated when a tab occurs in the first
    2048 characters, otherwise as comma-separated. Rows where any of the
    three columns is missing or blank are skipped.

    Args:
        csv_path: Path of the CSV/TSV file to read.

    Returns:
        A ``defaultdict(list)`` mapping ``(country, league)`` tuples to the
        team names listed for that league, in file order.

    Raises:
        FileNotFoundError: If ``csv_path`` does not exist.
    """
    path = Path(csv_path)
    if not path.exists():
        raise FileNotFoundError(f"CSV not found: {csv_path}")

    leagues = defaultdict(list)
    # "utf-8-sig" drops a leading BOM (common in spreadsheet exports) that
    # would otherwise become part of the first header name; newline="" is
    # what the csv module expects so quoted newlines are parsed correctly.
    with open(path, encoding="utf-8-sig", newline="") as f:
        sample = f.read(2048)
        f.seek(0)
        delimiter = "\t" if "\t" in sample else ","
        reader = csv.DictReader(f, delimiter=delimiter)
        for row in reader:
            # DictReader fills short rows with None values and collects
            # surplus cells under a None key; keep only real text cells.
            cleaned = {
                key.strip(): value.strip()
                for key, value in row.items()
                if isinstance(key, str) and isinstance(value, str)
            }
            country = cleaned.get("Country", "")
            league = cleaned.get("League", "")
            team = cleaned.get("Team", "")
            if country and league and team:
                leagues[(country, league)].append(team)

    total = sum(len(teams) for teams in leagues.values())
    print(f"[✓] Loaded {total} teams across {len(leagues)} leagues")
    return leagues
# ─────────────────────────────────────────────
# RANDOM HELPERS
# ─────────────────────────────────────────────
def random_date() -> str:
    """Return a random date string in one of the DATE_FORMATS styles.

    The month is drawn from August-December (year 2025) or January-May
    (year 2026) with equal probability; the day is always between 1 and 28.
    """
    first_half_of_season = random.random() < 0.5
    if first_half_of_season:
        month = random.randint(8, 12)
    else:
        month = random.randint(1, 5)
    # Months from August onward are stamped 2025, earlier ones 2026.
    year = 2026 if month < 8 else 2025
    day = random.randint(1, 28)
    formatter = random.choice(DATE_FORMATS)
    return formatter(day, month, year)
def random_odds() -> float:
    """Return random odds drawn uniformly from 1.05-3.50, rounded to two decimals."""
    lowest, highest = 1.05, 3.50
    drawn = random.uniform(lowest, highest)
    return round(drawn, 2)
def random_fixture(leagues: dict) -> dict | None:
key = random.choice(list(leagues.keys()))
teams = leagues[key]
if len(teams) < 2:
return None
_, league = key
team_1, team_2 = random.sample(teams, 2)
return {
"league": league,
"team_1": team_1,
"team_2": team_2,
"prediction": random.choice(PREDICTIONS),
"date": random_date(),
"odds": random_odds(),
}
# ─────────────────────────────────────────────
# NOISE FUNCTIONS
# ─────────────────────────────────────────────
def inject_emojis(text: str) -> str:
    """With 40% probability, attach 1-3 emojis from EXTRA_EMOJIS to random lines.

    Each emoji goes on a randomly chosen line, either before or after the
    line's content (equal chance), separated by a single space.
    """
    if random.random() >= 0.40:
        return text

    picked = random.sample(EXTRA_EMOJIS, k=random.randint(1, 3))
    rows = text.split("\n")
    for emoji in picked:
        target = random.randint(0, len(rows) - 1)
        if random.random() < 0.5:
            rows[target] = f"{emoji} {rows[target]}"
        else:
            rows[target] = f"{rows[target]} {emoji}"
    return "\n".join(rows)
def inject_typos(text: str) -> str:
    """With 15% probability, pick one space-separated token and swap two adjacent letters.

    The token is only altered when it is longer than three characters and
    purely alphabetic; otherwise the text comes back unchanged.
    """
    if random.random() >= 0.15:
        return text

    tokens = text.split(" ")
    position = random.randint(0, len(tokens) - 1)
    candidate = tokens[position]
    if candidate.isalpha() and len(candidate) > 3:
        left = random.randint(0, len(candidate) - 2)
        letters = list(candidate)
        # Transpose the letter at `left` with its right-hand neighbour.
        letters[left], letters[left + 1] = letters[left + 1], letters[left]
        tokens[position] = "".join(letters)
    return " ".join(tokens)
def inject_extra_lines(text: str) -> str:
    """With 20% probability, add one irrelevant line before or after the text.

    The line is picked from a fixed list of channel chatter and is placed
    at the top or the bottom with equal chance.
    """
    filler = [
        "For more predictions visit www.eaglepredict.com",
        "Join our VIP channel 💎",
        "Yesterday result: WIN ✅",
        "Record this week: 8W 2L",
        "All tips are for 18+ only",
        "Use responsible gambling 🙏",
    ]
    if random.random() >= 0.20:
        return text

    extra = random.choice(filler)
    goes_on_top = random.random() < 0.5
    return f"{extra}\n{text}" if goes_on_top else f"{text}\n{extra}"
def maybe_null_field(fixture: dict, has_odds: bool, has_date: bool, has_league: bool) -> dict:
    """Return the expected-output copy of *fixture* for a rendered message.

    A field is set to ``None`` only when the template that produced the
    message did not contain its placeholder, so the label always agrees
    with the text the model will see.

    The earlier version also nulled one extra field at random (20% chance).
    Callers render the message before calling this function, so that
    produced labels claiming ``null`` for values present in the input,
    which contradicts the system prompt. It has been removed.

    Args:
        fixture: Fixture dict with at least ``odds``, ``date`` and
            ``league`` keys. It is not modified.
        has_odds: Whether the template contained ``{odds}``.
        has_date: Whether the template contained ``{date}``.
        has_league: Whether the template contained ``{league}``.

    Returns:
        A shallow copy of *fixture* with the omitted fields set to ``None``.
    """
    labelled = dict(fixture)
    shown_in_text = {"odds": has_odds, "date": has_date, "league": has_league}
    for field, shown in shown_in_text.items():
        if not shown:
            labelled[field] = None
    return labelled
def apply_noise(text: str) -> str:
    """Run the noise passes over *text* in order: emojis, typos, extra lines."""
    for noise_pass in (inject_emojis, inject_typos, inject_extra_lines):
        text = noise_pass(text)
    return text
# ─────────────────────────────────────────────
# EXAMPLE GENERATORS
# ─────────────────────────────────────────────
def make_single_example(leagues: dict) -> dict | None:
    """Generate one single-tip example.

    Returns a dict with the noisy message under ``"input"`` and a
    one-element list under ``"output"``, or ``None`` when no fixture
    could be drawn from *leagues*.
    """
    fixture = random_fixture(leagues)
    if not fixture:
        return None

    template = random.choice(SINGLE_TEMPLATES)
    # Note which optional fields this template actually shows.
    shown = {name: "{" + name + "}" in template for name in ("odds", "date", "league")}

    separator = random.choice(SEPARATORS)
    # Fixture keys double as placeholder names; the rest is text-only filler.
    values = dict(
        fixture,
        sep=separator,
        time=random.choice(TIMES),
        header=random.choice(HEADERS),
        book=random.choice(BOOKS),
    )
    message = apply_noise(template.format(**values))
    label = maybe_null_field(fixture, shown["odds"], shown["date"], shown["league"])

    # The output is a list even for a single tip.
    return {"input": message, "output": [label]}
def make_multi_example(leagues: dict) -> dict | None:
    """Generate one multi-tip example containing 2-4 tips.

    Returns a dict with the noisy message under ``"input"`` and one label
    dict per tip under ``"output"``, or ``None`` when fewer than two
    fixtures could be drawn from *leagues*.
    """
    n_tips = random.randint(2, 4)
    # Draw twice as many candidates as needed because some draws may be None.
    candidates = [random_fixture(leagues) for _ in range(n_tips * 2)]
    chosen = [candidate for candidate in candidates if candidate][:n_tips]
    if len(chosen) < 2:
        return None

    block_template = random.choice(MULTI_BLOCK_TEMPLATES)
    separator = random.choice(SEPARATORS)
    # One template serves every tip, so these flags are the same for all of them.
    has_odds = "{odds}" in block_template
    has_date = "{date}" in block_template
    has_league = "{league}" in block_template

    rendered = []
    for number, fixture in enumerate(chosen, start=1):
        kickoff = random.choice(TIMES)
        bookmaker = random.choice(BOOKS)
        rendered.append(
            block_template.format(
                n=number,
                sep=separator,
                time=kickoff,
                book=bookmaker,
                **fixture,
            )
        )

    parts = [random.choice(MULTI_HEADERS), *rendered]
    footer = random.choice(MULTI_FOOTERS)
    if footer:
        parts.append(footer)

    message = apply_noise("\n".join(parts))
    labels = [
        maybe_null_field(fixture, has_odds, has_date, has_league)
        for fixture in chosen
    ]
    return {"input": message, "output": labels}
# ─────────────────────────────────────────────
# FORMAT AS TRAINING EXAMPLE
# ─────────────────────────────────────────────
def make_training_example(ex: dict) -> dict:
    """Wrap a raw example as a three-message chat record.

    The system message carries SYSTEM_PROMPT, the user message is the
    stripped ``ex["input"]`` and the assistant message is ``ex["output"]``
    serialized as JSON with non-ASCII characters kept as-is.
    """
    system_msg = {"role": "system", "content": SYSTEM_PROMPT}
    user_msg = {"role": "user", "content": ex["input"].strip()}
    assistant_msg = {
        "role": "assistant",
        "content": json.dumps(ex["output"], ensure_ascii=False),
    }
    return {"messages": [system_msg, user_msg, assistant_msg]}
# ─────────────────────────────────────────────
# MAIN
# ─────────────────────────────────────────────
def _tip_count(example: dict) -> int:
    """Return how many tips the assistant message of *example* contains."""
    return len(json.loads(example["messages"][2]["content"]))


def _generate_examples(maker, leagues: dict, target: int) -> list:
    """Call *maker* until *target* examples exist or ``target * 5`` attempts are used."""
    collected = []
    for _ in range(target * 5):
        if len(collected) >= target:
            break
        raw = maker(leagues)
        if raw:
            collected.append(make_training_example(raw))
    return collected


def _print_preview(example: dict | None, limit: int) -> None:
    """Print every message of *example*, truncated to *limit* characters."""
    if example is None:
        print("(no example of this kind was generated)\n")
        return
    for msg in example["messages"]:
        print(f"[{msg['role']}]\n{msg['content'][:limit]}\n")


def build_dataset():
    """Generate the dataset, write the train/validation JSONL files and print a summary.

    Loads teams from TEAMS_CSV, generates EXAMPLES_COUNT examples (70%
    single-tip, the rest multi-tip), shuffles them, writes the first
    ``1 - VAL_SPLIT`` share to OUTPUT_TRAIN and the remainder to
    OUTPUT_VAL, then prints counts and one preview of each kind.

    Raises:
        FileNotFoundError: Propagated from ``load_teams`` when TEAMS_CSV
            does not exist.
    """
    leagues = load_teams(TEAMS_CSV)
    n_single = int(EXAMPLES_COUNT * 0.70)
    n_multi = EXAMPLES_COUNT - n_single
    print(f"[1/2] Generating {n_single} single-tip + {n_multi} multi-tip examples...")

    # Each kind is collected in its own list, so progress is a plain len()
    # instead of re-parsing every stored example on every loop iteration.
    singles = _generate_examples(make_single_example, leagues, n_single)
    multis = _generate_examples(make_multi_example, leagues, n_multi)
    examples = singles + multis
    print(f" → {len(examples)} total examples generated")
    if len(singles) < n_single or len(multis) < n_multi:
        print(
            f" [!] Requested {n_single} single / {n_multi} multi but got "
            f"{len(singles)} / {len(multis)}; check the team data"
        )

    # ── Write files ────────────────────────────
    print("[2/2] Writing dataset files...")
    random.shuffle(examples)
    split = int(len(examples) * (1 - VAL_SPLIT))
    train, val = examples[:split], examples[split:]
    for path, data in [(OUTPUT_TRAIN, train), (OUTPUT_VAL, val)]:
        with open(path, "w", encoding="utf-8") as f:
            f.writelines(json.dumps(ex, ensure_ascii=False) + "\n" for ex in data)

    # ── Stats ──────────────────────────────────
    # Counts output objects (not examples) that have at least one null field.
    nulls = sum(
        1
        for e in examples
        for obj in json.loads(e["messages"][2]["content"])
        if any(v is None for v in obj.values())
    )
    print("\n✅ Done!")
    print(f" {OUTPUT_TRAIN} → {len(train)} examples")
    print(f" {OUTPUT_VAL} → {len(val)} examples")
    print(f" Single-tip → {len(singles)}")
    print(f" Multi-tip → {len(multis)}")
    print(f" With null fields→ {nulls}")

    # ── Previews ───────────────────────────────
    # A default of None keeps a missing kind from raising StopIteration.
    print("\n── Single-tip sample ───────────────────────")
    _print_preview(next((e for e in examples if _tip_count(e) == 1), None), 200)
    print("── Multi-tip sample ────────────────────────")
    _print_preview(next((e for e in examples if _tip_count(e) > 1), None), 300)
# Script entry point: build the dataset only when this file is run directly.
if __name__ == "__main__":
    build_dataset()