Upload dataset_builder.py with huggingface_hub

e69482a verified 12 days ago

18.6 kB

	"""
	Dataset Builder v3 — Football Prediction Extractor
	- Always outputs JSON array (even single tip)
	- 70% single-tip / 30% multi-tip (2-4 events)
	- Noise: random emojis, typos, missing fields, varied separators
	- Varied date formats, bookmakers, times, headers
	- Pure stdlib — no pip installs needed
	"""

	import csv
	import json
	import random
	from pathlib import Path
	from collections import defaultdict

	# ─────────────────────────────────────────────
	# CONFIG
	# ─────────────────────────────────────────────
	# Input file read by load_teams(); it looks up "Country", "League" and "Team" columns.
	TEAMS_CSV = "teams_tier1_tier2.csv"
	# JSONL files written by build_dataset() (one training example per line).
	OUTPUT_TRAIN = "train_dataset.jsonl"
	OUTPUT_VAL = "val_dataset.jsonl"
	# Target number of examples generated before the train/validation split.
	EXAMPLES_COUNT = 300
	# Fraction of the shuffled examples that goes to the validation file.
	VAL_SPLIT = 0.1

	# ─────────────────────────────────────────────
	# SYSTEM PROMPT — always array
	# ─────────────────────────────────────────────
	# System message placed at the start of every training example
	# (see make_training_example). The six keys listed here are the same keys
	# produced by random_fixture().
	SYSTEM_PROMPT = (
	    "You are a football data extraction assistant. "
	    "Extract structured data from the message and return ONLY a valid JSON array. "
	    "Each object in the array must have exactly these keys: "
	    "league, team_1, team_2, prediction, date, odds. "
	    "If a field is missing, use null. No extra text, no markdown."
	)

	# ─────────────────────────────────────────────
	# VOCABULARY
	# ─────────────────────────────────────────────
	# Pool of prediction labels; random_fixture() picks one uniformly at random.
	PREDICTIONS = [
	    "Over 1.5", "Over 2.5", "Over 3.5",
	    "Under 2.5", "Under 3.5",
	    "1X", "X2", "12",
	    "Home Win", "Away Win", "Draw",
	    "Both Teams to Score",
	    "Home Win or Draw",
	    "Away Win or Draw",
	    "GG", "NG",
	]

	# Month names shared by the two textual date styles (index 0 = January).
	_MONTH_ABBREVIATIONS = (
	    "Jan", "Feb", "Mar", "Apr", "May", "Jun",
	    "Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
	)
	_MONTH_NAMES = (
	    "January", "February", "March", "April", "May", "June",
	    "July", "August", "September", "October", "November", "December",
	)


	def _date_slashes(d, m, y):
	    """Render as zero-padded ``DD/MM/YYYY``."""
	    return f"{d:02d}/{m:02d}/{y}"


	def _date_dashes(d, m, y):
	    """Render as zero-padded ``DD-MM-YYYY``."""
	    return f"{d:02d}-{m:02d}-{y}"


	def _date_dots(d, m, y):
	    """Render as zero-padded ``DD.MM.YYYY``."""
	    return f"{d:02d}.{m:02d}.{y}"


	def _date_short_month(d, m, y):
	    """Render as ``Mon D, YYYY`` with a three-letter month."""
	    return f"{_MONTH_ABBREVIATIONS[m - 1]} {d}, {y}"


	def _date_long_month(d, m, y):
	    """Render as ``D Month YYYY`` with the full month name."""
	    return f"{d} {_MONTH_NAMES[m - 1]} {y}"


	# Date renderers: each is called as fmt(day, month, year) with ints and
	# returns the formatted date string. The order of the list is preserved.
	DATE_FORMATS = [
	    _date_slashes,
	    _date_dashes,
	    _date_dots,
	    _date_short_month,
	    _date_long_month,
	]

	# Kick-off times inserted into templates that have a {time} placeholder.
	TIMES = ["13:00", "15:00", "16:00", "17:00", "18:00", "19:00", "19:45", "20:00", "20:45", "21:00", "21:45"]
	# Bookmaker names for the {book} placeholder.
	BOOKS = ["BETANO", "Bet365", "William Hill", "Unibet", "1xBet", "Betway", "Bwin", "Pinnacle"]
	# Header phrases for the {header} placeholder of single-tip templates.
	HEADERS = ["Prediction of the Day", "Football Tip", "Best Bet Today", "Daily Pick", "Top Prediction", "Sure Tip", "VIP Prediction"]
	# Strings placed between the two team names ({sep} placeholder).
	SEPARATORS = [" - ", " vs ", " v ", " – ", " VS ", " x "]
	# Emojis that inject_emojis() may prepend or append to random lines.
	EXTRA_EMOJIS = ["🔥","💥","🎯","👀","💰","🏅","⚡️","🙌","👇","✨","📊","💎","🤑","🚨","✅","❇️","🆕","📌","👑","🃏"]

	# First line of a multi-tip message; make_multi_example() picks one at random.
	MULTI_HEADERS = [
	    "⚽️ 𝐏𝐫𝐞𝐝𝐢𝐜𝐭𝐢𝐨𝐧𝐬 𝐨𝐟 𝐭𝐡𝐞 𝐃𝐚𝐲 ⚽️",
	    "🔥 TODAY'S FOOTBALL TIPS 🔥",
	    "💰 Daily Predictions 💰",
	    "⚡️ Best Bets Today ⚡️",
	    "📊 Football Tips",
	    "🎯 Today's Picks",
	]

	# Last line of a multi-tip message; make_multi_example() picks one at random
	# and omits the footer entirely when the empty string is drawn.
	MULTI_FOOTERS = [
	    "For more predictions visit www.eaglepredict.com",
	    "Follow us for daily tips! 🙌",
	    "Good luck everyone! 🍀",
	    "Join our VIP channel for more! 💎",
	    "Win big today! 🤑",
	    "", # no footer sometimes
	]

	# ─────────────────────────────────────────────
	# SINGLE TIP TEMPLATES
	# placeholders: {league} {team_1} {team_2} {prediction}
	# {date} {odds} {time} {header} {book} {sep}
	# templates 10, 11 and 12 intentionally omit odds, date and league respectively
	# ─────────────────────────────────────────────
	# Message templates for single-tip examples, filled with str.format().
	# Templates 10, 11 and 12 deliberately leave out {odds}, {date} and {league};
	# make_single_example() detects the missing placeholder and sets the
	# corresponding label field to null.
	# The "|" separators are plain pipes: the backslash-escaped form ("\|") was
	# a markup artifact that would have put a literal backslash in the messages.
	SINGLE_TEMPLATES = [
	    # 1 structured Telegram bold style
	    "⚽️ {header} ⚽️\n𝐃𝐚𝐭𝐞: {date}\n𝐋𝐞𝐚𝐠𝐮𝐞: {league}\n𝐌𝐚𝐭𝐜𝐡: {team_1}{sep}{team_2}\n𝐊𝐢𝐜𝐤 𝐨𝐟𝐟: {time} WAT\n✅{prediction}\n✅Odds @{odds} on {book}",

	    # 2 plain structured
	    "⚽️ {header} ⚽️\nDate: {date}\nLeague: {league}\nMatch: {team_1}{sep}{team_2}\nKick off: {time} WAT\n✅{prediction}\n✅Odds @{odds} on {book}",

	    # 3 emoji compact
	    "🏆 {league}\n{team_1}{sep}{team_2}\n📅 {date} | ⏰ {time}\n🎯 Tip: {prediction}\n💰 Odds: {odds}",

	    # 4 casual noisy
	    "wow predictions present\nINCREDIBLE MATCH BETWEEN {team_1}{sep}{team_2}\nTime: {time}\nwe forecast {prediction}\nOdds {odds}",

	    # 5 one-liner
	    "{team_1}{sep}{team_2} | {league} | {date} | {prediction} @ {odds}",

	    # 6 verbose channel
	    "🔥 Today's football tip 🔥\nCompetition: {league}\nGame: {team_1}{sep}{team_2}\nDate: {date}, KO {time}\nOur pick: {prediction}\nBest odds: {odds} ({book})\nGood luck! ⚽",

	    # 7 minimal no emojis
	    "Match: {team_1}{sep}{team_2}\nLeague: {league}\nDate: {date}\nPrediction: {prediction}\nOdds: {odds}",

	    # 8 different field order
	    "📆 {date} | {time}\n⚽ {league}: {team_1}{sep}{team_2}\n✔️ {prediction} | @{odds}",

	    # 9 ALL CAPS noisy
	    "MATCH: {team_1}{sep}{team_2}\nLEAGUE: {league}\nDATE: {date}\nPICK: {prediction}\nODDS: {odds}",

	    # 10 missing odds intentionally
	    "⚽️ {header}\n{league}\n{team_1}{sep}{team_2}\n{date}\nPrediction: {prediction}",

	    # 11 missing date intentionally
	    "🏟️ {league}\n{team_1}{sep}{team_2}\nTip: {prediction}\nOdds: {odds} on {book}",

	    # 12 missing league intentionally
	    "{team_1}{sep}{team_2}\n📅 {date}\n✅ {prediction} @ {odds}",

	    # 13 telegram minimal
	    "📌 {league}\n{team_1}{sep}{team_2} — {date}\n{prediction} | {odds}",

	    # 14 with extra commentary noise
	    "Today I really like this match 👇\n{team_1}{sep}{team_2} ({league})\nDate: {date}\nMy pick: {prediction}\nOdds: {odds} on {book}",
	]

	# ─────────────────────────────────────────────
	# MULTI-TIP BLOCK TEMPLATES (per tip)
	# extra placeholder: {n} = tip number
	# ─────────────────────────────────────────────
	# Per-tip templates for multi-tip messages, filled with str.format();
	# {n} is the 1-based tip number. One template is used for every tip of a message.
	# The "|" separators are plain pipes: the backslash-escaped form ("\|") was
	# a markup artifact that would have put a literal backslash in the messages.
	MULTI_BLOCK_TEMPLATES = [
	    # Telegram numbered bold
	    "⚽️ 𝗙𝗼𝗼𝘁𝗯𝗮𝗹𝗹 𝗧𝗶𝗽 {n} ⚽️\n𝐃𝐚𝐭𝐞: {date}\n𝐋𝐞𝐚𝐠𝐮𝐞: {league}\n𝐌𝐚𝐭𝐜𝐡: {team_1}{sep}{team_2}\n𝐊𝐢𝐜𝐤 𝐨𝐟𝐟: {time} WAT\n✅{prediction}\n✅Odds @{odds} on {book}",

	    # plain numbered
	    "Tip {n}:\nLeague: {league}\nMatch: {team_1}{sep}{team_2}\nDate: {date} | KO: {time}\nPrediction: {prediction} @ {odds}",

	    # compact numbered
	    "#{n} {league} | {team_1}{sep}{team_2} | {date}\n→ {prediction} @ {odds}",

	    # emoji numbered
	    "🎯 Pick #{n}\n{team_1}{sep}{team_2} ({league})\n📅 {date} ⏰ {time}\n✅ {prediction} | odds: {odds}",

	    # minimal numbered
	    "{n}. {team_1}{sep}{team_2} — {league} — {prediction} @ {odds} ({date})",
	]

	# ─────────────────────────────────────────────
	# LOAD TEAMS FROM CSV
	# ─────────────────────────────────────────────
	def load_teams(csv_path: str) -> dict:
	    """Load team names grouped by league from a CSV or TSV file.

	    The file needs a header row with ``Country``, ``League`` and ``Team``
	    columns. The delimiter is a tab if a tab appears in the first 2048
	    characters, otherwise a comma. Rows lacking any of the three values are
	    skipped. A one-line summary is printed after loading.

	    Args:
	        csv_path: Path to the teams file.

	    Returns:
	        A ``defaultdict(list)`` mapping ``(country, league)`` tuples to the
	        list of team names, in file order.

	    Raises:
	        FileNotFoundError: If ``csv_path`` does not exist.
	    """
	    path = Path(csv_path)
	    if not path.exists():
	        raise FileNotFoundError(f"CSV not found: {csv_path}")

	    leagues = defaultdict(list)
	    # "utf-8-sig" drops a leading BOM (common in spreadsheet exports) that
	    # would otherwise become part of the first header name and make every
	    # row look incomplete; newline="" is what the csv module asks for.
	    with open(path, encoding="utf-8-sig", newline="") as f:
	        sample = f.read(2048)
	        f.seek(0)
	        delimiter = "\t" if "\t" in sample else ","
	        for raw in csv.DictReader(f, delimiter=delimiter):
	            # DictReader fills missing cells with None and collects surplus
	            # cells in a list under the key None; ignore both instead of
	            # crashing on .strip().
	            row = {
	                k.strip(): v.strip()
	                for k, v in raw.items()
	                if isinstance(k, str) and isinstance(v, str)
	            }
	            country = row.get("Country", "")
	            league = row.get("League", "")
	            team = row.get("Team", "")
	            if country and league and team:
	                leagues[(country, league)].append(team)

	    total = sum(len(v) for v in leagues.values())
	    print(f"[✓] Loaded {total} teams across {len(leagues)} leagues")
	    return leagues

	# ─────────────────────────────────────────────
	# RANDOM HELPERS
	# ─────────────────────────────────────────────
	def random_date() -> str:
	    """Return a random date string in one of the DATE_FORMATS styles.

	    The month is drawn from August-December (paired with year 2025) or
	    January-May (paired with year 2026), each range with equal probability.
	    The day is limited to 1-28 so it is valid in every month.
	    """
	    if random.random() < 0.5:
	        month = random.randint(8, 12)
	    else:
	        month = random.randint(1, 5)
	    year = 2026 if month < 8 else 2025
	    day = random.randint(1, 28)
	    formatter = random.choice(DATE_FORMATS)
	    return formatter(day, month, year)

	def random_odds() -> float:
	    """Return random odds drawn uniformly from 1.05-3.50, rounded to 2 decimals."""
	    lowest, highest = 1.05, 3.50
	    drawn = random.uniform(lowest, highest)
	    return round(drawn, 2)

	def random_fixture(leagues: dict) -> dict \| None:
	key = random.choice(list(leagues.keys()))
	teams = leagues[key]
	if len(teams) < 2:
	return None
	_, league = key
	team_1, team_2 = random.sample(teams, 2)
	return {
	"league": league,
	"team_1": team_1,
	"team_2": team_2,
	"prediction": random.choice(PREDICTIONS),
	"date": random_date(),
	"odds": random_odds(),
	}

	# ─────────────────────────────────────────────
	# NOISE FUNCTIONS
	# ─────────────────────────────────────────────
	def inject_emojis(text: str) -> str:
	    """With 40% probability, attach 1-3 distinct random emojis to random lines.

	    Each chosen emoji is placed either in front of or behind a randomly
	    selected line (50/50), separated from it by a single space. Otherwise
	    the text is returned unchanged.
	    """
	    if random.random() >= 0.40:
	        return text

	    how_many = random.randint(1, 3)
	    chosen = random.sample(EXTRA_EMOJIS, k=how_many)
	    rows = text.split("\n")
	    for emoji in chosen:
	        pos = random.randint(0, len(rows) - 1)
	        if random.random() < 0.5:
	            rows[pos] = emoji + " " + rows[pos]
	        else:
	            rows[pos] = rows[pos] + " " + emoji
	    return "\n".join(rows)

	def inject_typos(text: str) -> str:
	    """With 15% probability, swap two adjacent characters in one random word.

	    Words are the pieces obtained by splitting on single spaces. The chosen
	    word is only altered when it is purely alphabetic and longer than three
	    characters; otherwise the text comes back unchanged.
	    """
	    if random.random() >= 0.15:
	        return text

	    tokens = text.split(" ")
	    pick = random.randint(0, len(tokens) - 1)
	    word = tokens[pick]
	    if len(word) > 3 and word.isalpha():
	        at = random.randint(0, len(word) - 2)
	        chars = list(word)
	        chars[at], chars[at + 1] = chars[at + 1], chars[at]
	        tokens[pick] = "".join(chars)
	    return " ".join(tokens)

	def inject_extra_lines(text: str) -> str:
	    """With 20% probability, add one irrelevant line before or after the text.

	    The extra line is picked at random from a fixed list of promotional or
	    disclaimer phrases and is put first or last with equal probability.
	    """
	    noise_lines = (
	        "For more predictions visit www.eaglepredict.com",
	        "Join our VIP channel 💎",
	        "Yesterday result: WIN ✅",
	        "Record this week: 8W 2L",
	        "All tips are for 18+ only",
	        "Use responsible gambling 🙏",
	    )
	    if random.random() >= 0.20:
	        return text

	    extra = random.choice(noise_lines)
	    ordered = (extra, text) if random.random() < 0.5 else (text, extra)
	    return "\n".join(ordered)

	def maybe_null_field(fixture: dict, has_odds: bool, has_date: bool, has_league: bool) -> dict:
	    """Return a copy of ``fixture`` with absent fields set to ``None``.

	    ``odds``, ``date`` and ``league`` are nulled when the matching ``has_*``
	    flag is false (the template did not contain that placeholder). On top of
	    that, with 20% probability one of those three fields is picked at random
	    and nulled as well. The input dict is not modified.

	    NOTE(review): the extra random null changes only the label. The callers
	    in this file render the message text from the un-nulled fixture before
	    calling this function, so the text can still show a value the label
	    reports as null — confirm this label noise is intended.
	    """
	    result = dict(fixture)
	    shown = {"odds": has_odds, "date": has_date, "league": has_league}
	    for name, present in shown.items():
	        if not present:
	            result[name] = None

	    # extra random null on top
	    if random.random() < 0.20:
	        result[random.choice(["odds", "date", "league"])] = None
	    return result

	def apply_noise(text: str) -> str:
	    """Run the text through the emoji, typo and extra-line noise steps, in that order."""
	    for step in (inject_emojis, inject_typos, inject_extra_lines):
	        text = step(text)
	    return text

	# ─────────────────────────────────────────────
	# EXAMPLE GENERATORS
	# ─────────────────────────────────────────────
	def make_single_example(leagues: dict) -> dict | None:
	    """Generate one single-tip example.

	    Args:
	        leagues: Mapping of ``(country, league)`` to team names.

	    Returns:
	        ``{"input": <noisy message text>, "output": [<label dict>]}``, or
	        ``None`` when ``random_fixture`` could not produce a fixture. The
	        output is always a one-element list.
	    """
	    fixture = random_fixture(leagues)
	    if not fixture:
	        return None

	    template = random.choice(SINGLE_TEMPLATES)
	    # Fields whose placeholder is absent from the template must be null in the label.
	    has_odds = "{odds}" in template
	    has_date = "{date}" in template
	    has_league = "{league}" in template
	    sep = random.choice(SEPARATORS)

	    input_text = template.format(
	        sep=sep,
	        league=fixture["league"],
	        team_1=fixture["team_1"],
	        team_2=fixture["team_2"],
	        prediction=fixture["prediction"],
	        date=fixture["date"],
	        odds=fixture["odds"],
	        time=random.choice(TIMES),
	        header=random.choice(HEADERS),
	        book=random.choice(BOOKS),
	    )
	    input_text = apply_noise(input_text)
	    output_json = maybe_null_field(fixture, has_odds, has_date, has_league)

	    return {
	        "input": input_text,
	        "output": [output_json],  # always array
	    }


	def make_multi_example(leagues: dict) -> dict | None:
	    """Generate one multi-tip example containing 2-4 tips.

	    All tips of a message share one block template and one team separator.
	    The message is a random header, the rendered blocks and an optional
	    footer joined by newlines, then passed through ``apply_noise``.

	    Args:
	        leagues: Mapping of ``(country, league)`` to team names.

	    Returns:
	        ``{"input": <noisy message text>, "output": [<label dict>, ...]}``
	        with one label per tip, or ``None`` when fewer than two fixtures
	        could be generated.
	    """
	    n_tips = random.randint(2, 4)
	    # random_fixture() may return None, so draw twice as many candidates as
	    # needed and keep the first n_tips usable ones.
	    candidates = [random_fixture(leagues) for _ in range(n_tips * 2)]
	    fixtures = [f for f in candidates if f][:n_tips]
	    if len(fixtures) < 2:
	        return None

	    block_template = random.choice(MULTI_BLOCK_TEMPLATES)
	    sep = random.choice(SEPARATORS)
	    # The template is the same for every tip, so these flags are too.
	    has_odds = "{odds}" in block_template
	    has_date = "{date}" in block_template
	    has_league = "{league}" in block_template

	    blocks = []
	    for i, f in enumerate(fixtures, 1):
	        blocks.append(
	            block_template.format(
	                n=i,
	                sep=sep,
	                league=f["league"],
	                team_1=f["team_1"],
	                team_2=f["team_2"],
	                prediction=f["prediction"],
	                date=f["date"],
	                odds=f["odds"],
	                time=random.choice(TIMES),
	                book=random.choice(BOOKS),
	            )
	        )

	    header = random.choice(MULTI_HEADERS)
	    footer = random.choice(MULTI_FOOTERS)
	    parts = [header] + blocks + ([footer] if footer else [])
	    input_text = apply_noise("\n".join(parts))

	    output = [
	        maybe_null_field(f, has_odds, has_date, has_league)
	        for f in fixtures
	    ]

	    return {"input": input_text, "output": output}

	# ─────────────────────────────────────────────
	# FORMAT AS TRAINING EXAMPLE
	# ─────────────────────────────────────────────
	def make_training_example(ex: dict) -> dict:
	    """Wrap a generated example as a three-message chat record.

	    ``ex["input"]`` (stripped of surrounding whitespace) becomes the user
	    message and ``ex["output"]`` is serialized to JSON, keeping non-ASCII
	    characters as-is, for the assistant message. SYSTEM_PROMPT is the
	    system message.
	    """
	    system_msg = {"role": "system", "content": SYSTEM_PROMPT}
	    user_msg = {"role": "user", "content": ex["input"].strip()}
	    assistant_msg = {
	        "role": "assistant",
	        "content": json.dumps(ex["output"], ensure_ascii=False),
	    }
	    return {"messages": [system_msg, user_msg, assistant_msg]}

	# ─────────────────────────────────────────────
	# MAIN
	# ─────────────────────────────────────────────
	def _generate_examples(leagues: dict, target: int, maker) -> list:
	    """Call ``maker(leagues)`` until ``target`` examples exist or ``5 * target`` attempts are used."""
	    produced = []
	    for _ in range(target * 5):
	        if len(produced) >= target:
	            break
	        ex = maker(leagues)
	        if ex:
	            produced.append(make_training_example(ex))
	    return produced


	def _tip_count(example: dict) -> int:
	    """Return the number of tips in a training example's assistant message (a JSON array)."""
	    return len(json.loads(example["messages"][2]["content"]))


	def _print_preview(example, limit: int) -> None:
	    """Print each message of ``example`` truncated to ``limit`` characters, if there is one."""
	    if example is None:
	        print("(no example of this kind was generated)\n")
	        return
	    for msg in example["messages"]:
	        print(f"[{msg['role']}]\n{msg['content'][:limit]}\n")


	def build_dataset():
	    """Generate the dataset, write the train/validation JSONL files and print a summary.

	    Reads teams from TEAMS_CSV, generates about 70% single-tip and 30%
	    multi-tip examples (EXAMPLES_COUNT in total, fewer if generation keeps
	    failing), shuffles them, and writes the last VAL_SPLIT fraction to
	    OUTPUT_VAL and the rest to OUTPUT_TRAIN. Statistics and one preview per
	    kind are printed to stdout.

	    Raises:
	        FileNotFoundError: Propagated from ``load_teams`` when TEAMS_CSV is missing.
	    """
	    leagues = load_teams(TEAMS_CSV)

	    n_single = int(EXAMPLES_COUNT * 0.70)
	    n_multi = EXAMPLES_COUNT - n_single
	    print(f"[1/2] Generating {n_single} single-tip + {n_multi} multi-tip examples...")

	    # Single-tip examples always carry exactly one tip and multi-tip examples
	    # at least two, so counting what each generator produced is enough; there
	    # is no need to re-parse every stored example on each loop iteration.
	    examples = _generate_examples(leagues, n_single, make_single_example)
	    examples += _generate_examples(leagues, n_multi, make_multi_example)

	    print(f" → {len(examples)} total examples generated")

	    # ── Write files ────────────────────────────
	    print("[2/2] Writing dataset files...")
	    random.shuffle(examples)
	    split = int(len(examples) * (1 - VAL_SPLIT))
	    train, val = examples[:split], examples[split:]

	    for path, data in [(OUTPUT_TRAIN, train), (OUTPUT_VAL, val)]:
	        with open(path, "w", encoding="utf-8") as f:
	            f.writelines(json.dumps(ex, ensure_ascii=False) + "\n" for ex in data)

	    # ── Stats ──────────────────────────────────
	    # Parse each assistant message once and reuse it for all statistics.
	    outputs = [json.loads(e["messages"][2]["content"]) for e in examples]
	    single = sum(1 for tips in outputs if len(tips) == 1)
	    multi = len(outputs) - single
	    # Counts tip objects (not examples) that contain at least one null value.
	    nulls = sum(
	        1
	        for tips in outputs
	        for obj in tips
	        if any(v is None for v in obj.values())
	    )

	    print("\n✅ Done!")
	    print(f" {OUTPUT_TRAIN} → {len(train)} examples")
	    print(f" {OUTPUT_VAL} → {len(val)} examples")
	    print(f" Single-tip → {single}")
	    print(f" Multi-tip → {multi}")
	    print(f" With null fields→ {nulls}")

	    # ── Previews ───────────────────────────────
	    # next() gets a default so a run that produced no example of one kind
	    # finishes cleanly instead of raising StopIteration.
	    print("\n── Single-tip sample ───────────────────────")
	    _print_preview(next((e for e in examples if _tip_count(e) == 1), None), 200)

	    print("── Multi-tip sample ────────────────────────")
	    _print_preview(next((e for e in examples if _tip_count(e) > 1), None), 300)


	# Build the dataset only when run as a script, not when imported.
	if __name__ == "__main__":
	    build_dataset()