File size: 3,238 Bytes
d48eef6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import csv
import os
import sys
from datetime import datetime
from ..scrape.config import FIGHTS_CSV_PATH, FIGHTERS_CSV_PATH

def load_fighters_data():
    """Load fighter ELO ratings from the fighters CSV, keyed by full name.

    Returns:
        A dict mapping "<first_name> <last_name>" (whitespace-stripped) to
        ``{'elo': <float>}``, or ``None`` when the file at
        ``FIGHTERS_CSV_PATH`` does not exist (an error is printed).

    Raises:
        ValueError: If a row has a non-blank ``elo`` value that is not a
            valid number.
    """
    if not os.path.exists(FIGHTERS_CSV_PATH):
        print(f"Error: Fighter data not found at '{FIGHTERS_CSV_PATH}'.")
        print("Please run the ELO analysis first ('python -m src.analysis.elo').")
        return None

    default_elo = 1500.0
    fighters = {}
    # newline='' lets the csv module handle line endings itself, so quoted
    # fields containing newlines are parsed correctly.
    with open(FIGHTERS_CSV_PATH, 'r', encoding='utf-8', newline='') as f:
        for row in csv.DictReader(f):
            full_name = f"{row['first_name']} {row['last_name']}".strip()
            raw_elo = row.get('elo')
            # The column may be absent (None), the row may be short
            # (DictReader fills None), or the cell may be blank; all of
            # these mean "no rating yet" and fall back to the default
            # instead of crashing in float().
            if raw_elo is None or not raw_elo.strip():
                elo = default_elo
            else:
                elo = float(raw_elo)
            fighters[full_name] = {'elo': elo}
    return fighters

def load_fights_data():
    """Read all fight rows from the fights CSV and return them oldest-first.

    Returns:
        A list of row dicts ordered by ``event_date`` ascending, or ``None``
        when the file at ``FIGHTS_CSV_PATH`` does not exist (an error is
        printed).
    """
    fights_file_missing = not os.path.exists(FIGHTS_CSV_PATH)
    if fights_file_missing:
        print(f"Error: Fights data not found at '{FIGHTS_CSV_PATH}'.")
        return None

    def event_day(fight_row):
        # Dates are written like "March 04, 2023".
        return datetime.strptime(fight_row['event_date'], '%B %d, %Y')

    with open(FIGHTS_CSV_PATH, 'r', encoding='utf-8') as csv_file:
        reader = csv.DictReader(csv_file)
        rows = list(reader)

    # Chronological order is needed for a proper train/test split later.
    return sorted(rows, key=event_day)

def run_elo_baseline_model(fights, fighters):
    """
    Score a naive baseline that always picks the fighter with the higher ELO.

    Fights whose winner is "Draw", "NC" or empty are ignored, as are fights
    where either fighter has no entry in ``fighters``. When both ratings are
    equal the second fighter is the pick.

    Returns a tuple ``(accuracy_percent, fights_evaluated)``; accuracy is 0
    when no fight could be evaluated.
    """
    unscored_outcomes = ("Draw", "NC", "")
    hits = 0
    evaluated = 0

    for bout in fights:
        name_a = bout['fighter_1']
        name_b = bout['fighter_2']
        winner = bout['winner']

        # Draws and no-contests have no winner to predict.
        if winner in unscored_outcomes:
            continue

        record_a = fighters.get(name_a)
        record_b = fighters.get(name_b)
        # Both fighters need a rating record to make a prediction.
        if not (record_a and record_b):
            continue

        if record_a.get('elo', 1500) > record_b.get('elo', 1500):
            pick = name_a
        else:
            pick = name_b

        evaluated += 1
        if pick == winner:
            hits += 1

    if evaluated > 0:
        accuracy = (hits / evaluated) * 100
    else:
        accuracy = 0
    return accuracy, evaluated

def main():
    """
    Entry point: load the fighter and fight data, evaluate the ELO baseline
    and print its accuracy. Stops early when either data set is unavailable
    or empty.
    """
    print("--- Starting ML Prediction Pipeline ---")

    # Both loaders run before the check so each can report its own error.
    fighters_by_name = load_fighters_data()
    chronological_fights = load_fights_data()

    if not (fighters_by_name and chronological_fights):
        print("Aborting pipeline due to missing data.")
        return

    print("\nRunning Baseline Model (Predicting winner by highest ELO)...")
    baseline_result = run_elo_baseline_model(chronological_fights, fighters_by_name)
    accuracy, total_fights = baseline_result

    print("\n--- Baseline Model Evaluation ---")
    print(f"Total Fights Evaluated: {total_fights}")
    print(f"Model Accuracy: {accuracy:.2f}%")
    print("---------------------------------")

# Run the pipeline only when this module is executed, not when it is imported.
# The module uses a package-relative import, so launch it with `python -m`.
if __name__ == "__main__":
    main()