Create src/main.py
src/main.py (ADDED, +311 -0)
@@ -0,0 +1,311 @@
import logging
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from torch.cuda.amp import autocast, GradScaler
from torch.optim.lr_scheduler import ReduceLROnPlateau
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.manifold import TSNE
from sklearn.cluster import DBSCAN
import optuna

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Constants
RANDOM_SEED = 42
TEST_SIZE = 0.2
VALIDATION_SIZE = 200

def load_data(start_year=2000, end_year=2017):
    """Load the ATP match CSVs for the given year range into one DataFrame."""
    dfs = []
    for year in range(start_year, end_year + 1):
        file_path = f'atp_matches_{year}.csv'
        try:
            df = pd.read_csv(file_path, low_memory=False)
            required_columns = ['tourney_id', 'surface', 'winner_id', 'loser_id', 'winner_name', 'loser_name',
                                'winner_age', 'loser_age', 'winner_rank', 'loser_rank', 'tourney_date']
            if not all(col in df.columns for col in required_columns):
                logging.warning(f"File {file_path} is missing some required columns. Skipping this file.")
                continue
            dfs.append(df)
            logging.info(f"Data loaded successfully from {file_path}")
        except FileNotFoundError:
            logging.warning(f"File not found: {file_path}")
        except pd.errors.EmptyDataError:
            logging.warning(f"Empty file: {file_path}")
        except Exception as e:
            logging.error(f"Error loading data from {file_path}: {str(e)}")

    if not dfs:
        raise ValueError("No data files were successfully loaded.")

    combined_df = pd.concat(dfs, ignore_index=True)
    if combined_df.empty:
        raise ValueError("The combined DataFrame is empty after processing all files.")
    return combined_df

def preprocess_data(df):
    """Label-encode the categorical columns and parse tournament dates."""
    label_encoders = {}
    for col in ['tourney_id', 'surface', 'winner_id', 'loser_id']:
        df[col] = df[col].astype(str)
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])
        label_encoders[col] = le

    df['tourney_date'] = pd.to_datetime(df['tourney_date'], format='%Y%m%d', errors='coerce')
    df = df.dropna(subset=['tourney_date'])

    return df, label_encoders

def engineer_features(df):
    """Coerce the numeric columns and derive age/rank difference features."""
    numeric_cols = ['winner_age', 'loser_age', 'winner_rank', 'loser_rank']
    for col in numeric_cols:
        df[col] = pd.to_numeric(df[col], errors='coerce')

    df['age_difference'] = df['winner_age'] - df['loser_age']
    # Positive rank_difference means the winner was the higher-ranked (lower-numbered) player
    df['rank_difference'] = df['loser_rank'] - df['winner_rank']

    numeric_columns = numeric_cols + ['age_difference', 'rank_difference']
    df = df.dropna(subset=numeric_columns)

    return df, numeric_columns

class JointEmbeddedModel(nn.Module):
    """MLP regressor over learned categorical embeddings concatenated with numeric features."""

    def __init__(self, categorical_dims, numerical_dim, embedding_dim, hidden_dim, dropout_rate=0.3):
        super().__init__()
        self.embeddings = nn.ModuleList([nn.Embedding(dim, embedding_dim) for dim in categorical_dims])
        self.fc1 = nn.Linear(len(categorical_dims) * embedding_dim + numerical_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim // 2)
        self.fc3 = nn.Linear(hidden_dim // 2, 1)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x_cat, x_num):
        embedded = [emb(x_cat[:, i]) for i, emb in enumerate(self.embeddings)]
        x = torch.cat(embedded + [x_num], dim=1)
        x = self.dropout(self.relu(self.fc1(x)))
        x = self.dropout(self.relu(self.fc2(x)))
        # Squeeze only the feature dimension so a batch of size 1 keeps its batch axis
        return self.fc3(x).squeeze(-1)

def create_dataloader(X, y, batch_size=64):
    x_cat, x_num = X
    # Ensure tensors are not empty
    if len(x_cat) == 0 or len(x_num) == 0:
        raise ValueError("Input data for dataloader is empty.")
    dataset = TensorDataset(torch.tensor(x_cat, dtype=torch.long),
                            torch.tensor(x_num, dtype=torch.float32),
                            torch.tensor(y, dtype=torch.float32))
    return DataLoader(dataset, batch_size=batch_size, shuffle=True)

def train_model(model, dataloader, val_data, epochs=20, learning_rate=0.001, weight_decay=0, patience=5):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    # NB: the scheduler and early stopping share the same patience, so training
    # typically stops before the learning rate is ever reduced.
    scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=patience)
    # Mixed precision is only used on CUDA devices
    scaler = GradScaler() if device.type == 'cuda' else None

    best_val_loss = float('inf')
    early_stopping_counter = 0

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for x_cat, x_num, y in dataloader:
            x_cat, x_num, y = x_cat.to(device), x_num.to(device), y.to(device)
            optimizer.zero_grad()
            if scaler:
                # torch.cuda.amp.autocast already implies the CUDA device
                with autocast():
                    outputs = model(x_cat, x_num)
                    loss = criterion(outputs, y)
                scaler.scale(loss).backward()
                scaler.step(optimizer)
                scaler.update()
            else:
                outputs = model(x_cat, x_num)
                loss = criterion(outputs, y)
                loss.backward()
                optimizer.step()
            total_loss += loss.item()

        avg_loss = total_loss / len(dataloader)
        val_predictions = evaluate_model(model, val_data[0])
        val_loss = np.mean((val_predictions - val_data[1]) ** 2)
        scheduler.step(val_loss)
        logging.info(f"Epoch {epoch+1}/{epochs}, Train Loss: {avg_loss:.4f}, Val Loss: {val_loss:.4f}")

        # Checkpoint on improvement; stop after `patience` epochs without one
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            early_stopping_counter = 0
            torch.save(model.state_dict(), 'best_model.pt')
        else:
            early_stopping_counter += 1
            if early_stopping_counter >= patience:
                logging.info(f"Early stopping triggered after {epoch+1} epochs")
                break

def evaluate_model(model, X):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.eval()
    x_cat, x_num = X

    if len(x_cat.shape) == 1:
        x_cat = x_cat.reshape(1, -1)
    if len(x_num.shape) == 1:
        x_num = x_num.reshape(1, -1)

    x_cat = torch.tensor(x_cat, dtype=torch.long).to(device)
    x_num = torch.tensor(x_num, dtype=torch.float32).to(device)

    with torch.no_grad():
        outputs = model(x_cat, x_num)
    return outputs.cpu().numpy()

def objective(trial):
    """Optuna objective; relies on the globals set in the __main__ block below."""
    embedding_dim = trial.suggest_int('embedding_dim', 8, 64)
    hidden_dim = trial.suggest_int('hidden_dim', 32, 256)
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-1, log=True)
    batch_size = trial.suggest_categorical('batch_size', [32, 64, 128, 256])
    weight_decay = trial.suggest_float('weight_decay', 1e-8, 1e-3, log=True)
    dropout_rate = trial.suggest_float('dropout_rate', 0.1, 0.5)

    model = JointEmbeddedModel(categorical_dims, numerical_dim, embedding_dim, hidden_dim, dropout_rate)
    dataloader = create_dataloader(X_train, y_train, batch_size=batch_size)
    train_model(model, dataloader, (X_val, y_val), epochs=10, learning_rate=learning_rate, weight_decay=weight_decay)

    val_predictions = evaluate_model(model, X_val)
    val_loss = np.mean((val_predictions - y_val) ** 2)
    return val_loss

def enhanced_anomaly_detection(model, X, df_subset, eps=0.5, min_samples=5, threshold=None):
    """Cluster the model's embeddings with DBSCAN and flag matches whose actual
    rank difference deviates from the prediction by more than `threshold`.
    Relies on the globals categorical_columns/numeric_columns set in __main__."""
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.eval()
    df_subset = df_subset.copy()  # avoid mutating a view of the caller's DataFrame
    x_cat, x_num = X
    if len(x_cat.shape) == 1:
        x_cat = x_cat.reshape(-1, len(categorical_columns))
    if len(x_num.shape) == 1:
        x_num = x_num.reshape(-1, len(numeric_columns))

    x_cat = torch.tensor(x_cat, dtype=torch.long).to(device)
    x_num = torch.tensor(x_num, dtype=torch.float32).to(device)
    with torch.no_grad():
        embedded = [emb(x_cat[:, i]) for i, emb in enumerate(model.embeddings)]
        embeddings = torch.cat(embedded, dim=1).cpu().numpy()
        outputs = model(x_cat, x_num).cpu().numpy()

    scaler = StandardScaler()
    embeddings = scaler.fit_transform(embeddings)

    dbscan = DBSCAN(eps=eps, min_samples=min_samples)
    labels = dbscan.fit_predict(embeddings)

    df_subset['anomaly'] = labels
    df_subset['expected_rank_difference'] = outputs

    # Default threshold: two standard deviations of the residuals
    if threshold is None:
        threshold = np.std(df_subset['rank_difference'] - df_subset['expected_rank_difference']) * 2

    df_subset['positive_anomaly'] = (df_subset['rank_difference'] - df_subset['expected_rank_difference']) > threshold
    df_subset['negative_anomaly'] = (df_subset['expected_rank_difference'] - df_subset['rank_difference']) > threshold

    anomalies = df_subset[(df_subset['positive_anomaly']) | (df_subset['negative_anomaly'])]

    positive_anomalies = anomalies[anomalies['positive_anomaly']]
    negative_anomalies = anomalies[anomalies['negative_anomaly']]

    logging.info(f"Positive Anomalies: {len(positive_anomalies)}")
    logging.info(f"Negative Anomalies: {len(negative_anomalies)}")

    # Count positive and negative anomalies per player, year, and tournament
    player_positive_anomalies = pd.concat([
        positive_anomalies['winner_name'],
        positive_anomalies['loser_name']
    ]).value_counts()

    player_negative_anomalies = pd.concat([
        negative_anomalies['winner_name'],
        negative_anomalies['loser_name']
    ]).value_counts()

    year_anomalies = anomalies['tourney_date'].dt.year.value_counts()
    # Note: tourney_id was label-encoded in preprocess_data, so these counts are keyed by encoded ids
    tournament_anomalies = anomalies['tourney_id'].value_counts()

    # Save anomaly counts to CSV
    player_positive_anomalies.to_csv('players_with_most_positive_anomalies.csv', header=['positive_anomalies'])
    player_negative_anomalies.to_csv('players_with_most_negative_anomalies.csv', header=['negative_anomalies'])
    year_anomalies.to_csv('years_with_most_anomalies.csv', header=['anomalies'])
    tournament_anomalies.to_csv('tournaments_with_most_anomalies.csv', header=['anomalies'])

    # Plot the DBSCAN clusters in a 2-D t-SNE projection
    plt.figure(figsize=(10, 6))
    reduced_embeddings = TSNE(n_components=2).fit_transform(embeddings)
    plt.scatter(reduced_embeddings[:, 0], reduced_embeddings[:, 1], c=labels, cmap='viridis', alpha=0.7)
    plt.colorbar(label='Cluster labels (DBSCAN marks noise points as -1)')
    plt.title('DBSCAN Clustering of Embeddings for Anomaly Detection')
    plt.xlabel('Component 1')
    plt.ylabel('Component 2')
    plt.savefig('anomaly_detection_plot.png')
    plt.close()

    return anomalies

if __name__ == "__main__":
    try:
        df = load_data()
        df, label_encoders = preprocess_data(df)
        df, numeric_columns = engineer_features(df)

        categorical_columns = ['tourney_id', 'surface', 'winner_id', 'loser_id']
        X_cat = df[categorical_columns].values
        X_num = df[numeric_columns].values
        y = df['rank_difference'].values

        X_cat_train, X_cat_test, X_num_train, X_num_test, y_train, y_test, train_indices, test_indices = train_test_split(
            X_cat, X_num, y, df.index, test_size=TEST_SIZE, random_state=RANDOM_SEED)

        categorical_dims = [len(label_encoders[col].classes_) for col in categorical_columns]
        numerical_dim = len(numeric_columns)

        X_train = (X_cat_train, X_num_train)
        # NB: the validation slice is carved out of the test split, so the final
        # test MSE is not evaluated on fully held-out data.
        X_val = (X_cat_test[:VALIDATION_SIZE], X_num_test[:VALIDATION_SIZE])
        y_val = y_test[:VALIDATION_SIZE]

        study = optuna.create_study(direction='minimize')
        study.optimize(objective, n_trials=20)

        best_params = study.best_params
        logging.info(f"Best Hyperparameters: {best_params}")

        model = JointEmbeddedModel(categorical_dims, numerical_dim, best_params['embedding_dim'],
                                   best_params['hidden_dim'], best_params['dropout_rate'])
        dataloader = create_dataloader(X_train, y_train, batch_size=best_params['batch_size'])
        train_model(model, dataloader, (X_val, y_val), epochs=20, learning_rate=best_params['learning_rate'],
                    weight_decay=best_params['weight_decay'])

        # Restore the checkpoint with the best validation loss before final evaluation
        model.load_state_dict(torch.load('best_model.pt'))
        test_predictions = evaluate_model(model, (X_cat_test, X_num_test))
        test_mse = np.mean((test_predictions - y_test) ** 2)
        logging.info(f"Final Test MSE: {test_mse}")

        anomalies = enhanced_anomaly_detection(model, (X_cat_test, X_num_test), df.loc[test_indices])

        # Save test predictions
        np.save('test_predictions.npy', test_predictions)

        # Save anomalies to CSV
        anomalies.to_csv('anomalies.csv', index=False)
        logging.info("Test predictions and anomalies saved successfully.")

        torch.save(model.state_dict(), 'final_model.pt')
        logging.info("Script execution completed successfully.")
    except Exception as e:
        logging.error(f"An error occurred during script execution: {str(e)}")