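"""Train LSTM fuel price forecasting models per fuel type and price range.

For each (fuel type, price range) pair, the script:
  * merges daily station fuel prices with the nearest Brent crude rate (EUR),
  * classifies stations into low-cost / normal / premium price terciles,
  * trains an LSTM to predict the price at several horizons (target days)
    from a window of recent prices, the scaled Brent rate, and cyclically
    encoded calendar features,
  * saves the best and final weights, the Brent scaler, the hyper-parameters,
    validation metrics (MSE, MAE, R2), and diagnostic plots.
"""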
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.init as init
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import os
import argparse
import json
import matplotlib.pyplot as plt

# Use Apple's MPS backend when available, otherwise fall back to CPU
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(f"Using device: {device}")

def load_brent_data(file_path):
    print(f"Loading Brent data from {file_path}")
    brent_data = pd.read_csv(file_path)
    brent_data['brent_date'] = pd.to_datetime(brent_data['brent_date'])
    # Keep data from 2024 onwards
    brent_data = brent_data[brent_data['brent_date'].dt.year >= 2024]
    brent_data = brent_data.sort_values('brent_date')
    print(f"Brent data loaded, sorted and filtered from 2024 onwards. Shape: {brent_data.shape}")
    return brent_data

def load_fuel_data(folder_path):
    print(f"Loading fuel data from {folder_path}")
    all_data = []
    for filename in os.listdir(folder_path):
        if filename.endswith('.csv'):
            file_path = os.path.join(folder_path, filename)
            df = pd.read_csv(file_path)
            df['rate_date'] = pd.to_datetime(df['rate_date'])
            all_data.append(df)
    fuel_data = pd.concat(all_data, ignore_index=True)
    # Exclude GPLc and E85
    fuel_data = fuel_data[~fuel_data['fuel_name'].isin(['GPLc', 'E85'])]
    # Keep data from 2024 onwards
    fuel_data = fuel_data[fuel_data['rate_date'].dt.year >= 2024]
    print(f"Fuel data loaded and filtered from 2024 onwards. Shape: {fuel_data.shape}")
    return fuel_data

def classify_stations(fuel_data):
    print("Classifying stations by price range")
    station_classifications = {}
    fuel_types = fuel_data['fuel_name'].unique()
    for fuel_type in fuel_types:
        fuel_type_data = fuel_data[fuel_data['fuel_name'] == fuel_type]
        station_avg_prices = fuel_type_data.groupby('id')['price'].mean().reset_index()
        # Split stations into terciles of their average price
        thresholds = np.percentile(station_avg_prices['price'], [33, 66])

        def classify_price(price):
            if price <= thresholds[0]:
                return 'low-cost'
            elif price <= thresholds[1]:
                return 'normal'
            else:
                return 'premium'

        station_classifications[fuel_type] = (
            station_avg_prices.set_index('id')['price'].apply(classify_price).to_dict()
        )
    return station_classifications

def save_station_classifications(station_classifications, output_dir):
    classification_df = pd.DataFrame(station_classifications)
    classification_df.index.name = 'station_id'
    classification_df.reset_index(inplace=True)
    classification_file = os.path.join(output_dir, 'station_classifications.csv')
    classification_df.to_csv(classification_file, index=False)
    print(f"Station classifications saved to {classification_file}")

class FuelPriceDataset(Dataset):
    def __init__(self, data, sequence_length, target_days):
        self.data = data
        self.sequence_length = sequence_length
        self.target_days = target_days
        print(f"Shape of data in FuelPriceDataset: {self.data.shape}")

    def __len__(self):
        return len(self.data) - self.sequence_length - max(self.target_days)

    def __getitem__(self, idx):
        # Input: sequence_length consecutive rows of all feature columns
        x = self.data.iloc[idx:idx + self.sequence_length].values
        # Targets: the price at each requested horizon after the input window
        y = self.data.iloc[idx + self.sequence_length:
                           idx + self.sequence_length + max(self.target_days) + 1]['price'].values
        y = [y[day] for day in self.target_days]
        if idx == 0:  # Print only for the first item
            print("Sample input (X) at index 0:")
            print(x)
            print("Sample output (y) at index 0:")
            print(y)
        return torch.FloatTensor(x), torch.FloatTensor(y)

class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(LSTMModel, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
        # Weight initialisation
        for name, param in self.lstm.named_parameters():
            if 'weight' in name:
                init.xavier_uniform_(param)
            elif 'bias' in name:
                init.constant_(param, 0.0)
        init.xavier_uniform_(self.fc.weight)
        init.constant_(self.fc.bias, 0.0)

    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device)
        out, _ = self.lstm(x, (h0, c0))
        # Predict all horizons from the last time step's hidden state
        out = self.fc(out[:, -1, :])
        return out

def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs,
                patience, output_dir, fuel_type, price_range, scaler):
    train_losses = []
    val_losses = []
    best_val_loss = float('inf')
    epochs_no_improve = 0
    for epoch in range(num_epochs):
        model.train()
        train_loss = 0
        for batch_x, batch_y in train_loader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)
            optimizer.zero_grad()
            outputs = model(batch_x)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for batch_x, batch_y in val_loader:
                batch_x, batch_y = batch_x.to(device), batch_y.to(device)
                outputs = model(batch_x)
                loss = criterion(outputs, batch_y)
                val_loss += loss.item()
        train_loss /= len(train_loader)
        val_loss /= len(val_loader)
        train_losses.append(train_loss)
        val_losses.append(val_loss)
        print(f"Epoch [{epoch+1}/{num_epochs}], Train Loss: {train_loss:.6f}, Val Loss: {val_loss:.6f}")
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            epochs_no_improve = 0
            # Save the best model so far
            torch.save(model.state_dict(), os.path.join(output_dir, f'best_model_{fuel_type}_{price_range}.pth'))
        else:
            epochs_no_improve += 1
            if epochs_no_improve == patience:
                print(f"Early stopping triggered after {epoch + 1} epochs")
                break
    # Reload the best checkpoint before computing the final predictions
    model.load_state_dict(torch.load(
        os.path.join(output_dir, f'best_model_{fuel_type}_{price_range}.pth'),
        map_location=device))
    # Plot predictions and compute validation metrics
    mse, mae, r2 = plot_predictions_vs_actual(model, val_loader, scaler, output_dir, fuel_type, price_range)
    return train_losses, val_losses, mse, mae, r2

def plot_learning_curves(train_losses, val_losses, output_dir, fuel_type, price_range):
    plt.figure(figsize=(10, 6))
    plt.plot(train_losses, label='Train Loss')
    plt.plot(val_losses, label='Validation Loss')
    plt.title(f'Learning Curves - {fuel_type} - {price_range}')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, f'learning_curves_{fuel_type}_{price_range}.png'))
    plt.close()

def plot_predictions_vs_actual(model, val_loader, scaler, output_dir, fuel_type, price_range):
    # Note: 'scaler' only transforms the Brent feature; prices are kept in raw
    # units, so the predictions need no inverse transform here.
    model.eval()
    predictions = []
    actual_values = []
    with torch.no_grad():
        for batch_x, batch_y in val_loader:
            batch_x = batch_x.to(device)
            outputs = model(batch_x)
            predictions.extend(outputs.cpu().numpy())
            actual_values.extend(batch_y.numpy())
    predictions = np.array(predictions)
    actual_values = np.array(actual_values)
    # Scatter plot for the shortest prediction horizon (column 0)
    plt.figure(figsize=(12, 6))
    plt.scatter(actual_values[:, 0], predictions[:, 0], alpha=0.5)
    plt.plot([actual_values[:, 0].min(), actual_values[:, 0].max()],
             [actual_values[:, 0].min(), actual_values[:, 0].max()],
             'r--', lw=2)
    plt.xlabel('Actual values')
    plt.ylabel('Predictions')
    plt.title(f'Predictions vs actual values - {fuel_type} - {price_range}')
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, f'predictions_vs_actual_{fuel_type}_{price_range}.png'))
    plt.close()
    # Metrics on the shortest horizon, cast to plain floats so they are
    # JSON-serialisable downstream
    mse = float(np.mean((predictions[:, 0] - actual_values[:, 0]) ** 2))
    mae = float(np.mean(np.abs(predictions[:, 0] - actual_values[:, 0])))
    r2 = float(1 - (np.sum((actual_values[:, 0] - predictions[:, 0]) ** 2) /
                    np.sum((actual_values[:, 0] - np.mean(actual_values[:, 0])) ** 2)))
    print(f"MSE: {mse:.4f}")
    print(f"MAE: {mae:.4f}")
    print(f"R2 Score: {r2:.4f}")
    return mse, mae, r2

def prepare_data_for_fuel_type_and_range(merged_data, fuel_type, price_range,
                                         station_classifications, sequence_length, target_days):
    print(f"Preparing data for {fuel_type} - {price_range}")
    stations_in_range = [station for station, range_ in station_classifications[fuel_type].items()
                         if range_ == price_range]
    fuel_data = merged_data[(merged_data['fuel_name'] == fuel_type) &
                            (merged_data['id'].isin(stations_in_range))].copy()
    # Calendar features
    fuel_data['day_of_week'] = fuel_data['rate_date'].dt.dayofweek
    fuel_data['month'] = fuel_data['rate_date'].dt.month
    # Cyclical encoding for day of week and month
    fuel_data['day_of_week_sin'] = np.sin(2 * np.pi * fuel_data['day_of_week'] / 7)
    fuel_data['day_of_week_cos'] = np.cos(2 * np.pi * fuel_data['day_of_week'] / 7)
    fuel_data['month_sin'] = np.sin(2 * np.pi * fuel_data['month'] / 12)
    fuel_data['month_cos'] = np.cos(2 * np.pi * fuel_data['month'] / 12)
    # Standardise the Brent price (rather than min-max normalising it)
    scaler = StandardScaler()
    fuel_data['brent_rate_eur_scaled'] = scaler.fit_transform(fuel_data[['brent_rate_eur']])
    # Final feature columns
    columns_to_use = ['price', 'brent_rate_eur_scaled', 'day_of_week_sin', 'day_of_week_cos',
                      'month_sin', 'month_cos']
    fuel_data_prepared = fuel_data[columns_to_use]
    print("Statistics of the prepared data:")
    print(fuel_data_prepared.describe())
    print("\nNumber of unique values per column:")
    print(fuel_data_prepared.nunique())
    print("\nNull value check:")
    print(fuel_data_prepared.isnull().sum())
    # Note: rows from all stations in the range are pooled into one date-sorted
    # series, so a window can span several stations, and the random split below
    # can leak overlapping windows between the train and validation sets.
    dataset = FuelPriceDataset(fuel_data_prepared, sequence_length, target_days)
    train_size = int(0.8 * len(dataset))
    train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, len(dataset) - train_size])
    return train_dataset, val_dataset, scaler

def main(args):
    print("Starting the main process")
    brent_data = load_brent_data(args.brent_data)
    fuel_data = load_fuel_data(args.fuel_data)
    print("Merging Brent and fuel data")
    # Match each fuel quote with the nearest Brent quote by date
    merged_data = pd.merge_asof(fuel_data.sort_values('rate_date'),
                                brent_data.sort_values('brent_date'),
                                left_on='rate_date',
                                right_on='brent_date',
                                direction='nearest')
    print(f"Data merged. Shape: {merged_data.shape}")
    station_classifications = classify_stations(fuel_data)
    save_station_classifications(station_classifications, args.output_dir)
    price_ranges = ['low-cost', 'normal', 'premium']
    fuel_types = merged_data['fuel_name'].unique()
    for fuel_type in fuel_types:
        for price_range in price_ranges:
            print(f"\nProcessing {fuel_type} - {price_range}")
            output_dir = os.path.join(args.output_dir, fuel_type, price_range)
            os.makedirs(output_dir, exist_ok=True)
            try:
                train_dataset, val_dataset, scaler = prepare_data_for_fuel_type_and_range(
                    merged_data, fuel_type, price_range, station_classifications,
                    args.sequence_length, args.target_days
                )
                if len(train_dataset) < args.min_train_samples:
                    print(f"Not enough data for {fuel_type} - {price_range}. Skipped.")
                    continue
                train_loader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True)
                val_loader = DataLoader(val_dataset, batch_size=args.batch_size, shuffle=False)
                print(f"Training dataset size: {len(train_dataset)}")
                print(f"Validation dataset size: {len(val_dataset)}")
                print(f"Number of training batches: {len(train_loader)}")
                print(f"Number of validation batches: {len(val_loader)}")
                # Infer the input size from one batch
                sample_x, _ = next(iter(train_loader))
                input_size = sample_x.shape[2]
                model = LSTMModel(input_size, args.hidden_size, args.num_layers, len(args.target_days)).to(device)
                criterion = nn.MSELoss()
                optimizer = optim.Adam(model.parameters(), lr=args.learning_rate)
                train_losses, val_losses, mse, mae, r2 = train_model(
                    model, train_loader, val_loader, criterion, optimizer,
                    args.num_epochs, args.patience, output_dir, fuel_type, price_range, scaler
                )
                # Save the final model
                model_filename = os.path.join(output_dir, f'final_model_{fuel_type}_{price_range}.pth')
                torch.save(model.state_dict(), model_filename)
                # Save the scaler
                scaler_filename = os.path.join(output_dir, f'scaler_{fuel_type}_{price_range}.pkl')
                pd.to_pickle(scaler, scaler_filename)
                # Save the model hyper-parameters
                params = {
                    'input_size': input_size,
                    'hidden_size': args.hidden_size,
                    'num_layers': args.num_layers,
                    'output_size': len(args.target_days),
                    'sequence_length': args.sequence_length,
                    'target_days': args.target_days
                }
                params_filename = os.path.join(output_dir, f'model_params_{fuel_type}_{price_range}.json')
                with open(params_filename, 'w') as f:
                    json.dump(params, f)
                # Save the metrics (already plain floats, so JSON-serialisable)
                metrics = {
                    'mse': mse,
                    'mae': mae,
                    'r2': r2
                }
                metrics_filename = os.path.join(output_dir, f'metrics_{fuel_type}_{price_range}.json')
                with open(metrics_filename, 'w') as f:
                    json.dump(metrics, f)
                # Plot and save the learning curves
                plot_learning_curves(train_losses, val_losses, output_dir, fuel_type, price_range)
                print(f"Model, parameters, metrics and plots for {fuel_type} - {price_range} saved to {output_dir}")
            except Exception as e:
                print(f"Error while processing {fuel_type} - {price_range}: {str(e)}")
    print("Process finished for all fuel types and price ranges.")

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Train the fuel price prediction models")
    parser.add_argument("--brent_data", type=str, required=True, help="Path to the Brent data file")
    parser.add_argument("--fuel_data", type=str, required=True, help="Path to the folder containing the fuel data")
    parser.add_argument("--output_dir", type=str, default="./output", help="Output folder for models and parameters")
    parser.add_argument("--hidden_size", type=int, default=64, help="LSTM hidden layer size")
    parser.add_argument("--num_layers", type=int, default=2, help="Number of LSTM layers")
    parser.add_argument("--sequence_length", type=int, default=30, help="Input sequence length")
    parser.add_argument("--target_days", nargs='+', type=int, default=[3, 7, 15, 30], help="Target days for prediction")
    parser.add_argument("--batch_size", type=int, default=32, help="Batch size for training")
    parser.add_argument("--num_epochs", type=int, default=50, help="Number of training epochs")
    parser.add_argument("--learning_rate", type=float, default=0.001, help="Learning rate")
    parser.add_argument("--min_train_samples", type=int, default=50, help="Minimum number of training samples")
    parser.add_argument("--patience", type=int, default=5, help="Number of epochs without improvement before early stopping")
    args = parser.parse_args()
    print(f"Arguments received: {args}")
    os.makedirs(args.output_dir, exist_ok=True)
    print(f"Main output folder created/verified: {args.output_dir}")
    main(args)
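
# Example invocation (illustrative paths and script name; the CSVs must contain
# the columns this script expects: brent_date/brent_rate_eur and
# rate_date/id/fuel_name/price):
#
#   python train_fuel_price_model.py \
#       --brent_data data/brent_rates.csv \
#       --fuel_data data/fuel_prices/ \
#       --output_dir ./output
#
# A minimal sketch for reloading a trained model for inference, based on the
# files this script writes ('Gazole' / 'normal' are hypothetical placeholders):
#
#   out_dir = os.path.join('./output', 'Gazole', 'normal')
#   with open(os.path.join(out_dir, 'model_params_Gazole_normal.json')) as f:
#       params = json.load(f)
#   model = LSTMModel(params['input_size'], params['hidden_size'],
#                     params['num_layers'], params['output_size']).to(device)
#   model.load_state_dict(torch.load(
#       os.path.join(out_dir, 'final_model_Gazole_normal.pth'),
#       map_location=device))
#   model.eval()
#   scaler = pd.read_pickle(os.path.join(out_dir, 'scaler_Gazole_normal.pkl'))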