!pip install -q ta
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

print("="*70)
print(" PYTORCH GPU SETUP (30GB GPU)")
print("="*70)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if torch.cuda.is_available():
    gpu_name = torch.cuda.get_device_name(0)
    gpu_mem = torch.cuda.get_device_properties(0).total_memory / 1e9

    print(f"✅ GPU: {gpu_name}")
    print(f"✅ GPU Memory: {gpu_mem:.1f} GB")

    # TF32 matmuls/convolutions: faster on Ampere+ GPUs with negligible precision loss
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True
    print("✅ TF32: Enabled (up to ~2-3x speedup on Ampere+)")

    # Let cuDNN auto-tune kernels for the fixed input shapes used here
    torch.backends.cudnn.benchmark = True
    print("✅ cuDNN benchmark: Enabled")

    torch.set_default_device('cuda')
    print("✅ Default device: CUDA")
else:
    print("⚠️ No GPU detected, using CPU")

print(f"\n✅ PyTorch: {torch.__version__}")
print(f"✅ Device: {device}")
print("="*70)
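# (Added check, not in the original notebook.) TF32 only takes effect on Ampere-or-newer
# GPUs (compute capability >= 8.0); a minimal sketch using the same `torch` import as above
# to confirm whether the TF32 flags set in this cell actually apply to the current GPU.
if torch.cuda.is_available():
    major, minor = torch.cuda.get_device_capability(0)
    print(f"Compute capability: {major}.{minor} "
          f"({'TF32 supported' if major >= 8 else 'TF32 not available on this GPU'})")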
import numpy as np
import pandas as pd
import gym
from gym import spaces
from ta.momentum import RSIIndicator, StochasticOscillator, ROCIndicator, WilliamsRIndicator
from ta.trend import MACD, EMAIndicator, SMAIndicator, ADXIndicator, CCIIndicator
from ta.volatility import BollingerBands, AverageTrueRange
from ta.volume import OnBalanceVolumeIndicator
import os

print("="*70)
print(" LOADING MULTI-TIMEFRAME DATA + FEATURES")
print("="*70)


def calculate_indicators(df, suffix=''):
    """Calculate all technical indicators for a given dataframe."""
    data = df.copy()
    s = f'_{suffix}' if suffix else ''

    # Momentum
    data[f'rsi_14{s}'] = RSIIndicator(close=data['close'], window=14).rsi() / 100
    data[f'rsi_7{s}'] = RSIIndicator(close=data['close'], window=7).rsi() / 100

    stoch = StochasticOscillator(high=data['high'], low=data['low'], close=data['close'], window=14)
    data[f'stoch_k{s}'] = stoch.stoch() / 100
    data[f'stoch_d{s}'] = stoch.stoch_signal() / 100

    roc = ROCIndicator(close=data['close'], window=12)
    data[f'roc_12{s}'] = np.tanh(roc.roc() / 100)

    williams = WilliamsRIndicator(high=data['high'], low=data['low'], close=data['close'], lbp=14)
    data[f'williams_r{s}'] = (williams.williams_r() + 100) / 100

    macd = MACD(close=data['close'])
    data[f'macd{s}'] = np.tanh(macd.macd() / data['close'] * 100)
    data[f'macd_signal{s}'] = np.tanh(macd.macd_signal() / data['close'] * 100)
    data[f'macd_diff{s}'] = np.tanh(macd.macd_diff() / data['close'] * 100)

    # Trend
    data[f'sma_20{s}'] = SMAIndicator(close=data['close'], window=20).sma_indicator()
    data[f'sma_50{s}'] = SMAIndicator(close=data['close'], window=50).sma_indicator()
    data[f'ema_12{s}'] = EMAIndicator(close=data['close'], window=12).ema_indicator()
    data[f'ema_26{s}'] = EMAIndicator(close=data['close'], window=26).ema_indicator()

    data[f'price_vs_sma20{s}'] = (data['close'] - data[f'sma_20{s}']) / data[f'sma_20{s}']
    data[f'price_vs_sma50{s}'] = (data['close'] - data[f'sma_50{s}']) / data[f'sma_50{s}']

    adx = ADXIndicator(high=data['high'], low=data['low'], close=data['close'], window=14)
    data[f'adx{s}'] = adx.adx() / 100
    data[f'adx_pos{s}'] = adx.adx_pos() / 100
    data[f'adx_neg{s}'] = adx.adx_neg() / 100

    cci = CCIIndicator(high=data['high'], low=data['low'], close=data['close'], window=20)
    data[f'cci{s}'] = np.tanh(cci.cci() / 100)

    # Volatility
    bb = BollingerBands(close=data['close'], window=20, window_dev=2)
    data[f'bb_width{s}'] = (bb.bollinger_hband() - bb.bollinger_lband()) / bb.bollinger_mavg()
    data[f'bb_position{s}'] = (data['close'] - bb.bollinger_lband()) / (bb.bollinger_hband() - bb.bollinger_lband())

    atr = AverageTrueRange(high=data['high'], low=data['low'], close=data['close'], window=14)
    data[f'atr_percent{s}'] = atr.average_true_range() / data['close']

    # Volume
    data[f'volume_ma_20{s}'] = data['volume'].rolling(20).mean()
    data[f'volume_ratio{s}'] = data['volume'] / (data[f'volume_ma_20{s}'] + 1e-8)

    obv = OnBalanceVolumeIndicator(close=data['close'], volume=data['volume'])
    data[f'obv_slope{s}'] = (obv.on_balance_volume().diff(5) / (obv.on_balance_volume().shift(5).abs() + 1e-8))

    # Returns and price structure
    data[f'returns_1{s}'] = data['close'].pct_change()
    data[f'returns_5{s}'] = data['close'].pct_change(5)
    data[f'returns_20{s}'] = data['close'].pct_change(20)
    data[f'volatility_20{s}'] = data[f'returns_1{s}'].rolling(20).std()

    data[f'body_size{s}'] = abs(data['close'] - data['open']) / (data['open'] + 1e-8)
    data[f'high_20{s}'] = data['high'].rolling(20).max()
    data[f'low_20{s}'] = data['low'].rolling(20).min()
    data[f'price_position{s}'] = (data['close'] - data[f'low_20{s}']) / (data[f'high_20{s}'] - data[f'low_20{s}'] + 1e-8)

    # Drop intermediate columns kept only for building the relative features above
    cols_to_drop = [c for c in [f'sma_20{s}', f'sma_50{s}', f'ema_12{s}', f'ema_26{s}',
                                f'volume_ma_20{s}', f'high_20{s}', f'low_20{s}'] if c in data.columns]
    data = data.drop(columns=cols_to_drop)

    return data


def load_and_clean_btc(filepath):
    """Load and clean BTC OHLCV data from CSV."""
    df = pd.read_csv(filepath)
    column_mapping = {'Open time': 'timestamp', 'Open': 'open', 'High': 'high',
                      'Low': 'low', 'Close': 'close', 'Volume': 'volume'}
    df = df.rename(columns=column_mapping)
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df.set_index('timestamp', inplace=True)
    df = df[['open', 'high', 'low', 'close', 'volume']]

    for col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')

    df = df[df.index >= '2021-01-01']
    df = df[~df.index.duplicated(keep='first')]
    df = df.replace(0, np.nan).dropna().sort_index()
    return df


data_path = '/kaggle/input/bitcoin-historical-datasets-2018-2024/'

print("📊 Loading 15-minute data...")
btc_15m = load_and_clean_btc(data_path + 'btc_15m_data_2018_to_2025.csv')
print(f" ✅ 15m: {len(btc_15m):,} candles")

print("📊 Loading 1-hour data...")
btc_1h = load_and_clean_btc(data_path + 'btc_1h_data_2018_to_2025.csv')
print(f" ✅ 1h: {len(btc_1h):,} candles")

print("📊 Loading 4-hour data...")
btc_4h = load_and_clean_btc(data_path + 'btc_4h_data_2018_to_2025.csv')
print(f" ✅ 4h: {len(btc_4h):,} candles")

# Fear & Greed Index (optional dataset; fall back to a neutral value of 50)
fgi_loaded = False

try:
    fgi_path = '/kaggle/input/btc-usdt-4h-ohlc-fgi-daily-2020/'
    files = os.listdir(fgi_path)

    for filename in files:
        if filename.endswith('.csv'):
            fgi_data = pd.read_csv(fgi_path + filename)

            time_col = [c for c in fgi_data.columns if 'time' in c.lower() or 'date' in c.lower()]
            if time_col:
                fgi_data['timestamp'] = pd.to_datetime(fgi_data[time_col[0]])
            else:
                fgi_data['timestamp'] = pd.to_datetime(fgi_data.iloc[:, 0])

            fgi_data.set_index('timestamp', inplace=True)

            fgi_col = [c for c in fgi_data.columns if 'fgi' in c.lower() or 'fear' in c.lower() or 'greed' in c.lower()]
            if fgi_col:
                fgi_data = fgi_data[[fgi_col[0]]].rename(columns={fgi_col[0]: 'fgi'})
                fgi_loaded = True
                print(f"✅ Fear & Greed loaded: {len(fgi_data):,} values")
                break
except Exception:
    pass

if not fgi_loaded:
    fgi_data = pd.DataFrame(index=btc_15m.index)
    fgi_data['fgi'] = 50
    print("⚠️ Using neutral FGI values")

# Indicators per timeframe
print("\n🔧 Calculating indicators for 15m...")
data_15m = calculate_indicators(btc_15m, suffix='15m')

print("🔧 Calculating indicators for 1h...")
data_1h = calculate_indicators(btc_1h, suffix='1h')

print("🔧 Calculating indicators for 4h...")
data_4h = calculate_indicators(btc_4h, suffix='4h')

# Merge higher timeframes onto the 15m index
print("\n🔗 Merging timeframes...")

cols_1h = [c for c in data_1h.columns if c not in ['open', 'high', 'low', 'close', 'volume']]
cols_4h = [c for c in data_4h.columns if c not in ['open', 'high', 'low', 'close', 'volume']]

data = data_15m.copy()
data = data.join(data_1h[cols_1h], how='left')
data = data.join(data_4h[cols_4h], how='left')

# Higher-timeframe values only change once per 1h/4h candle: forward-fill between updates
for col in cols_1h + cols_4h:
    data[col] = data[col].ffill()

# Fear & Greed features
data = data.join(fgi_data, how='left')
data['fgi'] = data['fgi'].ffill().bfill().fillna(50)

data['fgi_normalized'] = (data['fgi'] - 50) / 50
data['fgi_change'] = data['fgi'].diff() / 50
data['fgi_ma7'] = data['fgi'].rolling(7).mean()
data['fgi_vs_ma'] = (data['fgi'] - data['fgi_ma7']) / 50

# Time-of-day features
data['hour'] = data.index.hour / 24
data['day_of_week'] = data.index.dayofweek / 7
data['us_session'] = ((data.index.hour >= 14) & (data.index.hour < 21)).astype(float)

btc_features = data.dropna()

feature_cols = [col for col in btc_features.columns
                if col not in ['open', 'high', 'low', 'close', 'volume', 'fgi', 'fgi_ma7']]

print(f"\n✅ Multi-timeframe features complete!")
print(f" 15m features: {len([c for c in feature_cols if '15m' in c])}")
print(f" 1h features: {len([c for c in feature_cols if '1h' in c])}")
print(f" 4h features: {len([c for c in feature_cols if '4h' in c])}")
print(f" Other features: {len([c for c in feature_cols if '15m' not in c and '1h' not in c and '4h' not in c])}")
print(f" TOTAL features: {len(feature_cols)}")
print(f" Clean data: {len(btc_features):,} candles")
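# (Added sanity check, not in the original notebook.) The ratio/tanh transforms above can
# still produce infinities if a denominator hits zero, and dropna() only removes NaNs; a
# quick scan of the engineered feature matrix before splitting, using only objects defined
# in this cell.
_feat = btc_features[feature_cols]
print(f"NaNs remaining: {int(_feat.isna().sum().sum()):,}")
print(f"Infs remaining: {int(np.isinf(_feat.to_numpy()).sum()):,}")
print(f"Feature value range: [{_feat.min().min():.2f}, {_feat.max().max():.2f}]")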
print("\n📊 Creating Data Splits...")

# Chronological 70/15/15 split (no shuffling, so validation and test are strictly later in time)
train_size = int(len(btc_features) * 0.70)
valid_size = int(len(btc_features) * 0.15)

train_data = btc_features.iloc[:train_size].copy()
valid_data = btc_features.iloc[train_size:train_size+valid_size].copy()
test_data = btc_features.iloc[train_size+valid_size:].copy()

print(f" Train: {len(train_data):,} | Valid: {len(valid_data):,} | Test: {len(test_data):,}")

full_data = btc_features.copy()
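# (Added sanity check, not in the original notebook.) Confirm the chronological split:
# each segment should end before the next one begins, so validation and test stay
# strictly out-of-sample.
for _name, _split in [('Train', train_data), ('Valid', valid_data), ('Test', test_data)]:
    print(f" {_name}: {_split.index[0]} → {_split.index[-1]}")
assert train_data.index[-1] < valid_data.index[0] < valid_data.index[-1] < test_data.index[0]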
class RollingNormalizer:
    """
    Rolling z-score normalization to prevent look-ahead bias.
    Each value is standardized with the mean and std of a trailing window only.
    """

    def __init__(self, window_size=2880):
        self.window_size = window_size
        self.feature_cols = None

    def fit_transform(self, df, feature_cols):
        """Apply rolling normalization to the dataframe."""
        self.feature_cols = feature_cols
        result = df.copy()

        for col in feature_cols:
            rolling_mean = df[col].rolling(window=self.window_size, min_periods=100).mean()
            rolling_std = df[col].rolling(window=self.window_size, min_periods=100).std()
            result[col] = (df[col] - rolling_mean) / (rolling_std + 1e-8)

        # Clip extreme z-scores
        result[feature_cols] = result[feature_cols].clip(-5, 5)

        # Rows before min_periods have no statistics yet
        result[feature_cols] = result[feature_cols].fillna(0)

        return result


print("✅ RollingNormalizer class defined")
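# (Added illustration, not in the original notebook.) A tiny demo of why the rolling
# z-score avoids look-ahead: each normalized value depends only on its trailing window,
# so perturbing "future" rows leaves earlier rows unchanged. Uses only numpy/pandas and
# the RollingNormalizer defined above.
_demo = pd.DataFrame({'x': np.random.default_rng(0).normal(size=500).cumsum()})
_norm_a = RollingNormalizer(window_size=200).fit_transform(_demo, ['x'])
_demo_changed = _demo.copy()
_demo_changed.loc[400:, 'x'] += 1000.0          # perturb only the "future" rows
_norm_b = RollingNormalizer(window_size=200).fit_transform(_demo_changed, ['x'])
assert np.allclose(_norm_a['x'].iloc[:400], _norm_b['x'].iloc[:400])
print("✅ Normalized rows < 400 are unaffected by changes to rows >= 400 (no look-ahead)")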
class BitcoinTradingEnv(gym.Env):
    """
    Trading environment with:
    - Differential Sharpe Ratio (DSR) reward with warmup
    - Previous action in state (to learn the cost of switching positions)
    - Transaction fee ramping (0 -> 0.1% after warmup)
    - Random flip data augmentation (50% chance to invert the market)
    """

    def __init__(self, df, initial_balance=10000, episode_length=500,
                 base_transaction_fee=0.001,
                 dsr_eta=0.01):
        super().__init__()
        self.df = df.reset_index(drop=True)
        self.initial_balance = initial_balance
        self.episode_length = episode_length
        self.base_transaction_fee = base_transaction_fee
        self.dsr_eta = dsr_eta

        # Fee curriculum: start at 0% and ramp toward base_transaction_fee during training
        self.fee_multiplier = 0.0

        # Training mode enables random flip augmentation
        self.training_mode = True
        self.flip_sign = 1.0

        # DSR needs a warmup period before its running moments are meaningful
        self.dsr_warmup_steps = 100

        self.feature_cols = [col for col in df.columns
                             if col not in ['open', 'high', 'low', 'close', 'volume', 'fgi', 'fgi_ma7']]

        self.action_space = spaces.Box(low=-1, high=1, shape=(1,), dtype=np.float32)

        self.observation_space = spaces.Box(
            low=-10, high=10,
            shape=(len(self.feature_cols) + 6,),
            dtype=np.float32
        )
        self.reset()

    def set_fee_multiplier(self, multiplier):
        """Set fee multiplier (0.0 to 1.0) for fee ramping."""
        self.fee_multiplier = np.clip(multiplier, 0.0, 1.0)

    def set_training_mode(self, training=True):
        """Set training mode (enables random flips for augmentation)."""
        self.training_mode = training

    @property
    def current_fee(self):
        """Current transaction fee based on the ramp multiplier."""
        return self.base_transaction_fee * self.fee_multiplier

    def reset(self):
        max_start = len(self.df) - self.episode_length - 1
        self.start_idx = np.random.randint(100, max(101, max_start))

        self.current_step = 0
        self.balance = self.initial_balance
        self.position = 0.0
        self.entry_price = 0.0
        self.total_value = self.initial_balance
        self.prev_total_value = self.initial_balance
        self.max_value = self.initial_balance

        # Previous action is part of the state so the agent can learn switching costs
        self.prev_action = 0.0

        # Running moments for the Differential Sharpe Ratio
        self.A_t = 0.0
        self.B_t = 0.0

        # Position statistics
        self.long_steps = 0
        self.short_steps = 0
        self.neutral_steps = 0
        self.num_trades = 0

        # Data augmentation: invert direction-sensitive features with 50% probability
        if self.training_mode:
            self.flip_sign = -1.0 if np.random.random() < 0.5 else 1.0
        else:
            self.flip_sign = 1.0

        return self._get_obs()

    def _get_obs(self):
        idx = self.start_idx + self.current_step
        features = self.df.loc[idx, self.feature_cols].values.copy()

        # Flip direction-sensitive features when the episode is "inverted"
        if self.flip_sign < 0:
            for i, col in enumerate(self.feature_cols):
                if any(x in col.lower() for x in ['returns', 'roc', 'macd', 'cci', 'obv', 'sentiment']):
                    features[i] *= self.flip_sign

        total_return = (self.total_value / self.initial_balance) - 1
        drawdown = (self.max_value - self.total_value) / self.max_value if self.max_value > 0 else 0

        market_return = self.df.loc[idx, 'returns_1_15m'] * self.flip_sign

        portfolio_info = np.array([
            self.position,
            total_return,
            drawdown,
            market_return,
            self.df.loc[idx, 'rsi_14_15m'],
            self.prev_action
        ], dtype=np.float32)

        obs = np.concatenate([features, portfolio_info])
        return np.clip(obs, -10, 10).astype(np.float32)

    def _calculate_dsr(self, return_t):
        """
        Calculate the Differential Sharpe Ratio reward:
        DSR = (B_{t-1} * ΔA_t - 0.5 * A_{t-1} * ΔB_t) / (B_{t-1} - A_{t-1}^2)^1.5
        where A tracks an EMA of returns and B an EMA of squared returns.
        """
        eta = self.dsr_eta

        A_prev = self.A_t
        B_prev = self.B_t

        delta_A = eta * (return_t - A_prev)
        delta_B = eta * (return_t**2 - B_prev)

        self.A_t = A_prev + delta_A
        self.B_t = B_prev + delta_B

        variance = B_prev - A_prev**2

        if variance <= 1e-8:
            return return_t

        dsr = (B_prev * delta_A - 0.5 * A_prev * delta_B) / (variance ** 1.5 + 1e-8)
        return np.clip(dsr, -0.5, 0.5)

    def step(self, action):
        idx = self.start_idx + self.current_step
        current_price = self.df.loc[idx, 'close']
        target_position = np.clip(action[0], -1.0, 1.0)

        self.prev_total_value = self.total_value

        # Rebalance only when the target differs meaningfully from the current position
        if abs(target_position - self.position) > 0.1:
            if self.position != 0:
                self._close_position(current_price)
            if abs(target_position) > 0.1:
                self._open_position(target_position, current_price)
                self.num_trades += 1

        self._update_total_value(current_price)
        self.max_value = max(self.max_value, self.total_value)

        # Track position statistics
        if self.position > 0.1:
            self.long_steps += 1
        elif self.position < -0.1:
            self.short_steps += 1
        else:
            self.neutral_steps += 1

        self.current_step += 1
        done = (self.current_step >= self.episode_length) or (self.total_value <= self.initial_balance * 0.5)

        # Per-step portfolio return, normalized by the initial balance
        raw_return = (self.total_value - self.prev_total_value) / self.initial_balance

        # Keep the reward sign consistent with the flipped features during augmentation
        raw_return *= self.flip_sign

        # Small negative reward during DSR warmup, while the running moments stabilize
        if self.current_step < self.dsr_warmup_steps:
            reward = -0.0001
        else:
            reward = self._calculate_dsr(raw_return)

        self.prev_action = target_position

        obs = self._get_obs()
        info = {
            'total_value': self.total_value,
            'position': self.position,
            'long_steps': self.long_steps,
            'short_steps': self.short_steps,
            'neutral_steps': self.neutral_steps,
            'num_trades': self.num_trades,
            'current_fee': self.current_fee,
            'flip_sign': self.flip_sign,
            'raw_return': raw_return,
            'dsr_reward': reward
        }

        return obs, reward, done, info

    def _update_total_value(self, current_price):
        if self.position != 0:
            if self.position > 0:
                pnl = self.position * self.initial_balance * (current_price / self.entry_price - 1)
            else:
                pnl = abs(self.position) * self.initial_balance * (1 - current_price / self.entry_price)
            self.total_value = self.balance + pnl
        else:
            self.total_value = self.balance

    def _open_position(self, size, price):
        self.position = size
        self.entry_price = price
        fee_cost = abs(size) * self.initial_balance * self.current_fee
        self.balance -= fee_cost

    def _close_position(self, price):
        if self.position > 0:
            pnl = self.position * self.initial_balance * (price / self.entry_price - 1)
        else:
            pnl = abs(self.position) * self.initial_balance * (1 - price / self.entry_price)

        fee_cost = abs(pnl) * self.current_fee
        self.balance += pnl - fee_cost
        self.position = 0.0


print("✅ Environment class ready:")
print(" - DSR reward with 100-step warmup")
print(" - Random flip augmentation (50% probability)")
print(" - Previous action in state")
print(" - Transaction fee ramping")
print("="*70)
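# (Added illustration, not in the original notebook.) A standalone sketch of the same DSR
# update used in `_calculate_dsr`, run on a synthetic return stream, to show the scale of
# the per-step reward: steady small gains give small positive DSR values, and a sudden
# loss produces a sharp (clipped) negative spike.
def _dsr_stream(returns, eta=0.01):
    A, B, out = 0.0, 0.0, []
    for r in returns:
        dA, dB = eta * (r - A), eta * (r**2 - B)
        var = B - A**2
        if var <= 1e-8:
            out.append(r)
        else:
            out.append(np.clip((B * dA - 0.5 * A * dB) / (var ** 1.5 + 1e-8), -0.5, 0.5))
        A, B = A + dA, B + dB
    return out

_rets = [0.001] * 50 + [-0.02] + [0.001] * 10   # steady gains, one large loss, recovery
_dsr = _dsr_stream(_rets)
print(f"DSR after steady gains: {_dsr[49]:+.4f} | at the loss: {_dsr[50]:+.4f} | after: {_dsr[55]:+.4f}")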
print("="*70)
print(" LOADING SENTIMENT DATA")
print("="*70)

sentiment_file = '/kaggle/input/bitcoin-news-with-sentimen/bitcoin_news_3hour_intervals_with_sentiment.csv'

try:
    sentiment_raw = pd.read_csv(sentiment_file)

    def parse_time_range(time_str):
        # Keep only the start hour of the 3-hour interval
        parts = str(time_str).split(' ')
        if len(parts) >= 2:
            date = parts[0]
            time_range = parts[1]
            start_time = time_range.split('-')[0]
            return f"{date} {start_time}:00"
        return time_str

    sentiment_raw['timestamp'] = sentiment_raw['time_interval'].apply(parse_time_range)
    sentiment_raw['timestamp'] = pd.to_datetime(sentiment_raw['timestamp'])
    sentiment_raw = sentiment_raw.set_index('timestamp').sort_index()

    sentiment_clean = pd.DataFrame(index=sentiment_raw.index)
    sentiment_clean['prob_bullish'] = pd.to_numeric(sentiment_raw['prob_bullish'], errors='coerce')
    sentiment_clean['prob_bearish'] = pd.to_numeric(sentiment_raw['prob_bearish'], errors='coerce')
    sentiment_clean['prob_neutral'] = pd.to_numeric(sentiment_raw['prob_neutral'], errors='coerce')
    sentiment_clean['confidence'] = pd.to_numeric(sentiment_raw['sentiment_confidence'], errors='coerce')
    sentiment_clean = sentiment_clean.dropna()

    # Join onto each split and forward-fill the 3-hour sentiment down to 15-minute candles
    for df in [train_data, valid_data, test_data]:
        df_temp = df.join(sentiment_clean, how='left')
        for col in ['prob_bullish', 'prob_bearish', 'prob_neutral', 'confidence']:
            df[col] = df_temp[col].ffill().bfill().fillna(0.33 if col != 'confidence' else 0.5)

        df['sentiment_net'] = df['prob_bullish'] - df['prob_bearish']
        df['sentiment_strength'] = (df['prob_bullish'] - df['prob_bearish']).abs()
        df['sentiment_weighted'] = df['sentiment_net'] * df['confidence']

    print(f"✅ Sentiment loaded: {len(sentiment_clean):,} records")
    print(f"✅ Features added: 7 sentiment features")

except Exception as e:
    print(f"⚠️ Sentiment not loaded: {e}")
    for df in [train_data, valid_data, test_data]:
        df['sentiment_net'] = 0
        df['sentiment_strength'] = 0
        df['sentiment_weighted'] = 0

print("="*70)
print("="*70)
print(" ROLLING NORMALIZATION + CREATING ENVIRONMENTS")
print("="*70)

# Recompute feature columns (the sentiment columns were added after the first pass)
feature_cols = [col for col in train_data.columns
                if col not in ['open', 'high', 'low', 'close', 'volume', 'fgi', 'fgi_ma7']]

print(f"📊 Total features: {len(feature_cols)}")

# 2880 x 15-minute candles = 30 days of history per rolling window
rolling_normalizer = RollingNormalizer(window_size=2880)

print("🔄 Applying rolling normalization (window=2880)...")

# Each split is normalized independently, using statistics from its own past window only
train_data_norm = rolling_normalizer.fit_transform(train_data, feature_cols)
valid_data_norm = rolling_normalizer.fit_transform(valid_data, feature_cols)
test_data_norm = rolling_normalizer.fit_transform(test_data, feature_cols)

print("✅ Rolling normalization complete (no look-ahead bias!)")

train_env = BitcoinTradingEnv(train_data_norm, episode_length=500)
valid_env = BitcoinTradingEnv(valid_data_norm, episode_length=500)
test_env = BitcoinTradingEnv(test_data_norm, episode_length=500)

state_dim = train_env.observation_space.shape[0]
action_dim = 1

print(f"\n✅ Environments created:")
print(f" State dim: {state_dim} (features={len(feature_cols)} + portfolio=6)")
print(f" Action dim: {action_dim}")
print(f" Train samples: {len(train_data):,}")
print(f" Fee starts at: 0% (ramps to 0.1% after warmup)")
print("="*70)
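# (Added smoke test, not in the original notebook.) One reset/step round-trip on the
# training environment, using only objects defined above, to confirm the observation
# matches the declared space and a step executes end-to-end before the long run.
_obs = train_env.reset()
print(f"obs shape: {_obs.shape} (expected {train_env.observation_space.shape})")
_obs2, _r, _d, _info = train_env.step(np.array([0.5], dtype=np.float32))
print(f"reward: {_r:+.6f} | position: {_info['position']} | fee: {_info['current_fee']}")
assert _obs.shape == train_env.observation_space.shape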
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Normal

print("="*70)
print(" PYTORCH SAC AGENT")
print("="*70)


class Actor(nn.Module):
    """Gaussian policy with tanh squashing (outputs a position in [-1, 1])."""

    def __init__(self, state_dim, action_dim, hidden_dim=512):
        super().__init__()

        self.fc1 = nn.Linear(state_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, hidden_dim // 2)

        self.mean = nn.Linear(hidden_dim // 2, action_dim)
        self.log_std = nn.Linear(hidden_dim // 2, action_dim)

        self.LOG_STD_MIN = -20
        self.LOG_STD_MAX = 2

    def forward(self, state):
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))

        mean = self.mean(x)
        log_std = self.log_std(x)
        log_std = torch.clamp(log_std, self.LOG_STD_MIN, self.LOG_STD_MAX)

        return mean, log_std

    def sample(self, state):
        mean, log_std = self.forward(state)
        std = log_std.exp()

        normal = Normal(mean, std)
        x_t = normal.rsample()          # reparameterized sample
        action = torch.tanh(x_t)

        # Log-probability with the tanh change-of-variables correction
        log_prob = normal.log_prob(x_t)
        log_prob -= torch.log(1 - action.pow(2) + 1e-6)
        log_prob = log_prob.sum(dim=-1, keepdim=True)

        return action, log_prob, mean


class Critic(nn.Module):
    """Twin Q-networks (clipped double-Q trick)."""

    def __init__(self, state_dim, action_dim, hidden_dim=512):
        super().__init__()

        # Q1 network
        self.fc1_1 = nn.Linear(state_dim + action_dim, hidden_dim)
        self.fc1_2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc1_3 = nn.Linear(hidden_dim, hidden_dim // 2)
        self.fc1_out = nn.Linear(hidden_dim // 2, 1)

        # Q2 network
        self.fc2_1 = nn.Linear(state_dim + action_dim, hidden_dim)
        self.fc2_2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc2_3 = nn.Linear(hidden_dim, hidden_dim // 2)
        self.fc2_out = nn.Linear(hidden_dim // 2, 1)

    def forward(self, state, action):
        x = torch.cat([state, action], dim=-1)

        q1 = F.relu(self.fc1_1(x))
        q1 = F.relu(self.fc1_2(q1))
        q1 = F.relu(self.fc1_3(q1))
        q1 = self.fc1_out(q1)

        q2 = F.relu(self.fc2_1(x))
        q2 = F.relu(self.fc2_2(q2))
        q2 = F.relu(self.fc2_3(q2))
        q2 = self.fc2_out(q2)

        return q1, q2

    def q1(self, state, action):
        x = torch.cat([state, action], dim=-1)
        q1 = F.relu(self.fc1_1(x))
        q1 = F.relu(self.fc1_2(q1))
        q1 = F.relu(self.fc1_3(q1))
        return self.fc1_out(q1)


class SACAgent:
    def __init__(self, state_dim, action_dim, device,
                 actor_lr=3e-4, critic_lr=3e-4, alpha_lr=3e-4,
                 gamma=0.99, tau=0.005, initial_alpha=0.2):

        self.device = device
        self.gamma = gamma
        self.tau = tau
        self.action_dim = action_dim

        # Networks
        self.actor = Actor(state_dim, action_dim).to(device)
        self.critic = Critic(state_dim, action_dim).to(device)
        self.critic_target = Critic(state_dim, action_dim).to(device)
        self.critic_target.load_state_dict(self.critic.state_dict())

        # Optimizers
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=actor_lr)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=critic_lr)

        # Automatic entropy temperature tuning (float32 so it matches the network dtype)
        self.target_entropy = -action_dim
        self.log_alpha = torch.tensor(np.log(initial_alpha), dtype=torch.float32,
                                      requires_grad=True, device=device)
        self.alpha_optimizer = optim.Adam([self.log_alpha], lr=alpha_lr)

    @property
    def alpha(self):
        return self.log_alpha.exp()

    def select_action(self, state, deterministic=False):
        with torch.no_grad():
            # as_tensor with an explicit device avoids the CPU-only legacy FloatTensor constructor
            state = torch.as_tensor(state, dtype=torch.float32, device=self.device).unsqueeze(0)
            if deterministic:
                mean, _ = self.actor(state)
                action = torch.tanh(mean)
            else:
                action, _, _ = self.actor.sample(state)
        return action.cpu().numpy()[0]

    def update(self, batch):
        states, actions, rewards, next_states, dones = batch

        # The buffer already stores rewards/dones with shape (batch, 1),
        # so no extra unsqueeze is needed here
        states = torch.as_tensor(states, dtype=torch.float32, device=self.device)
        actions = torch.as_tensor(actions, dtype=torch.float32, device=self.device)
        rewards = torch.as_tensor(rewards, dtype=torch.float32, device=self.device)
        next_states = torch.as_tensor(next_states, dtype=torch.float32, device=self.device)
        dones = torch.as_tensor(dones, dtype=torch.float32, device=self.device)

        # Critic update: soft Bellman backup with clipped double-Q target
        with torch.no_grad():
            next_actions, next_log_probs, _ = self.actor.sample(next_states)
            q1_target, q2_target = self.critic_target(next_states, next_actions)
            q_target = torch.min(q1_target, q2_target)
            target_q = rewards + (1 - dones) * self.gamma * (q_target - self.alpha * next_log_probs)

        q1, q2 = self.critic(states, actions)
        critic_loss = F.mse_loss(q1, target_q) + F.mse_loss(q2, target_q)

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # Actor update: maximize entropy-regularized Q
        new_actions, log_probs, _ = self.actor.sample(states)
        q1_new, q2_new = self.critic(states, new_actions)
        q_new = torch.min(q1_new, q2_new)
        actor_loss = (self.alpha * log_probs - q_new).mean()

        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # Temperature update toward the target entropy
        alpha_loss = -(self.log_alpha * (log_probs.detach() + self.target_entropy)).mean()

        self.alpha_optimizer.zero_grad()
        alpha_loss.backward()
        self.alpha_optimizer.step()

        # Polyak averaging of the target critic
        for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()):
            target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)

        return {
            'critic_loss': critic_loss.item(),
            'actor_loss': actor_loss.item(),
            'alpha': self.alpha.item()
        }


print("✅ Actor: 512→512→256→1")
print("✅ Critic: Twin Q (512→512→256→1)")
print("✅ SAC Agent with auto-tuning alpha")
print("="*70)
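# (Added smoke test, not in the original notebook.) A throwaway agent updated on one tiny
# random batch, just to confirm the networks, losses, and device placement work before the
# long run. `state_dim`, `action_dim`, and `device` come from the cells above.
_tmp_agent = SACAgent(state_dim=state_dim, action_dim=action_dim, device=device)
_B = 8
_fake_batch = (
    np.random.randn(_B, state_dim).astype(np.float32),                 # states
    np.random.uniform(-1, 1, (_B, action_dim)).astype(np.float32),     # actions
    np.random.randn(_B, 1).astype(np.float32) * 0.01,                  # rewards, shape (B, 1)
    np.random.randn(_B, state_dim).astype(np.float32),                 # next states
    np.zeros((_B, 1), dtype=np.float32),                               # dones, shape (B, 1)
)
print("Smoke-test update:", _tmp_agent.update(_fake_batch))
del _tmp_agent, _fake_batch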
print("="*70)
print(" REPLAY BUFFER")
print("="*70)

class ReplayBuffer:
    """Flat numpy ring buffer; rewards and dones are stored with shape (N, 1)."""

    def __init__(self, state_dim, action_dim, max_size=1_000_000):
        self.max_size = max_size
        self.ptr = 0
        self.size = 0

        self.states = np.zeros((max_size, state_dim), dtype=np.float32)
        self.actions = np.zeros((max_size, action_dim), dtype=np.float32)
        self.rewards = np.zeros((max_size, 1), dtype=np.float32)
        self.next_states = np.zeros((max_size, state_dim), dtype=np.float32)
        self.dones = np.zeros((max_size, 1), dtype=np.float32)

        mem_gb = (self.states.nbytes + self.actions.nbytes + self.rewards.nbytes +
                  self.next_states.nbytes + self.dones.nbytes) / 1e9
        print(f"📦 Buffer capacity: {max_size:,} | Memory: {mem_gb:.2f} GB")

    def add(self, state, action, reward, next_state, done):
        self.states[self.ptr] = state
        self.actions[self.ptr] = action
        self.rewards[self.ptr] = reward
        self.next_states[self.ptr] = next_state
        self.dones[self.ptr] = done

        self.ptr = (self.ptr + 1) % self.max_size
        self.size = min(self.size + 1, self.max_size)

    def sample(self, batch_size):
        idx = np.random.randint(0, self.size, size=batch_size)
        return (
            self.states[idx],
            self.actions[idx],
            self.rewards[idx],
            self.next_states[idx],
            self.dones[idx]
        )

print("✅ ReplayBuffer defined")
print("="*70)
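# (Added shape check, not in the original notebook.) A tiny buffer on synthetic data to
# show the exact shapes `SACAgent.update` receives from `sample()`: states (B, state_dim),
# actions (B, action_dim), rewards and dones (B, 1).
_tiny = ReplayBuffer(state_dim=4, action_dim=1, max_size=16)
for _ in range(5):
    _tiny.add(np.zeros(4), np.zeros(1), 0.0, np.zeros(4), 0.0)
for _name, _arr in zip(['states', 'actions', 'rewards', 'next_states', 'dones'], _tiny.sample(3)):
    print(f" {_name}: {_arr.shape}")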
print("="*70)
print(" CREATING AGENT + BUFFER")
print("="*70)

agent = SACAgent(
    state_dim=state_dim,
    action_dim=action_dim,
    device=device,
    actor_lr=3e-4,
    critic_lr=3e-4,
    alpha_lr=3e-4,
    gamma=0.99,
    tau=0.005,
    initial_alpha=0.2
)

buffer = ReplayBuffer(
    state_dim=state_dim,
    action_dim=action_dim,
    max_size=1_000_000
)

total_params = sum(p.numel() for p in agent.actor.parameters()) + \
               sum(p.numel() for p in agent.critic.parameters())

print(f"\n✅ Agent created on {device}")
print(f" Actor params: {sum(p.numel() for p in agent.actor.parameters()):,}")
print(f" Critic params: {sum(p.numel() for p in agent.critic.parameters()):,}")
print(f" Total params: {total_params:,}")
print("="*70)
from tqdm.notebook import tqdm
import time

print("="*70)
print(" TRAINING FUNCTION")
print("="*70)

def train_sac(agent, env, valid_env, buffer,
              total_timesteps=700_000,
              warmup_steps=10_000,
              batch_size=1024,
              update_freq=1,
              fee_warmup_steps=100_000,
              fee_ramp_steps=100_000,
              save_path="sac_v9"):

    print(f"\n🚀 Training Configuration:")
    print(f" Total steps: {total_timesteps:,}")
    print(f" Warmup: {warmup_steps:,}")
    print(f" Batch size: {batch_size}")
    print(f" Fee warmup: {fee_warmup_steps:,} steps (then ramp over {fee_ramp_steps:,})")
    print(f" Data augmentation: Random flips (50% probability)")
    print(f" DSR warmup: 100 steps per episode (small negative reward)")
    print(f" Device: {agent.device}")

    # Augmentation only during training; evaluation uses the un-flipped market
    env.set_training_mode(True)
    valid_env.set_training_mode(False)

    # Bookkeeping
    episode_rewards = []
    episode_lengths = []
    eval_rewards = []
    best_reward = -np.inf
    best_eval = -np.inf

    critic_losses = []
    actor_losses = []

    state = env.reset()
    episode_reward = 0
    episode_length = 0
    episode_count = 0

    start_time = time.time()

    pbar = tqdm(range(total_timesteps), desc="Training")

    for step in pbar:

        # Fee curriculum: free trading during warmup, then a linear ramp to the full fee
        if step < fee_warmup_steps:
            fee_multiplier = 0.0
        else:
            progress = (step - fee_warmup_steps) / fee_ramp_steps
            fee_multiplier = min(1.0, progress)

        env.set_fee_multiplier(fee_multiplier)
        valid_env.set_fee_multiplier(fee_multiplier)

        # Random actions during warmup to pre-fill the replay buffer
        if step < warmup_steps:
            action = env.action_space.sample()
        else:
            action = agent.select_action(state, deterministic=False)

        next_state, reward, done, info = env.step(action)

        buffer.add(state, action, reward, next_state, float(done))

        state = next_state
        episode_reward += reward
        episode_length += 1

        # Gradient updates once the buffer has enough data
        stats = None
        if step >= warmup_steps and step % update_freq == 0:
            batch = buffer.sample(batch_size)
            stats = agent.update(batch)
            critic_losses.append(stats['critic_loss'])
            actor_losses.append(stats['actor_loss'])

        # End of episode: log, evaluate, checkpoint, reset
        if done:
            episode_rewards.append(episode_reward)
            episode_lengths.append(episode_length)
            episode_count += 1

            final_value = info.get('total_value', 10000)
            pnl_pct = (final_value / 10000 - 1) * 100
            num_trades = info.get('num_trades', 0)
            current_fee = info.get('current_fee', 0) * 100

            long_steps = info.get('long_steps', 0)
            short_steps = info.get('short_steps', 0)
            neutral_steps = info.get('neutral_steps', 0)
            total_active = long_steps + short_steps
            long_pct = (long_steps / total_active * 100) if total_active > 0 else 0
            short_pct = (short_steps / total_active * 100) if total_active > 0 else 0

            avg_reward = np.mean(episode_rewards[-10:]) if len(episode_rewards) >= 10 else episode_reward
            avg_critic = np.mean(critic_losses[-100:]) if critic_losses else 0

            pbar.set_postfix({
                'ep': episode_count,
                'R': f'{episode_reward:.4f}',
                'avg10': f'{avg_reward:.4f}',
                'PnL%': f'{pnl_pct:+.2f}',
                'L/S': f'{long_pct:.0f}/{short_pct:.0f}',
                'fee%': f'{current_fee:.3f}',
                'α': f'{agent.alpha.item():.3f}',
            })

            # Deterministic evaluation on the validation environment
            eval_reward, eval_pnl, eval_long_pct = evaluate_agent(agent, valid_env, n_episodes=1)
            eval_rewards.append(eval_reward)

            elapsed = time.time() - start_time
            steps_per_sec = (step + 1) / elapsed

            print(f"\n{'='*60}")
            print(f"📊 Episode {episode_count} Complete | Step {step+1:,}/{total_timesteps:,}")
            print(f"{'='*60}")
            print(f" 🎮 TRAIN:")
            print(f" Reward (DSR): {episode_reward:.4f} | PnL: {pnl_pct:+.2f}%")
            print(f" Length: {episode_length} steps | Trades: {num_trades}")
            print(f" Avg (last 10): {avg_reward:.4f}")
            print(f" 📊 POSITION BALANCE:")
            print(f" Long: {long_steps} steps ({long_pct:.1f}%)")
            print(f" Short: {short_steps} steps ({short_pct:.1f}%)")
            print(f" Neutral: {neutral_steps} steps")
            print(f" 💰 FEE CURRICULUM:")
            print(f" Current fee: {current_fee:.4f}% (multiplier: {fee_multiplier:.2f})")
            print(f" 📈 EVAL (validation):")
            print(f" Reward: {eval_reward:.4f} | PnL: {eval_pnl:+.2f}%")
            print(f" Long%: {eval_long_pct:.1f}%")
            print(f" Avg (last 5): {np.mean(eval_rewards[-5:]):.4f}")
            print(f" 🧠 AGENT:")
            print(f" Alpha: {agent.alpha.item():.4f}")
            print(f" Critic loss: {avg_critic:.5f}")
            print(f" ⚡ Speed: {steps_per_sec:.0f} steps/sec")
            print(f" 💾 Buffer: {buffer.size:,} transitions")

            # Checkpoint on best training episode
            if episode_reward > best_reward:
                best_reward = episode_reward
                torch.save({
                    'actor': agent.actor.state_dict(),
                    'critic': agent.critic.state_dict(),
                    'critic_target': agent.critic_target.state_dict(),
                    'log_alpha': agent.log_alpha,
                }, f"{save_path}_best_train.pt")
                print(f" 🏆 NEW BEST TRAIN: {best_reward:.4f}")

            # Checkpoint on best validation episode
            if eval_reward > best_eval:
                best_eval = eval_reward
                torch.save({
                    'actor': agent.actor.state_dict(),
                    'critic': agent.critic.state_dict(),
                    'critic_target': agent.critic_target.state_dict(),
                    'log_alpha': agent.log_alpha,
                }, f"{save_path}_best_eval.pt")
                print(f" 🏆 NEW BEST EVAL: {best_eval:.4f}")

            state = env.reset()
            episode_reward = 0
            episode_length = 0

    # Final checkpoint
    torch.save({
        'actor': agent.actor.state_dict(),
        'critic': agent.critic.state_dict(),
        'critic_target': agent.critic_target.state_dict(),
        'log_alpha': agent.log_alpha,
    }, f"{save_path}_final.pt")

    total_time = time.time() - start_time
    print(f"\n{'='*70}")
    print(f" TRAINING COMPLETE")
    print(f"{'='*70}")
    print(f" Total time: {total_time/60:.1f} min")
    print(f" Episodes: {episode_count}")
    print(f" Best train reward (DSR): {best_reward:.4f}")
    print(f" Best eval reward (DSR): {best_eval:.4f}")
    print(f" Avg speed: {total_timesteps/total_time:.0f} steps/sec")

    return episode_rewards, eval_rewards


def evaluate_agent(agent, env, n_episodes=1):
    """Run deterministic evaluation episodes; return average reward, PnL%, and long%."""
    total_reward = 0
    total_pnl = 0
    total_long_pct = 0

    for _ in range(n_episodes):
        state = env.reset()
        episode_reward = 0
        done = False

        while not done:
            action = agent.select_action(state, deterministic=True)
            state, reward, done, info = env.step(action)
            episode_reward += reward

        total_reward += episode_reward
        final_value = info.get('total_value', 10000)
        total_pnl += (final_value / 10000 - 1) * 100

        long_steps = info.get('long_steps', 0)
        short_steps = info.get('short_steps', 0)
        total_active = long_steps + short_steps
        total_long_pct += (long_steps / total_active * 100) if total_active > 0 else 0

    return total_reward / n_episodes, total_pnl / n_episodes, total_long_pct / n_episodes


print("✅ Training function ready:")
print(" - Per-episode eval + position tracking")
print(" - DSR reward (risk-adjusted)")
print(" - Fee ramping: 0% → 0.1% after 100k steps")
print(" - Model checkpointing")
print("="*70)
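# (Added sketch, not in the original notebook.) How a saved checkpoint can be restored
# later, given the dict keys used by `train_sac` above ('actor', 'critic', 'critic_target',
# 'log_alpha'). The file name in the commented example is illustrative.
def load_sac_checkpoint(agent, path):
    ckpt = torch.load(path, map_location=agent.device)
    agent.actor.load_state_dict(ckpt['actor'])
    agent.critic.load_state_dict(ckpt['critic'])
    agent.critic_target.load_state_dict(ckpt['critic_target'])
    with torch.no_grad():
        agent.log_alpha.copy_(ckpt['log_alpha'])
    return agent

# Example (after training has produced the file):
# agent = load_sac_checkpoint(agent, "sac_v9_pytorch_best_eval.pt")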
print("="*70)
print(" STARTING SAC TRAINING")
print("="*70)

TOTAL_STEPS = 500_000
WARMUP_STEPS = 10_000
BATCH_SIZE = 256
UPDATE_FREQ = 1
FEE_WARMUP = 100_000
FEE_RAMP = 100_000

print(f"\n📋 Configuration:")
print(f" Steps: {TOTAL_STEPS:,}")
print(f" Batch: {BATCH_SIZE}")
print(f" Train env: {len(train_data):,} candles")
print(f" Valid env: {len(valid_data):,} candles")
print(f" Device: {device}")
print(f"\n💰 Fee Curriculum:")
print(f" Steps 0-{FEE_WARMUP:,}: 0% fee (learn basic trading)")
print(f" Steps {FEE_WARMUP:,}-{FEE_WARMUP+FEE_RAMP:,}: Ramp 0%→0.1%")
print(f" Steps {FEE_WARMUP+FEE_RAMP:,}+: Full 0.1% fee")
print(f"\n🎯 Reward: Differential Sharpe Ratio (DSR)")
print(f" - Risk-adjusted returns (not just PnL)")
print(f" - Small values (-0.5 to 0.5) are normal")
print(f" - NOT normalized further")

episode_rewards, eval_rewards = train_sac(
    agent=agent,
    env=train_env,
    valid_env=valid_env,
    buffer=buffer,
    total_timesteps=TOTAL_STEPS,
    warmup_steps=WARMUP_STEPS,
    batch_size=BATCH_SIZE,
    update_freq=UPDATE_FREQ,
    fee_warmup_steps=FEE_WARMUP,
    fee_ramp_steps=FEE_RAMP,
    save_path="sac_v9_pytorch"
)

print("\n" + "="*70)
print(" TRAINING COMPLETE")
print("="*70)
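# (Added visualization, not in the original notebook.) A quick look at the learning curves
# returned by train_sac; assumes matplotlib is available in the Kaggle environment.
import matplotlib.pyplot as plt

fig, axes = plt.subplots(1, 2, figsize=(12, 4))
axes[0].plot(episode_rewards)
axes[0].set_title('Train episode reward (DSR)')
axes[0].set_xlabel('Episode')
axes[1].plot(eval_rewards)
axes[1].set_title('Validation episode reward (DSR)')
axes[1].set_xlabel('Evaluation')
plt.tight_layout()
plt.show()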