Premchan369 committed on
Commit
96e73d5
·
verified ·
1 Parent(s): 01cc2df

Add market microstructure features: Kyle's lambda, VPIN, Roll measure, OFI, Amihud

Browse files
Files changed (1) hide show
  1. market_microstructure.py +382 -0
market_microstructure.py ADDED
@@ -0,0 +1,382 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Market Microstructure Features
2
+
3
+ Based on Marcos Lopez de Prado and the mlfinlab library.
4
+
5
+ This is what separates retail technical analysis from institutional quant.
6
+ Order flow, liquidity, and market impact contain genuine alpha.
7
+ """
8
+ import numpy as np
9
+ import pandas as pd
10
+ from typing import Dict, List, Optional, Tuple
11
+ import warnings
12
+ warnings.filterwarnings('ignore')
13
+
14
+
15
class MicrostructureFeatures:
    """
    Extract market microstructure features from tick-level data.

    Key insight: The market is not a continuous price stream.
    It is a series of discrete transactions driven by informed vs.
    uninformed traders. Microstructure features detect this.

    All methods are static and operate on pandas Series that are expected
    to share the same index (typically columns of one tick DataFrame).
    """

    @staticmethod
    def bid_ask_spread(bid: pd.Series, ask: pd.Series) -> pd.Series:
        """
        Raw (quoted) bid-ask spread: ``ask - bid``.

        Wider spreads = lower liquidity, higher execution cost.
        """
        return ask - bid

    @staticmethod
    def relative_spread(bid: pd.Series, ask: pd.Series,
                        mid: Optional[pd.Series] = None) -> pd.Series:
        """
        Quoted spread as a fraction of the mid price: ``(ask - bid) / mid``.

        If ``mid`` is not supplied it is computed as ``(bid + ask) / 2``.
        """
        if mid is None:
            mid = (bid + ask) / 2
        return (ask - bid) / mid

    @staticmethod
    def effective_spread(price: pd.Series, bid: pd.Series,
                         ask: pd.Series) -> pd.Series:
        """
        Relative effective spread = 2 * |trade_price - mid_price| / mid_price.

        Measures actual execution cost vs. quoted spread. Note that the
        result is normalised by the mid price (a fraction, not a currency
        amount).
        """
        mid = (bid + ask) / 2
        return 2 * np.abs(price - mid) / mid

    @staticmethod
    def realized_spread(price: pd.Series, bid: pd.Series, ask: pd.Series,
                        future_mid: pd.Series) -> pd.Series:
        """
        Relative realized spread = 2 * |trade_price - future_mid| / mid.

        ``future_mid`` is the mid price at some later horizon, supplied by
        the caller; the result is normalised by the mid at trade time.
        This implementation is unsigned (absolute value), so it measures
        the magnitude of the gap between the trade price and the later mid.
        """
        mid = (bid + ask) / 2
        return 2 * np.abs(price - future_mid) / mid

    @staticmethod
    def price_impact(price: pd.Series, volume: pd.Series,
                     bid: pd.Series, ask: pd.Series,
                     window: int = 100) -> pd.Series:
        """
        Kyle's Lambda — rolling price impact coefficient on mid-quote changes.

        delta_mid = lambda * signed_volume + noise

        Lambda is estimated per window as
        ``Cov(delta_mid, signed_volume) / Var(signed_volume)``.
        Higher lambda = less liquid market, your orders move prices more.

        Args:
            price: Trade prices.
            volume: Trade sizes.
            bid: Best bid quotes.
            ask: Best ask quotes.
            window: Rolling window length in ticks (default 100).

        Returns:
            Series aligned to ``price.index``; NaN until a full window is
            available and wherever the signed-volume variance is zero.
        """
        mid = (bid + ask) / 2
        mid_change = mid.diff()

        # Trades are signed against the PREVIOUS mid: above it is a buy,
        # below it is a sell, equal (or no previous mid yet) is unsigned.
        direction = np.sign(price - mid.shift(1)).fillna(0)
        signed_vol = direction * volume

        # Rolling OLS slope via covariance / variance. Both operands stay
        # pandas Series on the original index so the windows line up.
        cov = mid_change.rolling(window).cov(signed_vol)
        var = signed_vol.rolling(window).var()
        return cov / var.replace(0, np.nan)

    @staticmethod
    def order_flow_imbalance(bid_size: pd.Series, ask_size: pd.Series) -> pd.Series:
        """
        OFI = (bid_size - ask_size) / (bid_size + ask_size).

        Positive = more resting size on the bid than on the ask.
        A small epsilon (1e-10) is added to the denominator to avoid
        division by zero when both sizes are zero.
        """
        return (bid_size - ask_size) / (bid_size + ask_size + 1e-10)

    @staticmethod
    def volume_imbalance(buy_volume: pd.Series, sell_volume: pd.Series) -> pd.Series:
        """
        Volume imbalance = (buy_vol - sell_vol) / (buy_vol + sell_vol).

        The caller is responsible for classifying volume into buys and
        sells (e.g. via tick test or quote test). A small epsilon (1e-10)
        in the denominator avoids division by zero.
        """
        return (buy_volume - sell_volume) / (buy_volume + sell_volume + 1e-10)

    @staticmethod
    def trade_sign_classification(price: pd.Series,
                                  bid: pd.Series,
                                  ask: pd.Series) -> pd.Series:
        """
        Lee-Ready style trade direction classification.

        If trade price > mid -> buy  (+1, aggressor is buyer)
        If trade price < mid -> sell (-1, aggressor is seller)
        If trade price = mid -> use tick test (compare to previous trade)
        If the tick test is also inconclusive (no price change), the most
        recent non-zero sign is carried forward; leading trades with no
        prior information are left as 0.

        Returns:
            Integer Series of +1 / -1 / 0 indexed like ``price``.
        """
        mid = (bid + ask) / 2

        # Quote test: position of the trade relative to the prevailing mid.
        quote_sign = np.sign(price - mid).fillna(0)

        # Tick test: direction of the last price change.
        tick_sign = np.sign(price.diff()).fillna(0)

        # Use tick test where quote test is inconclusive.
        sign = quote_sign.where(quote_sign != 0, tick_sign)

        # Still-unclassified trades are zeros, not NaNs, so they must be
        # masked before forward-filling for the carry-forward to happen.
        sign = sign.replace(0, np.nan).ffill().fillna(0)

        return pd.Series(sign.to_numpy().astype(int), index=price.index)

    @staticmethod
    def amihud_illiquidity(price: pd.Series, volume: pd.Series,
                           window: int = 21) -> pd.Series:
        """
        Amihud illiquidity = rolling mean of |return| / (price * volume),
        scaled by 1e6.

        Higher = less liquid.

        Observations with zero dollar volume are treated as missing, so any
        window containing one yields NaN rather than an infinite value.
        """
        returns = price.pct_change().abs()
        dollar_volume = (price * volume).replace(0, np.nan)

        return (returns / dollar_volume).rolling(window).mean() * 1e6

    @staticmethod
    def kyles_lambda(price: pd.Series, volume: pd.Series,
                     trade_sign: pd.Series, window: int = 100) -> pd.Series:
        """
        Kyle's Lambda — price impact per unit of order flow.

        Lambda = Cov(delta_price, signed_volume) / Var(signed_volume)

        Uses trade-price changes and a caller-supplied trade sign.
        Windows with zero signed-volume variance yield NaN.
        """
        delta_price = price.diff()
        signed_volume = trade_sign * volume

        cov = delta_price.rolling(window).cov(signed_volume)
        var = signed_volume.rolling(window).var()

        return cov / var.replace(0, np.nan)

    @staticmethod
    def vpin_approximation(price: pd.Series, volume: pd.Series,
                           bucket_vol: float = 10000) -> float:
        """
        VPIN — Volume-Synchronized Probability of Informed Trading.

        Simplified approximation using equal-volume buckets: trades are
        classified as buys or sells, grouped into buckets of (roughly)
        ``bucket_vol`` cumulative volume, and VPIN is the mean of
        ``|buy - sell| / (buy + sell)`` across buckets.

        Args:
            price: Trade prices.
            volume: Trade sizes (assumed non-negative).
            bucket_vol: Target volume per bucket; must be positive.

        Returns:
            The VPIN estimate, or NaN when the input is empty, the total
            volume is not finite, or fewer than 10 buckets can be formed.

        Raises:
            ValueError: If ``bucket_vol`` is not positive.
        """
        if bucket_vol <= 0:
            raise ValueError("bucket_vol must be positive")
        if len(price) == 0:
            return np.nan

        # Classify trades: a trade is a buy when its price exceeds the
        # previous tick's 2-tick rolling mean price, otherwise a sell.
        # Comparisons against the NaN warm-up values are False, so the
        # first two ticks are counted as sells.
        mid = price.rolling(2).mean()
        is_buy = (price > mid.shift(1)).to_numpy()

        # Work positionally on plain arrays so the result does not depend
        # on the kind of index the input Series carry.
        vol = volume.to_numpy(dtype=float)
        buy_volume = np.where(is_buy, vol, 0.0)
        sell_volume = np.where(is_buy, 0.0, vol)

        # Create volume buckets
        cumulative = np.cumsum(vol)
        total = cumulative[-1]
        if not np.isfinite(total):
            return np.nan

        n_buckets = int(total / bucket_vol)
        if n_buckets < 10:
            return np.nan

        boundaries = np.linspace(0, total, n_buckets + 1)

        # Bucket i holds ticks with boundaries[i] <= cumulative < boundaries[i+1].
        # The final tick sits exactly on the top boundary, so clip it into
        # the last bucket instead of dropping it.
        bucket_idx = np.searchsorted(boundaries, cumulative, side='right') - 1
        bucket_idx = np.clip(bucket_idx, 0, n_buckets - 1)

        bucket_buy = np.bincount(bucket_idx, weights=buy_volume, minlength=n_buckets)
        bucket_sell = np.bincount(bucket_idx, weights=sell_volume, minlength=n_buckets)
        bucket_volume = bucket_buy + bucket_sell

        # VPIN = average |buy - sell| / volume (epsilon guards empty buckets)
        vpin_values = np.abs(bucket_buy - bucket_sell) / (bucket_volume + 1e-10)

        return float(np.mean(vpin_values))

    @staticmethod
    def roll_measure(price: pd.Series, window: int = 20) -> pd.Series:
        """
        Roll's measure — estimate bid-ask spread from serial covariance.

        Spread = 2 * sqrt(-Cov(delta_price_t, delta_price_{t-1}))

        Only valid when covariance is negative; windows with a
        non-negative covariance are reported as 0.
        """
        delta = price.diff()
        cov = delta.rolling(window).cov(delta.shift(1))

        # Clamp at zero so the square root is defined.
        return 2 * np.sqrt(np.maximum(-cov, 0))

    @staticmethod
    def hasbrouck_lambda(price: pd.Series, volume: pd.Series,
                         window: int = 100) -> pd.Series:
        """
        Hasbrouck's Lambda — information-based price impact (simplified).

        Implemented here as the rolling correlation between returns and
        the previous tick's signed volume, where the trade sign is the
        sign of the trade-price change.
        """
        returns = price.pct_change()
        trade_sign = np.sign(price.diff().fillna(0))
        signed_volume = trade_sign * volume

        return returns.rolling(window).corr(signed_volume.shift(1))
251
+
252
+
253
def compute_all_microstructure_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Compute all microstructure features from a tick DataFrame.

    Required columns: price, volume, bid, ask, bid_size, ask_size

    Args:
        df: Tick-level data containing the required columns.

    Returns:
        DataFrame indexed like ``df`` with one column per feature.
        Infinite values are converted to NaN, NaNs are forward-filled,
        and any remaining NaNs (e.g. rolling warm-up) are set to 0.

    Raises:
        ValueError: If a required column is missing.
    """
    required = ['price', 'volume', 'bid', 'ask', 'bid_size', 'ask_size']
    for col in required:
        if col not in df.columns:
            raise ValueError(f"Missing required column: {col}")

    price, volume = df['price'], df['volume']
    bid, ask = df['bid'], df['ask']

    features = pd.DataFrame(index=df.index)

    # Quoted spread, raw and relative to mid
    features['spread'] = MicrostructureFeatures.bid_ask_spread(bid, ask)
    features['relative_spread'] = MicrostructureFeatures.relative_spread(bid, ask)

    # Effective spread
    features['effective_spread'] = MicrostructureFeatures.effective_spread(
        price, bid, ask
    )

    # Order flow imbalance
    features['ofi'] = MicrostructureFeatures.order_flow_imbalance(
        df['bid_size'], df['ask_size']
    )

    # Trade sign classification
    trade_sign = MicrostructureFeatures.trade_sign_classification(price, bid, ask)
    features['trade_sign'] = trade_sign

    # Signed volume and per-tick buy/sell imbalance. The buy/sell legs are
    # kept as Series on df's index so the result is index-aligned.
    features['signed_volume'] = trade_sign * volume
    features['volume_imbalance'] = MicrostructureFeatures.volume_imbalance(
        volume.where(trade_sign > 0, 0),
        volume.where(trade_sign < 0, 0),
    )

    # Amihud illiquidity (using daily approximation from intraday)
    features['amihud_illiquidity'] = MicrostructureFeatures.amihud_illiquidity(
        price, volume
    )

    # Kyle's lambda
    features['kyle_lambda'] = MicrostructureFeatures.kyles_lambda(
        price, volume, trade_sign
    )

    # Roll's measure
    features['roll_measure'] = MicrostructureFeatures.roll_measure(price)

    # Hasbrouck lambda
    features['hasbrouck_lambda'] = MicrostructureFeatures.hasbrouck_lambda(
        price, volume
    )

    # VPIN is a single scalar for the whole sample, broadcast to every row
    features['vpin'] = MicrostructureFeatures.vpin_approximation(price, volume)

    # .ffill() replaces the deprecated fillna(method='ffill') spelling.
    return features.replace([np.inf, -np.inf], np.nan).ffill().fillna(0)
317
+
318
+
319
def generate_synthetic_tick_data(n_ticks: int = 10000,
                                 base_price: float = 100.0,
                                 volatility: float = 0.001,
                                 spread_bps: float = 1.0,
                                 seed: Optional[int] = 42) -> pd.DataFrame:
    """
    Generate synthetic tick-level data for testing microstructure features.

    Args:
        n_ticks: Number of ticks to generate (must be >= 1).
        base_price: Starting price and mean-reversion level.
        volatility: Per-tick noise scale as a fraction of ``base_price``.
        spread_bps: Quoted bid-ask spread in basis points of price.
        seed: Seed for the private random generator. The default (42)
            reproduces the same data on every call; pass ``None`` for
            non-deterministic output.

    Returns:
        DataFrame indexed by a 1-second ``timestamp`` index with columns
        price, bid, ask, bid_size, ask_size, volume. The trade price sits
        exactly at the midpoint of bid and ask.

    Raises:
        ValueError: If ``n_ticks`` is less than 1.
    """
    if n_ticks < 1:
        raise ValueError("n_ticks must be at least 1")

    # A private legacy RandomState yields the same stream as
    # np.random.seed(seed) would, without clobbering the caller's global
    # NumPy random state.
    rng = np.random.RandomState(seed)

    # Price process: random walk with slight mean reversion
    prices = np.empty(n_ticks)
    prices[0] = base_price
    for i in range(1, n_ticks):
        # Small random walk
        change = rng.randn() * volatility * base_price
        # Mean reversion
        change -= 0.01 * (prices[i - 1] - base_price)
        # Floor keeps the price strictly positive
        prices[i] = max(prices[i - 1] + change, 0.01)

    # Bid-ask spread: spread_bps / 10000 of price, halved per side
    half_spread = prices * spread_bps / 20000
    bid = prices - half_spread
    ask = prices + half_spread

    # Sizes (log-normal: few large orders, many small)
    bid_size = rng.lognormal(8, 1.5, n_ticks).astype(int)
    ask_size = rng.lognormal(8, 1.5, n_ticks).astype(int)

    # Trade volume
    volume = rng.lognormal(6, 1.2, n_ticks).astype(int)

    # Timestamp
    times = pd.date_range('2024-01-01 09:30', periods=n_ticks, freq='1s')

    return pd.DataFrame({
        'timestamp': times,
        'price': prices,
        'bid': bid,
        'ask': ask,
        'bid_size': bid_size,
        'ask_size': ask_size,
        'volume': volume,
    }).set_index('timestamp')
363
+
364
+
365
if __name__ == '__main__':
    # Smoke test: build synthetic ticks, compute every feature, and print
    # a short report.
    ticks = generate_synthetic_tick_data(n_ticks=5000)
    feats = compute_all_microstructure_features(ticks)

    banner = "=" * 60
    print("Market Microstructure Features")
    print(banner)

    n_rows = len(ticks)
    n_cols = len(feats.columns)
    print(f"\nDataset: {n_rows} ticks")
    print(f"Features computed: {n_cols}")

    print("\nFeature Summary:")
    summary = feats.describe()
    print(summary.round(6))

    # VPIN is broadcast to every row, so the first row is representative.
    print("\nVPIN (Volume-Synchronized Probability of Informed Trading):")
    first_vpin = feats['vpin'].iloc[0]
    print(f"  {first_vpin:.4f}")

    print("\nSample Features (last 5 ticks):")
    sample_cols = ['spread', 'relative_spread', 'ofi', 'kyle_lambda',
                   'amihud_illiquidity']
    print(feats[sample_cols].tail().round(6))