Oliver Le committed
Commit d03866e · 0 Parent(s)

Initial commit

This view is limited to 50 files because it contains too many changes; see the raw diff for the full change set.
Files changed (50)
  1. .gitattributes +35 -0
  2. .gitignore +1 -0
  3. HP_list.py +283 -0
  4. README.md +13 -0
  5. app.py +237 -0
  6. evaluation/.DS_Store +0 -0
  7. evaluation/__init__.py +1 -0
  8. evaluation/affiliation/__init__.py +1 -0
  9. evaluation/affiliation/__pycache__/__init__.cpython-310.pyc +0 -0
  10. evaluation/affiliation/__pycache__/__init__.cpython-311.pyc +0 -0
  11. evaluation/affiliation/__pycache__/__init__.cpython-38.pyc +0 -0
  12. evaluation/affiliation/__pycache__/__init__.cpython-39.pyc +0 -0
  13. evaluation/affiliation/__pycache__/_affiliation_zone.cpython-310.pyc +0 -0
  14. evaluation/affiliation/__pycache__/_affiliation_zone.cpython-311.pyc +0 -0
  15. evaluation/affiliation/__pycache__/_affiliation_zone.cpython-38.pyc +0 -0
  16. evaluation/affiliation/__pycache__/_affiliation_zone.cpython-39.pyc +0 -0
  17. evaluation/affiliation/__pycache__/_integral_interval.cpython-310.pyc +0 -0
  18. evaluation/affiliation/__pycache__/_integral_interval.cpython-311.pyc +0 -0
  19. evaluation/affiliation/__pycache__/_integral_interval.cpython-38.pyc +0 -0
  20. evaluation/affiliation/__pycache__/_integral_interval.cpython-39.pyc +0 -0
  21. evaluation/affiliation/__pycache__/_single_ground_truth_event.cpython-310.pyc +0 -0
  22. evaluation/affiliation/__pycache__/_single_ground_truth_event.cpython-311.pyc +0 -0
  23. evaluation/affiliation/__pycache__/_single_ground_truth_event.cpython-38.pyc +0 -0
  24. evaluation/affiliation/__pycache__/_single_ground_truth_event.cpython-39.pyc +0 -0
  25. evaluation/affiliation/__pycache__/generics.cpython-310.pyc +0 -0
  26. evaluation/affiliation/__pycache__/generics.cpython-311.pyc +0 -0
  27. evaluation/affiliation/__pycache__/generics.cpython-38.pyc +0 -0
  28. evaluation/affiliation/__pycache__/generics.cpython-39.pyc +0 -0
  29. evaluation/affiliation/__pycache__/metrics.cpython-310.pyc +0 -0
  30. evaluation/affiliation/__pycache__/metrics.cpython-311.pyc +0 -0
  31. evaluation/affiliation/__pycache__/metrics.cpython-38.pyc +0 -0
  32. evaluation/affiliation/__pycache__/metrics.cpython-39.pyc +0 -0
  33. evaluation/affiliation/_affiliation_zone.py +86 -0
  34. evaluation/affiliation/_integral_interval.py +464 -0
  35. evaluation/affiliation/_single_ground_truth_event.py +68 -0
  36. evaluation/affiliation/generics.py +135 -0
  37. evaluation/affiliation/metrics.py +116 -0
  38. evaluation/basic_metrics.py +0 -0
  39. evaluation/metrics.py +379 -0
  40. evaluation/visualize.py +99 -0
  41. model_wrapper.py +532 -0
  42. models/.DS_Store +0 -0
  43. models/AE.py +407 -0
  44. models/CBLOF.py +332 -0
  45. models/CNN.py +273 -0
  46. models/COF.py +211 -0
  47. models/COPOD.py +205 -0
  48. models/Chronos.py +94 -0
  49. models/DADA.py +141 -0
  50. models/Donut.py +419 -0
.gitattributes ADDED
@@ -0,0 +1,35 @@
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1 @@
1
+ models/granite_tsfm
HP_list.py ADDED
@@ -0,0 +1,283 @@
1
+ Multi_algo_HP_dict = {
2
+ 'IForest': {
3
+ 'n_estimators': [25, 50, 100, 150, 200],
4
+ 'max_features': [0.2, 0.4, 0.6, 0.8, 1.0]
5
+ },
6
+ 'LOF': {
7
+ 'n_neighbors': [10, 20, 30, 40, 50],
8
+ 'metric': ['minkowski', 'manhattan', 'euclidean']
9
+ },
10
+ 'PCA': {
11
+ 'n_components': [0.25, 0.5, 0.75, None]
12
+ },
13
+ 'HBOS': {
14
+ 'n_bins': [5, 10, 20, 30, 40],
15
+ 'tol': [0.1, 0.3, 0.5, 0.7]
16
+ },
17
+ 'OCSVM': {
18
+ 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
19
+ 'nu': [0.1, 0.3, 0.5, 0.7]
20
+ },
21
+ 'MCD': {
22
+ 'support_fraction': [0.2, 0.4, 0.6, 0.8, None]
23
+ },
24
+ 'KNN': {
25
+ 'n_neighbors': [10, 20, 30, 40, 50],
26
+ 'method': ['largest', 'mean', 'median']
27
+ },
28
+ 'KMeansAD': {
29
+ 'n_clusters': [10, 20, 30, 40],
30
+ 'window_size': [10, 20, 30, 40]
31
+ },
32
+ 'COPOD': {
33
+ 'HP': [None]
34
+ },
35
+ 'CBLOF': {
36
+ 'n_clusters': [4, 8, 16, 32],
37
+ 'alpha': [0.6, 0.7, 0.8, 0.9]
38
+ },
39
+ 'EIF': {
40
+ 'n_trees': [25, 50, 100, 200]
41
+ },
42
+ 'RobustPCA': {
43
+ 'max_iter': [500, 1000, 1500]
44
+ },
45
+ 'AutoEncoder': {
46
+ 'hidden_neurons': [[64, 32], [32, 16], [128, 64]]
47
+ },
48
+ 'CNN': {
49
+ 'window_size': [50, 100, 150],
50
+ 'num_channel': [[32, 32, 40], [16, 32, 64]]
51
+ },
52
+ 'LSTMAD': {
53
+ 'window_size': [50, 100, 150],
54
+ 'lr': [0.0004, 0.0008]
55
+ },
56
+ 'TranAD': {
57
+ 'win_size': [5, 10, 50],
58
+ 'lr': [1e-3, 1e-4]
59
+ },
60
+ 'AnomalyTransformer': {
61
+ 'win_size': [50, 100, 150],
62
+ 'lr': [1e-3, 1e-4, 1e-5]
63
+ },
64
+ 'OmniAnomaly': {
65
+ 'win_size': [5, 50, 100],
66
+ 'lr': [0.002, 0.0002]
67
+ },
68
+ 'USAD': {
69
+ 'win_size': [5, 50, 100],
70
+ 'lr': [1e-3, 1e-4, 1e-5]
71
+ },
72
+ 'Donut': {
73
+ 'win_size': [60, 90, 120],
74
+ 'lr': [1e-3, 1e-4, 1e-5]
75
+ },
76
+ 'TimesNet': {
77
+ 'win_size': [32, 96, 192],
78
+ 'lr': [1e-3, 1e-4, 1e-5]
79
+ },
80
+ 'FITS': {
81
+ 'win_size': [100, 200],
82
+ 'lr': [1e-3, 1e-4, 1e-5]
83
+ },
84
+ 'OFA': {
85
+ 'win_size': [50, 100, 150]
86
+ },
87
+ 'Time_RCD': {
88
+ 'win_size': 7000
89
+ },
90
+ 'TSPulse': {
91
+ 'win_size': [64, 128, 256],
92
+ 'batch_size': [32, 64, 128],
93
+ 'aggregation_length': [32, 64, 128],
94
+ 'aggr_function': ['max', 'mean'],
95
+ 'smoothing_length': [4, 8, 16]
96
+ }
97
+ }
98
+
99
+
100
+ Optimal_Multi_algo_HP_dict = {
101
+ 'IForest': {'n_estimators': 25, 'max_features': 0.8},
102
+ 'LOF': {'n_neighbors': 50, 'metric': 'euclidean'},
103
+ 'PCA': {'n_components': 0.25},
104
+ 'HBOS': {'n_bins': 30, 'tol': 0.5},
105
+ 'OCSVM': {'kernel': 'rbf', 'nu': 0.1},
106
+ 'MCD': {'support_fraction': 0.8},
107
+ 'KNN': {'n_neighbors': 50, 'method': 'mean'},
108
+ 'KMeansAD': {'n_clusters': 10, 'window_size': 40},
109
+ 'KShapeAD': {'n_clusters': 20, 'window_size': 40},
110
+ 'COPOD': {'n_jobs':1},
111
+ 'CBLOF': {'n_clusters': 4, 'alpha': 0.6},
112
+ 'EIF': {'n_trees': 50},
113
+ 'RobustPCA': {'max_iter': 1000},
114
+ 'AutoEncoder': {'hidden_neurons': [128, 64]},
115
+ 'CNN': {'window_size': 50, 'num_channel': [32, 32, 40]},
116
+ 'LSTMAD': {'window_size': 150, 'lr': 0.0008},
117
+ 'TranAD': {'win_size': 10, 'lr': 0.001},
118
+ 'AnomalyTransformer': {'win_size': 50, 'lr': 0.001},
119
+ 'OmniAnomaly': {'win_size': 100, 'lr': 0.002},
120
+ 'USAD': {'win_size': 100, 'lr': 0.001},
121
+ 'Donut': {'win_size': 60, 'lr': 0.001},
122
+ 'TimesNet': {'win_size': 96, 'lr': 0.0001},
123
+ 'FITS': {'win_size': 100, 'lr': 0.001},
124
+ 'OFA': {'win_size': 50},
125
+ 'Time_RCD': {'win_size':5000, 'batch_size': 1},
126
+ 'DADA': {'win_size': 100, 'batch_size': 64},
127
+ 'TSPulse': {'win_size': 96 , 'batch_size': 64, 'aggregation_length': 64, 'aggr_function': 'max', 'smoothing_length': 8}
128
+ }
129
+
130
+
131
+ Uni_algo_HP_dict = {
132
+ 'Sub_IForest': {
133
+ 'periodicity': [1, 2, 3],
134
+ 'n_estimators': [25, 50, 100, 150, 200]
135
+ },
136
+ 'IForest': {
137
+ 'n_estimators': [25, 50, 100, 150, 200]
138
+ },
139
+ 'Sub_LOF': {
140
+ 'periodicity': [1, 2, 3],
141
+ 'n_neighbors': [10, 20, 30, 40, 50]
142
+ },
143
+ 'LOF': {
144
+ 'n_neighbors': [10, 20, 30, 40, 50]
145
+ },
146
+ 'POLY': {
147
+ 'periodicity': [1, 2, 3],
148
+ 'power': [1, 2, 3, 4]
149
+ },
150
+ 'MatrixProfile': {
151
+ 'periodicity': [1, 2, 3]
152
+ },
153
+ 'NORMA': {
154
+ 'periodicity': [1, 2, 3],
155
+ 'clustering': ['hierarchical', 'kshape']
156
+ },
157
+ 'SAND': {
158
+ 'periodicity': [1, 2, 3]
159
+ },
160
+ 'Series2Graph': {
161
+ 'periodicity': [1, 2, 3]
162
+ },
163
+ 'Sub_PCA': {
164
+ 'periodicity': [1, 2, 3],
165
+ 'n_components': [0.25, 0.5, 0.75, None]
166
+ },
167
+ 'Sub_HBOS': {
168
+ 'periodicity': [1, 2, 3],
169
+ 'n_bins': [5, 10, 20, 30, 40]
170
+ },
171
+ 'Sub_OCSVM': {
172
+ 'periodicity': [1, 2, 3],
173
+ 'kernel': ['linear', 'poly', 'rbf', 'sigmoid']
174
+ },
175
+ 'Sub_MCD': {
176
+ 'periodicity': [1, 2, 3],
177
+ 'support_fraction': [0.2, 0.4, 0.6, 0.8, None]
178
+ },
179
+ 'Sub_KNN': {
180
+ 'periodicity': [1, 2, 3],
181
+ 'n_neighbors': [10, 20, 30, 40, 50],
182
+ },
183
+ 'KMeansAD_U': {
184
+ 'periodicity': [1, 2, 3],
185
+ 'n_clusters': [10, 20, 30, 40],
186
+ },
187
+ 'KShapeAD': {
188
+ 'periodicity': [1, 2, 3]
189
+ },
190
+ 'AutoEncoder': {
191
+ 'window_size': [50, 100, 150],
192
+ 'hidden_neurons': [[64, 32], [32, 16], [128, 64]]
193
+ },
194
+ 'CNN': {
195
+ 'window_size': [50, 100, 150],
196
+ 'num_channel': [[32, 32, 40], [16, 32, 64]]
197
+ },
198
+ 'LSTMAD': {
199
+ 'window_size': [50, 100, 150],
200
+ 'lr': [0.0004, 0.0008]
201
+ },
202
+ 'TranAD': {
203
+ 'win_size': [5, 10, 50],
204
+ 'lr': [1e-3, 1e-4]
205
+ },
206
+ 'AnomalyTransformer': {
207
+ 'win_size': [50, 100, 150],
208
+ 'lr': [1e-3, 1e-4, 1e-5]
209
+ },
210
+ 'OmniAnomaly': {
211
+ 'win_size': [5, 50, 100],
212
+ 'lr': [0.002, 0.0002]
213
+ },
214
+ 'USAD': {
215
+ 'win_size': [5, 50, 100],
216
+ 'lr': [1e-3, 1e-4, 1e-5]
217
+ },
218
+ 'Donut': {
219
+ 'win_size': [60, 90, 120],
220
+ 'lr': [1e-3, 1e-4, 1e-5]
221
+ },
222
+ 'TimesNet': {
223
+ 'win_size': [32, 96, 192],
224
+ 'lr': [1e-3, 1e-4, 1e-5]
225
+ },
226
+ 'FITS': {
227
+ 'win_size': [100, 200],
228
+ 'lr': [1e-3, 1e-4, 1e-5]
229
+ },
230
+ 'OFA': {
231
+ 'win_size': [50, 100, 150]
232
+ },
233
+ # 'Time_RCD': {
234
+ # 'win_size': [1000, 2000, 3000, 4000, 5000, 6000, 8000, 10000],
235
+ # 'batch_size': [32, 64, 128]
236
+ # }
237
+ }
238
+
239
+ Optimal_Uni_algo_HP_dict = {
240
+ 'Sub_IForest': {'periodicity': 1, 'n_estimators': 150},
241
+ 'IForest': {'n_estimators': 200},
242
+ 'Sub_LOF': {'periodicity': 2, 'n_neighbors': 30},
243
+ 'LOF': {'n_neighbors': 50},
244
+ 'POLY': {'periodicity': 1, 'power': 4},
245
+ 'MatrixProfile': {'periodicity': 1},
246
+ 'NORMA': {'periodicity': 1, 'clustering': 'kshape'},
247
+ 'SAND': {'periodicity': 1},
248
+ 'Series2Graph': {'periodicity': 1},
249
+ 'SR': {'periodicity': 1},
250
+ 'Sub_PCA': {'periodicity': 1, 'n_components': None},
251
+ 'Sub_HBOS': {'periodicity': 1, 'n_bins': 10},
252
+ 'Sub_OCSVM': {'periodicity': 2, 'kernel': 'rbf'},
253
+ 'Sub_MCD': {'periodicity': 3, 'support_fraction': None},
254
+ 'Sub_KNN': {'periodicity': 2, 'n_neighbors': 50},
255
+ 'KMeansAD_U': {'periodicity': 2, 'n_clusters': 10},
256
+ 'KShapeAD': {'periodicity': 1},
257
+ 'FFT': {},
258
+ 'Left_STAMPi': {},
259
+ 'AutoEncoder': {'window_size': 100, 'hidden_neurons': [128, 64]},
260
+ 'CNN': {'window_size': 50, 'num_channel': [32, 32, 40]},
261
+ 'LSTMAD': {'window_size': 100, 'lr': 0.0008},
262
+ 'TranAD': {'win_size': 10, 'lr': 0.0001},
263
+ 'AnomalyTransformer': {'win_size': 50, 'lr': 0.001},
264
+ 'OmniAnomaly': {'win_size': 5, 'lr': 0.002},
265
+ 'USAD': {'win_size': 100, 'lr': 0.001},
266
+ 'Donut': {'win_size': 60, 'lr': 0.0001},
267
+ 'TimesNet': {'win_size': 32, 'lr': 0.0001},
268
+ 'FITS': {'win_size': 100, 'lr': 0.0001},
269
+ 'OFA': {'win_size': 50},
270
+ 'Lag_Llama': {'win_size': 96},
271
+ 'Chronos': {'win_size': 100},
272
+ 'TimesFM': {'win_size': 96},
273
+ 'MOMENT_ZS': {'win_size': 64},
274
+ 'MOMENT_FT': {'win_size': 64},
275
+ 'M2N2': {},
276
+ 'DADA': {'win_size': 100},
277
+ 'Time_MOE': {'win_size':96},
278
+ 'Time_RCD': {'win_size':5000, 'batch_size': 64},
279
+ 'Time_RCD_Reconstruction': {'win_size':5000, 'batch_size': 128},
280
+ 'Time_RCD_Reconstruction_Anomaly_Head': {'win_size':5000, 'batch_size': 128},
281
+ 'Time_RCD_Reconstruction_Random_Mask_Anomaly_Head': {'win_size':5000, 'batch_size': 128},
282
+ 'TSPulse': {'win_size':96, 'batch_size': 64, 'aggregation_length': 64, 'aggr_function': 'max', 'smoothing_length': 8}
283
+ }
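
The dictionaries above only enumerate candidate values. As a rough sketch of how such a grid could be expanded into concrete configurations (the exhaustive product below is an illustrative assumption, not necessarily how the benchmark driver consumes HP_list.py):

# Hypothetical expansion of one entry of Multi_algo_HP_dict into concrete configs.
from itertools import product

from HP_list import Multi_algo_HP_dict

def expand_grid(hp_space):
    """Yield one {name: value} dict per combination of candidate values."""
    names = list(hp_space)
    for values in product(*(hp_space[name] for name in names)):
        yield dict(zip(names, values))

for config in expand_grid(Multi_algo_HP_dict['IForest']):
    print(config)  # 5 x 5 = 25 combinations, e.g. {'n_estimators': 25, 'max_features': 0.2}
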
README.md ADDED
@@ -0,0 +1,13 @@
1
+ ---
2
+ title: Time RCD
3
+ emoji: 🐠
4
+ colorFrom: purple
5
+ colorTo: blue
6
+ sdk: gradio
7
+ sdk_version: 5.49.1
8
+ app_file: app.py
9
+ pinned: false
10
+ license: mit
11
+ ---
12
+
13
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,237 @@
1
+ import io
2
+ import zipfile
3
+ from pathlib import Path
4
+ from typing import List, Tuple
5
+
6
+ import gradio as gr
7
+ import matplotlib
8
+
9
+ matplotlib.use("Agg")
10
+ import matplotlib.pyplot as plt
11
+ import numpy as np
12
+ import pandas as pd
13
+ from huggingface_hub import HfHubHTTPError, hf_hub_download
14
+
15
+ from model_wrapper import run_Time_RCD
16
+
17
+ REPO_ID = "thu-sail-lab/Time-RCD"
18
+
19
+ CHECKPOINT_FILES = [
20
+ "checkpoints/full_mask_anomaly_head_pretrain_checkpoint_best.pth",
21
+ "checkpoints/dataset_10_20.pth",
22
+ "checkpoints/full_mask_10_20.pth",
23
+ "checkpoints/dataset_15_56.pth",
24
+ "checkpoints/full_mask_15_56.pth",
25
+ ]
26
+
27
+
28
+ def ensure_checkpoints() -> None:
29
+ """Ensure that the required checkpoint files are present locally."""
30
+ missing = [path for path in CHECKPOINT_FILES if not Path(path).exists()]
31
+ if not missing:
32
+ return
33
+
34
+ try:
35
+ zip_path = hf_hub_download(
36
+ repo_id=REPO_ID,
37
+ filename="checkpoints.zip",
38
+ repo_type="model",
39
+ cache_dir=".cache/hf",
40
+ )
41
+ except HfHubHTTPError:
42
+ zip_path = hf_hub_download(
43
+ repo_id=REPO_ID,
44
+ filename="checkpoints.zip",
45
+ repo_type="dataset",
46
+ cache_dir=".cache/hf",
47
+ )
48
+
49
+ with zipfile.ZipFile(zip_path, "r") as zf:
50
+ zf.extractall(".")
51
+
52
+
53
+ def load_timeseries(file_obj, feature_columns: List[str] | None = None) -> Tuple[pd.DataFrame, np.ndarray]:
54
+ """Load the uploaded file into a numeric dataframe and numpy array."""
55
+ path = Path(file_obj.name)
56
+ if path.suffix.lower() == ".npy":
57
+ data = np.load(path, allow_pickle=False)
58
+ if data.ndim == 1:
59
+ data = data.reshape(-1, 1)
60
+ if not isinstance(data, np.ndarray):
61
+ raise ValueError("Loaded data is not a numpy array.")
62
+ df = pd.DataFrame(data)
63
+ return df, data.astype(np.float32)
64
+
65
+ if path.suffix.lower() not in {".csv", ".txt"}:
66
+ raise ValueError("Unsupported file type. Please upload a .csv, .txt, or .npy file.")
67
+
68
+ df = pd.read_csv(path)
69
+ numeric_df = df.select_dtypes(include=np.number)
70
+ if numeric_df.empty:
71
+ raise ValueError("No numeric columns detected. Ensure your file contains numeric values.")
72
+
73
+ if feature_columns:
74
+ missing = [col for col in feature_columns if col not in numeric_df.columns]
75
+ if missing:
76
+ raise ValueError(f"Selected columns not found in the file: {', '.join(missing)}")
77
+ numeric_df = numeric_df[feature_columns]
78
+
79
+ array = numeric_df.to_numpy(dtype=np.float32)
80
+ if array.ndim == 1:
81
+ array = array.reshape(-1, 1)
82
+
83
+ return numeric_df, array
84
+
85
+
86
+ def infer(
87
+ file_obj,
88
+ is_multivariate: bool,
89
+ window_size: int,
90
+ batch_size: int,
91
+ mask_type: str,
92
+ multi_size: str,
93
+ feature_columns: List[str],
94
+ ) -> Tuple[str, pd.DataFrame, plt.Figure]:
95
+ """Run Time-RCD inference and produce outputs for the Gradio UI."""
96
+ ensure_checkpoints()
97
+ numeric_df, array = load_timeseries(file_obj, feature_columns or None)
98
+
99
+ kwargs = {
100
+ "Multi": is_multivariate,
101
+ "win_size": window_size,
102
+ "batch_size": batch_size,
103
+ "random_mask": mask_type,
104
+ "size": multi_size,
105
+ "device": "cpu",
106
+ }
107
+
108
+ scores, logits = run_Time_RCD(array, **kwargs)
109
+ score_vector = np.asarray(scores).reshape(-1)
110
+ logit_vector = np.asarray(logits).reshape(-1)
111
+
112
+ valid_length = min(len(score_vector), len(numeric_df))
113
+ score_series = pd.Series(score_vector[:valid_length], index=numeric_df.index[:valid_length], name="anomaly_score")
114
+ logit_series = pd.Series(logit_vector[:valid_length], index=numeric_df.index[:valid_length], name="anomaly_logit")
115
+
116
+ result_df = numeric_df.iloc[:valid_length, :].copy()
117
+ result_df["anomaly_score"] = score_series
118
+ result_df["anomaly_logit"] = logit_series
119
+
120
+ top_indices = score_series.nlargest(5).index.tolist()
121
+ highlight_message = (
122
+ "Top anomaly indices (by score): " + ", ".join(str(idx) for idx in top_indices)
123
+ if len(top_indices) > 0
124
+ else "No anomalies detected."
125
+ )
126
+
127
+ figure = build_plot(result_df)
128
+
129
+ return highlight_message, result_df, figure
130
+
131
+
132
+ def build_plot(result_df: pd.DataFrame) -> plt.Figure:
133
+ """Create a matplotlib plot of the first feature vs. anomaly score."""
134
+ fig, ax_primary = plt.subplots(figsize=(10, 4))
135
+ index = result_df.index
136
+ feature_cols = [col for col in result_df.columns if col not in {"anomaly_score", "anomaly_logit"}]
137
+
138
+ primary_col = feature_cols[0]
139
+ ax_primary.plot(index, result_df[primary_col], label=f"{primary_col}", color="#1f77b4", linewidth=1.0)
140
+ ax_primary.set_xlabel("Index")
141
+ ax_primary.set_ylabel("Value")
142
+ ax_primary.grid(alpha=0.2)
143
+
144
+ ax_secondary = ax_primary.twinx()
145
+ ax_secondary.plot(index, result_df["anomaly_score"], label="Anomaly Score", color="#d62728", linewidth=1.0)
146
+ ax_secondary.set_ylabel("Anomaly Score")
147
+
148
+ fig.tight_layout()
149
+ return fig
150
+
151
+
152
+ def build_interface() -> gr.Blocks:
153
+ """Define the Gradio UI."""
154
+ with gr.Blocks(title="Time-RCD Zero-Shot Anomaly Detection") as demo:
155
+ gr.Markdown(
156
+ "# Time-RCD Zero-Shot Anomaly Detection\n"
157
+ "Upload a time series to run zero-shot anomaly detection with the pretrained Time-RCD checkpoints. "
158
+ "You can choose univariate or multivariate mode, adjust the window size, and configure mask settings."
159
+ )
160
+
161
+ with gr.Row():
162
+ file_input = gr.File(label="Upload time series file (.csv, .txt, .npy)", file_types=[".csv", ".txt", ".npy"])
163
+ column_selector = gr.Textbox(
164
+ label="Columns to use (comma-separated, optional)",
165
+ placeholder="e.g. value,feature_1,feature_2",
166
+ )
167
+
168
+ with gr.Row():
169
+ multivariate = gr.Radio(
170
+ choices=["Univariate", "Multivariate"],
171
+ value="Univariate",
172
+ label="Data type",
173
+ )
174
+ window_size_in = gr.Slider(
175
+ minimum=128,
176
+ maximum=8192,
177
+ value=2048,
178
+ step=128,
179
+ label="Window size",
180
+ )
181
+ batch_size_in = gr.Slider(
182
+ minimum=1,
183
+ maximum=128,
184
+ value=16,
185
+ step=1,
186
+ label="Batch size",
187
+ )
188
+
189
+ with gr.Row():
190
+ mask_type_in = gr.Radio(
191
+ choices=["random_mask", "full_mask"],
192
+ value="random_mask",
193
+ label="Mask type (multivariate only)",
194
+ )
195
+ multi_size_in = gr.Radio(
196
+ choices=["full", "small"],
197
+ value="full",
198
+ label="Multivariate model size",
199
+ )
200
+
201
+ run_button = gr.Button("Run Inference", variant="primary")
202
+
203
+ result_message = gr.Textbox(label="Summary", interactive=False)
204
+ result_dataframe = gr.DataFrame(label="Anomaly Scores", interactive=False)
205
+ plot_output = gr.Plot(label="Series vs. Anomaly Score")
206
+
207
+ def _submit(file_obj, multivariate_choice, win, batch, mask, size, columns_text):
208
+ if file_obj is None:
209
+ raise gr.Error("Please upload a time series file.")
210
+
211
+ feature_columns = [col.strip() for col in columns_text.split(",") if col.strip()] if columns_text else []
212
+ is_multi = multivariate_choice == "Multivariate"
213
+ summary, df, fig = infer(
214
+ file_obj=file_obj,
215
+ is_multivariate=is_multi,
216
+ window_size=int(win),
217
+ batch_size=int(batch),
218
+ mask_type=mask,
219
+ multi_size=size,
220
+ feature_columns=feature_columns,
221
+ )
222
+ return summary, df, fig
223
+
224
+ run_button.click(
225
+ fn=_submit,
226
+ inputs=[file_input, multivariate, window_size_in, batch_size_in, mask_type_in, multi_size_in, column_selector],
227
+ outputs=[result_message, result_dataframe, plot_output],
228
+ )
229
+
230
+ return demo
231
+
232
+
233
+ demo = build_interface()
234
+
235
+ if __name__ == "__main__":
236
+ demo.launch()
237
+
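
For a quick check of the same inference path outside the Gradio UI, a sketch along the following lines could be used; the keyword arguments mirror the run_Time_RCD call inside infer(), while the synthetic series, the window size, and the score handling are illustrative assumptions:

# Hypothetical smoke test for the inference path used by app.py (not part of the Space).
import numpy as np

from app import ensure_checkpoints
from model_wrapper import run_Time_RCD

ensure_checkpoints()  # download and extract checkpoints.zip if the files are missing

rng = np.random.default_rng(0)
series = rng.normal(size=(4096, 1)).astype(np.float32)  # toy univariate series
series[2000:2050] += 5.0                                 # injected anomalous segment

scores, logits = run_Time_RCD(
    series,
    Multi=False,
    win_size=2048,
    batch_size=16,
    random_mask="random_mask",
    size="full",
    device="cpu",
)
scores = np.asarray(scores).reshape(-1)
print("Top-5 indices by anomaly score:", np.argsort(scores)[-5:])
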
evaluation/.DS_Store ADDED
Binary file (6.15 kB). View file
 
evaluation/__init__.py ADDED
@@ -0,0 +1 @@
1
+
evaluation/affiliation/__init__.py ADDED
@@ -0,0 +1 @@
1
+
evaluation/affiliation/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (150 Bytes). View file
 
evaluation/affiliation/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (180 Bytes). View file
 
evaluation/affiliation/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (162 Bytes). View file
 
evaluation/affiliation/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (162 Bytes). View file
 
evaluation/affiliation/__pycache__/_affiliation_zone.cpython-310.pyc ADDED
Binary file (4.27 kB). View file
 
evaluation/affiliation/__pycache__/_affiliation_zone.cpython-311.pyc ADDED
Binary file (5.92 kB). View file
 
evaluation/affiliation/__pycache__/_affiliation_zone.cpython-38.pyc ADDED
Binary file (4.31 kB). View file
 
evaluation/affiliation/__pycache__/_affiliation_zone.cpython-39.pyc ADDED
Binary file (4.28 kB). View file
 
evaluation/affiliation/__pycache__/_integral_interval.cpython-310.pyc ADDED
Binary file (12.2 kB). View file
 
evaluation/affiliation/__pycache__/_integral_interval.cpython-311.pyc ADDED
Binary file (17.9 kB). View file
 
evaluation/affiliation/__pycache__/_integral_interval.cpython-38.pyc ADDED
Binary file (12.2 kB). View file
 
evaluation/affiliation/__pycache__/_integral_interval.cpython-39.pyc ADDED
Binary file (12.2 kB). View file
 
evaluation/affiliation/__pycache__/_single_ground_truth_event.cpython-310.pyc ADDED
Binary file (3.99 kB). View file
 
evaluation/affiliation/__pycache__/_single_ground_truth_event.cpython-311.pyc ADDED
Binary file (5.88 kB). View file
 
evaluation/affiliation/__pycache__/_single_ground_truth_event.cpython-38.pyc ADDED
Binary file (4.07 kB). View file
 
evaluation/affiliation/__pycache__/_single_ground_truth_event.cpython-39.pyc ADDED
Binary file (4.03 kB). View file
 
evaluation/affiliation/__pycache__/generics.cpython-310.pyc ADDED
Binary file (5.93 kB). View file
 
evaluation/affiliation/__pycache__/generics.cpython-311.pyc ADDED
Binary file (8.67 kB). View file
 
evaluation/affiliation/__pycache__/generics.cpython-38.pyc ADDED
Binary file (6.05 kB). View file
 
evaluation/affiliation/__pycache__/generics.cpython-39.pyc ADDED
Binary file (6.05 kB). View file
 
evaluation/affiliation/__pycache__/metrics.cpython-310.pyc ADDED
Binary file (4.7 kB). View file
 
evaluation/affiliation/__pycache__/metrics.cpython-311.pyc ADDED
Binary file (7.73 kB). View file
 
evaluation/affiliation/__pycache__/metrics.cpython-38.pyc ADDED
Binary file (4.79 kB). View file
 
evaluation/affiliation/__pycache__/metrics.cpython-39.pyc ADDED
Binary file (4.76 kB). View file
 
evaluation/affiliation/_affiliation_zone.py ADDED
@@ -0,0 +1,86 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ from ._integral_interval import interval_intersection
4
+
5
+ def t_start(j, Js = [(1,2),(3,4),(5,6)], Trange = (1,10)):
6
+ """
7
+ Helper for `E_gt_func`
8
+
9
+ :param j: index from 0 to len(Js) (included) on which to get the start
10
+ :param Js: ground truth events, as a list of couples
11
+ :param Trange: range of the series where Js is included
12
+ :return: generalized start such that the middle of t_start and t_stop
13
+ always gives the affiliation zone
14
+ """
15
+ b = max(Trange)
16
+ n = len(Js)
17
+ if j == n:
18
+ return(2*b - t_stop(n-1, Js, Trange))
19
+ else:
20
+ return(Js[j][0])
21
+
22
+ def t_stop(j, Js = [(1,2),(3,4),(5,6)], Trange = (1,10)):
23
+ """
24
+ Helper for `E_gt_func`
25
+
26
+ :param j: index from 0 to len(Js) (included) on which to get the stop
27
+ :param Js: ground truth events, as a list of couples
28
+ :param Trange: range of the series where Js is included
29
+ :return: generalized stop such that the middle of t_start and t_stop
30
+ always gives the affiliation zone
31
+ """
32
+ if j == -1:
33
+ a = min(Trange)
34
+ return(2*a - t_start(0, Js, Trange))
35
+ else:
36
+ return(Js[j][1])
37
+
38
+ def E_gt_func(j, Js, Trange):
39
+ """
40
+ Get the affiliation zone of element j of the ground truth
41
+
42
+ :param j: index from 0 to len(Js) (excluded) on which to get the zone
43
+ :param Js: ground truth events, as a list of couples
44
+ :param Trange: range of the series where Js is included, can
45
+ be (-math.inf, math.inf) for distance measures
46
+ :return: affiliation zone of element j of the ground truth represented
47
+ as a couple
48
+ """
49
+ range_left = (t_stop(j-1, Js, Trange) + t_start(j, Js, Trange))/2
50
+ range_right = (t_stop(j, Js, Trange) + t_start(j+1, Js, Trange))/2
51
+ return((range_left, range_right))
52
+
53
+ def get_all_E_gt_func(Js, Trange):
54
+ """
55
+ Get the affiliation partition from the ground truth point of view
56
+
57
+ :param Js: ground truth events, as a list of couples
58
+ :param Trange: range of the series where Js is included, can
59
+ be (-math.inf, math.inf) for distance measures
60
+ :return: affiliation partition of the events
61
+ """
62
+ # E_gt is the limit of affiliation/attraction for each ground truth event
63
+ E_gt = [E_gt_func(j, Js, Trange) for j in range(len(Js))]
64
+ return(E_gt)
65
+
66
+ def affiliation_partition(Is = [(1,1.5),(2,5),(5,6),(8,9)], E_gt = [(1,2.5),(2.5,4.5),(4.5,10)]):
67
+ """
68
+ Cut the events into the affiliation zones
69
+ The presentation given here is from the ground truth point of view,
70
+ but it is also used in the reversed direction in the main function.
71
+
72
+ :param Is: events as a list of couples
73
+ :param E_gt: range of the affiliation zones
74
+ :return: a list of list of intervals (each interval represented by either
75
+ a couple or None for empty interval). The outer list is indexed by each
76
+ affiliation zone of `E_gt`. The inner list is indexed by the events of `Is`.
77
+ """
78
+ out = [None] * len(E_gt)
79
+ for j in range(len(E_gt)):
80
+ E_gt_j = E_gt[j]
81
+ discarded_idx_before = [I[1] < E_gt_j[0] for I in Is] # end point of predicted I is before the begin of E
82
+ discarded_idx_after = [I[0] > E_gt_j[1] for I in Is] # start of predicted I is after the end of E
83
+ kept_index = [not(a or b) for a, b in zip(discarded_idx_before, discarded_idx_after)]
84
+ Is_j = [x for x, y in zip(Is, kept_index)]
85
+ out[j] = [interval_intersection(I, E_gt[j]) for I in Is_j]
86
+ return(out)
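
A small worked example with the toy defaults already present in the signatures above helps picture the affiliation zones; the values in the comments are what these particular inputs are expected to produce and are shown for illustration only:

# Illustrative use of the affiliation-zone helpers with the toy defaults from this file.
from evaluation.affiliation._affiliation_zone import get_all_E_gt_func, affiliation_partition

Js = [(1, 2), (3, 4), (5, 6)]            # ground truth events
Trange = (1, 10)                          # range of the whole series
E_gt = get_all_E_gt_func(Js, Trange)
print(E_gt)                               # [(1.0, 2.5), (2.5, 4.5), (4.5, 10.0)]

Is = [(1, 1.5), (2, 5), (5, 6), (8, 9)]   # predicted events
print(affiliation_partition(Is, E_gt))
# roughly [[(1, 1.5), (2, 2.5), None, None],
#          [None, (2.5, 4.5), None, None],
#          [None, (4.5, 5), (5, 6), (8, 9)]]
# one inner list per zone, one entry per predicted event (None when it lies outside the zone)
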
evaluation/affiliation/_integral_interval.py ADDED
@@ -0,0 +1,464 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ import math
4
+ from .generics import _sum_wo_nan
5
+ """
6
+ In order to shorten the length of the variables,
7
+ the general convention in this file is to let:
8
+ - I for a predicted event (start, stop),
9
+ - Is for a list of predicted events,
10
+ - J for a ground truth event,
11
+ - Js for a list of ground truth events.
12
+ """
13
+
14
+ def interval_length(J = (1,2)):
15
+ """
16
+ Length of an interval
17
+
18
+ :param J: couple representing the start and stop of an interval, or None
19
+ :return: length of the interval, and 0 for a None interval
20
+ """
21
+ if J is None:
22
+ return(0)
23
+ return(J[1] - J[0])
24
+
25
+ def sum_interval_lengths(Is = [(1,2),(3,4),(5,6)]):
26
+ """
27
+ Sum of length of the intervals
28
+
29
+ :param Is: list of intervals represented by starts and stops
30
+ :return: sum of the interval length
31
+ """
32
+ return(sum([interval_length(I) for I in Is]))
33
+
34
+ def interval_intersection(I = (1, 3), J = (2, 4)):
35
+ """
36
+ Intersection between two intervals I and J
37
+ I and J should be either empty or represent a positive interval (no point)
38
+
39
+ :param I: an interval represented by start and stop
40
+ :param J: a second interval of the same form
41
+ :return: an interval representing the start and stop of the intersection (or None if empty)
42
+ """
43
+ if I is None:
44
+ return(None)
45
+ if J is None:
46
+ return(None)
47
+
48
+ I_inter_J = (max(I[0], J[0]), min(I[1], J[1]))
49
+ if I_inter_J[0] >= I_inter_J[1]:
50
+ return(None)
51
+ else:
52
+ return(I_inter_J)
53
+
54
+ def interval_subset(I = (1, 3), J = (0, 6)):
55
+ """
56
+ Checks whether I is a subset of J
57
+
58
+ :param I: a non empty interval represented by start and stop
59
+ :param J: a second non empty interval of the same form
60
+ :return: True if I is a subset of J
61
+ """
62
+ if (I[0] >= J[0]) and (I[1] <= J[1]):
63
+ return True
64
+ else:
65
+ return False
66
+
67
+ def cut_into_three_func(I, J):
68
+ """
69
+ Cut an interval I into a partition of 3 subsets:
70
+ the elements before J,
71
+ the elements belonging to J,
72
+ and the elements after J
73
+
74
+ :param I: an interval represented by start and stop, or None for an empty one
75
+ :param J: a non empty interval
76
+ :return: a triplet of three intervals, each represented by either (start, stop) or None
77
+ """
78
+ if I is None:
79
+ return((None, None, None))
80
+
81
+ I_inter_J = interval_intersection(I, J)
82
+ if I == I_inter_J:
83
+ I_before = None
84
+ I_after = None
85
+ elif I[1] <= J[0]:
86
+ I_before = I
87
+ I_after = None
88
+ elif I[0] >= J[1]:
89
+ I_before = None
90
+ I_after = I
91
+ elif (I[0] <= J[0]) and (I[1] >= J[1]):
92
+ I_before = (I[0], I_inter_J[0])
93
+ I_after = (I_inter_J[1], I[1])
94
+ elif I[0] <= J[0]:
95
+ I_before = (I[0], I_inter_J[0])
96
+ I_after = None
97
+ elif I[1] >= J[1]:
98
+ I_before = None
99
+ I_after = (I_inter_J[1], I[1])
100
+ else:
101
+ raise ValueError('unexpected unconsidered case')
102
+ return(I_before, I_inter_J, I_after)
103
+
104
+ def get_pivot_j(I, J):
105
+ """
106
+ Get the single point of J that is the closest to I, called 'pivot' here,
107
+ with the requirement that I should be outside J
108
+
109
+ :param I: a non empty interval (start, stop)
110
+ :param J: another non empty interval, with empty intersection with I
111
+ :return: the element j of J that is the closest to I
112
+ """
113
+ if interval_intersection(I, J) is not None:
114
+ raise ValueError('I and J should have a void intersection')
115
+
116
+ j_pivot = None # j_pivot is a border of J
117
+ if max(I) <= min(J):
118
+ j_pivot = min(J)
119
+ elif min(I) >= max(J):
120
+ j_pivot = max(J)
121
+ else:
122
+ raise ValueError('I should be outside J')
123
+ return(j_pivot)
124
+
125
+ def integral_mini_interval(I, J):
126
+ """
127
+ In the specific case where interval I is located outside J,
128
+ integral of distance from x to J over the interval x \in I.
129
+ This is the *integral* i.e. the sum.
130
+ It's not the mean (not divided by the length of I yet)
131
+
132
+ :param I: an interval (start, stop), or None
133
+ :param J: a non empty interval, with empty intersection with I
134
+ :return: the integral of distances d(x, J) over x \in I
135
+ """
136
+ if I is None:
137
+ return(0)
138
+
139
+ j_pivot = get_pivot_j(I, J)
140
+ a = min(I)
141
+ b = max(I)
142
+ return((b-a)*abs((j_pivot - (a+b)/2)))
143
+
144
+ def integral_interval_distance(I, J):
145
+ """
146
+ For any non empty intervals I, J, compute the
147
+ integral of distance from x to J over the interval x \in I.
148
+ This is the *integral* i.e. the sum.
149
+ It's not the mean (not divided by the length of I yet)
150
+ The interval I can intersect J or not
151
+
152
+ :param I: an interval (start, stop), or None
153
+ :param J: a non empty interval
154
+ :return: the integral of distances d(x, J) over x \in I
155
+ """
156
+ # I and J are single intervals (not generic sets)
157
+ # I is a predicted interval in the range of affiliation of J
158
+
159
+ def f(I_cut):
160
+ return(integral_mini_interval(I_cut, J))
161
+ # If I_middle is fully included into J,
162
+ # the distance to J is always 0
163
+ def f0(I_middle):
164
+ return(0)
165
+
166
+ cut_into_three = cut_into_three_func(I, J)
167
+ # Distance for now, not the mean:
168
+ # Distance left: Between cut_into_three[0] and the point min(J)
169
+ d_left = f(cut_into_three[0])
170
+ # Distance middle: Between cut_into_three[1] = I inter J, and J
171
+ d_middle = f0(cut_into_three[1])
172
+ # Distance right: Between cut_into_three[2] and the point max(J)
173
+ d_right = f(cut_into_three[2])
174
+ # It's an integral so summable
175
+ return(d_left + d_middle + d_right)
176
+
177
+ def integral_mini_interval_P_CDFmethod__min_piece(I, J, E):
178
+ """
179
+ Helper of `integral_mini_interval_Pprecision_CDFmethod`
180
+ In the specific case where interval I is located outside J,
181
+ compute the integral $\int_{d_min}^{d_max} \min(m, x) dx$, with:
182
+ - m the smallest distance from J to E,
183
+ - d_min the smallest distance d(x, J) from x \in I to J
184
+ - d_max the largest distance d(x, J) from x \in I to J
185
+
186
+ :param I: a single predicted interval, a non empty interval (start, stop)
187
+ :param J: ground truth interval, a non empty interval, with empty intersection with I
188
+ :param E: the affiliation/influence zone for J, represented as a couple (start, stop)
189
+ :return: the integral $\int_{d_min}^{d_max} \min(m, x) dx$
190
+ """
191
+ if interval_intersection(I, J) is not None:
192
+ raise ValueError('I and J should have a void intersection')
193
+ if not interval_subset(J, E):
194
+ raise ValueError('J should be included in E')
195
+ if not interval_subset(I, E):
196
+ raise ValueError('I should be included in E')
197
+
198
+ e_min = min(E)
199
+ j_min = min(J)
200
+ j_max = max(J)
201
+ e_max = max(E)
202
+ i_min = min(I)
203
+ i_max = max(I)
204
+
205
+ d_min = max(i_min - j_max, j_min - i_max)
206
+ d_max = max(i_max - j_max, j_min - i_min)
207
+ m = min(j_min - e_min, e_max - j_max)
208
+ A = min(d_max, m)**2 - min(d_min, m)**2
209
+ B = max(d_max, m) - max(d_min, m)
210
+ C = (1/2)*A + m*B
211
+ return(C)
212
+
213
+ def integral_mini_interval_Pprecision_CDFmethod(I, J, E):
214
+ """
215
+ Integral of the probability of distances over the interval I.
216
+ In the specific case where interval I is located outside J,
217
+ compute the integral $\int_{x \in I} Fbar(dist(x,J)) dx$.
218
+ This is the *integral* i.e. the sum (not the mean)
219
+
220
+ :param I: a single predicted interval, a non empty interval (start, stop)
221
+ :param J: ground truth interval, a non empty interval, with empty intersection with I
222
+ :param E: the affiliation/influence zone for J, represented as a couple (start, stop)
223
+ :return: the integral $\int_{x \in I} Fbar(dist(x,J)) dx$
224
+ """
225
+ integral_min_piece = integral_mini_interval_P_CDFmethod__min_piece(I, J, E)
226
+
227
+ e_min = min(E)
228
+ j_min = min(J)
229
+ j_max = max(J)
230
+ e_max = max(E)
231
+ i_min = min(I)
232
+ i_max = max(I)
233
+ d_min = max(i_min - j_max, j_min - i_max)
234
+ d_max = max(i_max - j_max, j_min - i_min)
235
+ integral_linear_piece = (1/2)*(d_max**2 - d_min**2)
236
+ integral_remaining_piece = (j_max - j_min)*(i_max - i_min)
237
+
238
+ DeltaI = i_max - i_min
239
+ DeltaE = e_max - e_min
240
+
241
+ output = DeltaI - (1/DeltaE)*(integral_min_piece + integral_linear_piece + integral_remaining_piece)
242
+ return(output)
243
+
244
+ def integral_interval_probaCDF_precision(I, J, E):
245
+ """
246
+ Integral of the probability of distances over the interval I.
247
+ Compute the integral $\int_{x \in I} Fbar(dist(x,J)) dx$.
248
+ This is the *integral* i.e. the sum (not the mean)
249
+
250
+ :param I: a single (non empty) predicted interval in the zone of affiliation of J
251
+ :param J: ground truth interval
252
+ :param E: affiliation/influence zone for J
253
+ :return: the integral $\int_{x \in I} Fbar(dist(x,J)) dx$
254
+ """
255
+ # I and J are single intervals (not generic sets)
256
+ def f(I_cut):
257
+ if I_cut is None:
258
+ return(0)
259
+ else:
260
+ return(integral_mini_interval_Pprecision_CDFmethod(I_cut, J, E))
261
+
262
+ # If I_middle is fully included into J, it is the
263
+ # integral of 1 on the interval I_middle, so it's |I_middle|
264
+ def f0(I_middle):
265
+ if I_middle is None:
266
+ return(0)
267
+ else:
268
+ return(max(I_middle) - min(I_middle))
269
+
270
+ cut_into_three = cut_into_three_func(I, J)
271
+ # Distance for now, not the mean:
272
+ # Distance left: Between cut_into_three[0] and the point min(J)
273
+ d_left = f(cut_into_three[0])
274
+ # Distance middle: Between cut_into_three[1] = I inter J, and J
275
+ d_middle = f0(cut_into_three[1])
276
+ # Distance right: Between cut_into_three[2] and the point max(J)
277
+ d_right = f(cut_into_three[2])
278
+ # It's an integral so summable
279
+ return(d_left + d_middle + d_right)
280
+
281
+ def cut_J_based_on_mean_func(J, e_mean):
282
+ """
283
+ Helper function for the recall.
284
+ Partition J into two intervals: before and after e_mean
285
+ (e_mean represents the center element of E the zone of affiliation)
286
+
287
+ :param J: ground truth interval
288
+ :param e_mean: a float number (center value of E)
289
+ :return: a couple partitioning J into (J_before, J_after)
290
+ """
291
+ if J is None:
292
+ J_before = None
293
+ J_after = None
294
+ elif e_mean >= max(J):
295
+ J_before = J
296
+ J_after = None
297
+ elif e_mean <= min(J):
298
+ J_before = None
299
+ J_after = J
300
+ else: # e_mean is across J
301
+ J_before = (min(J), e_mean)
302
+ J_after = (e_mean, max(J))
303
+
304
+ return((J_before, J_after))
305
+
306
+ def integral_mini_interval_Precall_CDFmethod(I, J, E):
307
+ """
308
+ Integral of the probability of distances over the interval J.
309
+ In the specific case where interval J is located outside I,
310
+ compute the integral $\int_{y \in J} Fbar_y(dist(y,I)) dy$.
311
+ This is the *integral* i.e. the sum (not the mean)
312
+
313
+ :param I: a single (non empty) predicted interval
314
+ :param J: ground truth (non empty) interval, with empty intersection with I
315
+ :param E: the affiliation/influence zone for J, represented as a couple (start, stop)
316
+ :return: the integral $\int_{y \in J} Fbar_y(dist(y,I)) dy$
317
+ """
318
+ # The interval J should be located outside I
319
+ # (so it's either the left piece or the right piece w.r.t I)
320
+ i_pivot = get_pivot_j(J, I)
321
+ e_min = min(E)
322
+ e_max = max(E)
323
+ e_mean = (e_min + e_max) / 2
324
+
325
+ # If i_pivot is outside E (it's possible), then
326
+ # the distance is worse than any random element within E,
327
+ # so we set the recall to 0
328
+ if i_pivot <= min(E):
329
+ return(0)
330
+ elif i_pivot >= max(E):
331
+ return(0)
332
+ # Otherwise, we have at least i_pivot in E and so d < M so min(d,M)=d
333
+
334
+ cut_J_based_on_e_mean = cut_J_based_on_mean_func(J, e_mean)
335
+ J_before = cut_J_based_on_e_mean[0]
336
+ J_after = cut_J_based_on_e_mean[1]
337
+
338
+ iemin_mean = (e_min + i_pivot)/2
339
+ cut_Jbefore_based_on_iemin_mean = cut_J_based_on_mean_func(J_before, iemin_mean)
340
+ J_before_closeE = cut_Jbefore_based_on_iemin_mean[0] # before e_mean and closer to e_min than i_pivot ~ J_before_before
341
+ J_before_closeI = cut_Jbefore_based_on_iemin_mean[1] # before e_mean and closer to i_pivot than e_min ~ J_before_after
342
+
343
+ iemax_mean = (e_max + i_pivot)/2
344
+ cut_Jafter_based_on_iemax_mean = cut_J_based_on_mean_func(J_after, iemax_mean)
345
+ J_after_closeI = cut_Jafter_based_on_iemax_mean[0] # after e_mean and closer to i_pivot than e_max ~ J_after_before
346
+ J_after_closeE = cut_Jafter_based_on_iemax_mean[1] # after e_mean and closer to e_max than i_pivot ~ J_after_after
347
+
348
+ if J_before_closeE is not None:
349
+ j_before_before_min = min(J_before_closeE) # == min(J)
350
+ j_before_before_max = max(J_before_closeE)
351
+ else:
352
+ j_before_before_min = math.nan
353
+ j_before_before_max = math.nan
354
+
355
+ if J_before_closeI is not None:
356
+ j_before_after_min = min(J_before_closeI) # == j_before_before_max if existing
357
+ j_before_after_max = max(J_before_closeI) # == max(J_before)
358
+ else:
359
+ j_before_after_min = math.nan
360
+ j_before_after_max = math.nan
361
+
362
+ if J_after_closeI is not None:
363
+ j_after_before_min = min(J_after_closeI) # == min(J_after)
364
+ j_after_before_max = max(J_after_closeI)
365
+ else:
366
+ j_after_before_min = math.nan
367
+ j_after_before_max = math.nan
368
+
369
+ if J_after_closeE is not None:
370
+ j_after_after_min = min(J_after_closeE) # == j_after_before_max if existing
371
+ j_after_after_max = max(J_after_closeE) # == max(J)
372
+ else:
373
+ j_after_after_min = math.nan
374
+ j_after_after_max = math.nan
375
+
376
+ # <-- J_before_closeE --> <-- J_before_closeI --> <-- J_after_closeI --> <-- J_after_closeE -->
377
+ # j_bb_min j_bb_max j_ba_min j_ba_max j_ab_min j_ab_max j_aa_min j_aa_max
378
+ # (with `b` for before and `a` for after in the previous variable names)
379
+
380
+ # vs e_mean m = min(t-e_min, e_max-t) d=|i_pivot-t| min(d,m) \int min(d,m)dt \int d dt \int_(min(d,m)+d)dt \int_{t \in J}(min(d,m)+d)dt
381
+ # Case J_before_closeE & i_pivot after J before t-e_min i_pivot-t min(i_pivot-t,t-e_min) = t-e_min t^2/2-e_min*t i_pivot*t-t^2/2 t^2/2-e_min*t+i_pivot*t-t^2/2 = (i_pivot-e_min)*t (i_pivot-e_min)*tB - (i_pivot-e_min)*tA = (i_pivot-e_min)*(tB-tA)
382
+ # Case J_before_closeI & i_pivot after J before t-e_min i_pivot-t min(i_pivot-t,t-e_min) = i_pivot-t i_pivot*t-t^2/2 i_pivot*t-t^2/2 i_pivot*t-t^2/2+i_pivot*t-t^2/2 = 2*i_pivot*t-t^2 2*i_pivot*tB-tB^2 - 2*i_pivot*tA + tA^2 = 2*i_pivot*(tB-tA) - (tB^2 - tA^2)
383
+ # Case J_after_closeI & i_pivot after J after e_max-t i_pivot-t min(i_pivot-t,e_max-t) = i_pivot-t i_pivot*t-t^2/2 i_pivot*t-t^2/2 i_pivot*t-t^2/2+i_pivot*t-t^2/2 = 2*i_pivot*t-t^2 2*i_pivot*tB-tB^2 - 2*i_pivot*tA + tA^2 = 2*i_pivot*(tB-tA) - (tB^2 - tA^2)
384
+ # Case J_after_closeE & i_pivot after J after e_max-t i_pivot-t min(i_pivot-t,e_max-t) = e_max-t e_max*t-t^2/2 i_pivot*t-t^2/2 e_max*t-t^2/2+i_pivot*t-t^2/2 = (e_max+i_pivot)*t-t^2 (e_max+i_pivot)*tB-tB^2 - (e_max+i_pivot)*tA + tA^2 = (e_max+i_pivot)*(tB-tA) - (tB^2 - tA^2)
385
+ #
386
+ # Case J_before_closeE & i_pivot before J before t-e_min t-i_pivot min(t-i_pivot,t-e_min) = t-e_min t^2/2-e_min*t t^2/2-i_pivot*t t^2/2-e_min*t+t^2/2-i_pivot*t = t^2-(e_min+i_pivot)*t tB^2-(e_min+i_pivot)*tB - tA^2 + (e_min+i_pivot)*tA = (tB^2 - tA^2) - (e_min+i_pivot)*(tB-tA)
387
+ # Case J_before_closeI & i_pivot before J before t-e_min t-i_pivot min(t-i_pivot,t-e_min) = t-i_pivot t^2/2-i_pivot*t t^2/2-i_pivot*t t^2/2-i_pivot*t+t^2/2-i_pivot*t = t^2-2*i_pivot*t tB^2-2*i_pivot*tB - tA^2 + 2*i_pivot*tA = (tB^2 - tA^2) - 2*i_pivot*(tB-tA)
388
+ # Case J_after_closeI & i_pivot before J after e_max-t t-i_pivot min(t-i_pivot,e_max-t) = t-i_pivot t^2/2-i_pivot*t t^2/2-i_pivot*t t^2/2-i_pivot*t+t^2/2-i_pivot*t = t^2-2*i_pivot*t tB^2-2*i_pivot*tB - tA^2 + 2*i_pivot*tA = (tB^2 - tA^2) - 2*i_pivot*(tB-tA)
389
+ # Case J_after_closeE & i_pivot before J after e_max-t t-i_pivot min(t-i_pivot,e_max-t) = e_max-t e_max*t-t^2/2 t^2/2-i_pivot*t e_max*t-t^2/2+t^2/2-i_pivot*t = (e_max-i_pivot)*t (e_max-i_pivot)*tB - (e_max-i_pivot)*tA = (e_max-i_pivot)*(tB-tA)
390
+
391
+ if i_pivot >= max(J):
392
+ part1_before_closeE = (i_pivot-e_min)*(j_before_before_max - j_before_before_min) # (i_pivot-e_min)*(tB-tA) # j_before_before_max - j_before_before_min
393
+ part2_before_closeI = 2*i_pivot*(j_before_after_max-j_before_after_min) - (j_before_after_max**2 - j_before_after_min**2) # 2*i_pivot*(tB-tA) - (tB^2 - tA^2) # j_before_after_max - j_before_after_min
394
+ part3_after_closeI = 2*i_pivot*(j_after_before_max-j_after_before_min) - (j_after_before_max**2 - j_after_before_min**2) # 2*i_pivot*(tB-tA) - (tB^2 - tA^2) # j_after_before_max - j_after_before_min
395
+ part4_after_closeE = (e_max+i_pivot)*(j_after_after_max-j_after_after_min) - (j_after_after_max**2 - j_after_after_min**2) # (e_max+i_pivot)*(tB-tA) - (tB^2 - tA^2) # j_after_after_max - j_after_after_min
396
+ out_parts = [part1_before_closeE, part2_before_closeI, part3_after_closeI, part4_after_closeE]
397
+ elif i_pivot <= min(J):
398
+ part1_before_closeE = (j_before_before_max**2 - j_before_before_min**2) - (e_min+i_pivot)*(j_before_before_max-j_before_before_min) # (tB^2 - tA^2) - (e_min+i_pivot)*(tB-tA) # j_before_before_max - j_before_before_min
399
+ part2_before_closeI = (j_before_after_max**2 - j_before_after_min**2) - 2*i_pivot*(j_before_after_max-j_before_after_min) # (tB^2 - tA^2) - 2*i_pivot*(tB-tA) # j_before_after_max - j_before_after_min
400
+ part3_after_closeI = (j_after_before_max**2 - j_after_before_min**2) - 2*i_pivot*(j_after_before_max - j_after_before_min) # (tB^2 - tA^2) - 2*i_pivot*(tB-tA) # j_after_before_max - j_after_before_min
401
+ part4_after_closeE = (e_max-i_pivot)*(j_after_after_max - j_after_after_min) # (e_max-i_pivot)*(tB-tA) # j_after_after_max - j_after_after_min
402
+ out_parts = [part1_before_closeE, part2_before_closeI, part3_after_closeI, part4_after_closeE]
403
+ else:
404
+ raise ValueError('The i_pivot should be outside J')
405
+
406
+ out_integral_min_dm_plus_d = _sum_wo_nan(out_parts) # integral on all J, i.e. sum of the disjoint parts
407
+
408
+ # We have for each point t of J:
409
+ # \bar{F}_{t, recall}(d) = 1 - (1/|E|) * (min(d,m) + d)
410
+ # Since t is a single-point here, and we are in the case where i_pivot is inside E.
411
+ # The integral is then given by:
412
+ # C = \int_{t \in J} \bar{F}_{t, recall}(D(t)) dt
413
+ # = \int_{t \in J} 1 - (1/|E|) * (min(d,m) + d) dt
414
+ # = |J| - (1/|E|) * [\int_{t \in J} (min(d,m) + d) dt]
415
+ # = |J| - (1/|E|) * out_integral_min_dm_plus_d
416
+ DeltaJ = max(J) - min(J)
417
+ DeltaE = max(E) - min(E)
418
+ C = DeltaJ - (1/DeltaE) * out_integral_min_dm_plus_d
419
+
420
+ return(C)
421
+
422
+ def integral_interval_probaCDF_recall(I, J, E):
423
+ """
424
+ Integral of the probability of distances over the interval J.
425
+ Compute the integral $\int_{y \in J} Fbar_y(dist(y,I)) dy$.
426
+ This is the *integral* i.e. the sum (not the mean)
427
+
428
+ :param I: a single (non empty) predicted interval
429
+ :param J: ground truth (non empty) interval
430
+ :param E: the affiliation/influence zone for J
431
+ :return: the integral $\int_{y \in J} Fbar_y(dist(y,I)) dy$
432
+ """
433
+ # I and J are single intervals (not generic sets)
434
+ # E is the outside affiliation interval of J (even for recall!)
435
+ # (in particular J \subset E)
436
+ #
437
+ # J is the portion of the ground truth affiliated to I
438
+ # I is a predicted interval (can be outside E possibly since it's recall)
439
+ def f(J_cut):
440
+ if J_cut is None:
441
+ return(0)
442
+ else:
443
+ return integral_mini_interval_Precall_CDFmethod(I, J_cut, E)
444
+
445
+ # If J_middle is fully included into I, it is the
446
+ # integral of 1 on the interval J_middle, so it's |J_middle|
447
+ def f0(J_middle):
448
+ if J_middle is None:
449
+ return(0)
450
+ else:
451
+ return(max(J_middle) - min(J_middle))
452
+
453
+ cut_into_three = cut_into_three_func(J, I) # it's J that we cut into 3, depending on the position w.r.t I
454
+ # since we integrate over J this time.
455
+ #
456
+ # Distance for now, not the mean:
457
+ # Distance left: Between cut_into_three[0] and the point min(I)
458
+ d_left = f(cut_into_three[0])
459
+ # Distance middle: Between cut_into_three[1] = J inter I, and I
460
+ d_middle = f0(cut_into_three[1])
461
+ # Distance right: Between cut_into_three[2] and the point max(I)
462
+ d_right = f(cut_into_three[2])
463
+ # It's an integral so summable
464
+ return(d_left + d_middle + d_right)
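
A short sketch of the basic interval helpers defined at the top of this file; the values in the comments follow directly from the definitions above:

# Illustrative calls of the basic interval helpers in this module.
from evaluation.affiliation._integral_interval import (
    interval_intersection,
    interval_length,
    cut_into_three_func,
)

print(interval_length((1, 3)))                 # 2
print(interval_intersection((1, 3), (2, 4)))   # (2, 3)
print(interval_intersection((1, 2), (3, 4)))   # None (disjoint intervals)
print(cut_into_three_func((1, 3), (2, 4)))     # ((1, 2), (2, 3), None): before J, inside J, after J
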
evaluation/affiliation/_single_ground_truth_event.py ADDED
@@ -0,0 +1,68 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ import math
4
+ from ._affiliation_zone import (
5
+ get_all_E_gt_func,
6
+ affiliation_partition)
7
+ from ._integral_interval import (
8
+ integral_interval_distance,
9
+ integral_interval_probaCDF_precision,
10
+ integral_interval_probaCDF_recall,
11
+ interval_length,
12
+ sum_interval_lengths)
13
+
14
+ def affiliation_precision_distance(Is = [(1,2),(3,4),(5,6)], J = (2,5.5)):
15
+ """
16
+ Compute the individual average distance from Is to a single ground truth J
17
+
18
+ :param Is: list of predicted events within the affiliation zone of J
19
+ :param J: couple representing the start and stop of a ground truth interval
20
+ :return: individual average precision directed distance number
21
+ """
22
+ if all([I is None for I in Is]): # no prediction in the current area
23
+ return(math.nan) # undefined
24
+ return(sum([integral_interval_distance(I, J) for I in Is]) / sum_interval_lengths(Is))
25
+
26
+ def affiliation_precision_proba(Is = [(1,2),(3,4),(5,6)], J = (2,5.5), E = (0,8)):
27
+ """
28
+ Compute the individual precision probability from Is to a single ground truth J
29
+
30
+ :param Is: list of predicted events within the affiliation zone of J
31
+ :param J: couple representing the start and stop of a ground truth interval
32
+ :param E: couple representing the start and stop of the zone of affiliation of J
33
+ :return: individual precision probability in [0, 1], or math.nan if undefined
34
+ """
35
+ if all([I is None for I in Is]): # no prediction in the current area
36
+ return(math.nan) # undefined
37
+ return(sum([integral_interval_probaCDF_precision(I, J, E) for I in Is]) / sum_interval_lengths(Is))
38
+
39
+ def affiliation_recall_distance(Is = [(1,2),(3,4),(5,6)], J = (2,5.5)):
40
+ """
41
+ Compute the individual average distance from a single J to the predictions Is
42
+
43
+ :param Is: list of predicted events within the affiliation zone of J
44
+ :param J: couple representing the start and stop of a ground truth interval
45
+ :return: individual average recall directed distance number
46
+ """
47
+ Is = [I for I in Is if I is not None] # filter possible None in Is
48
+ if len(Is) == 0: # there is no prediction in the current area
49
+ return(math.inf)
50
+ E_gt_recall = get_all_E_gt_func(Is, (-math.inf, math.inf)) # here from the point of view of the predictions
51
+ Js = affiliation_partition([J], E_gt_recall) # partition of J depending on proximity with Is
52
+ return(sum([integral_interval_distance(J[0], I) for I, J in zip(Is, Js)]) / interval_length(J))
53
+
54
+ def affiliation_recall_proba(Is = [(1,2),(3,4),(5,6)], J = (2,5.5), E = (0,8)):
55
+ """
56
+ Compute the individual recall probability from a single ground truth J to Is
57
+
58
+ :param Is: list of predicted events within the affiliation zone of J
59
+ :param J: couple representing the start and stop of a ground truth interval
60
+ :param E: couple representing the start and stop of the zone of affiliation of J
61
+ :return: individual recall probability in [0, 1]
62
+ """
63
+ Is = [I for I in Is if I is not None] # filter possible None in Is
64
+ if len(Is) == 0: # there is no prediction in the current area
65
+ return(0)
66
+ E_gt_recall = get_all_E_gt_func(Is, E) # here from the point of view of the predictions
67
+ Js = affiliation_partition([J], E_gt_recall) # partition of J depending on proximity with Is
68
+ return(sum([integral_interval_probaCDF_recall(I, J[0], E) for I, J in zip(Is, Js)]) / interval_length(J))
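
As a usage sketch, the per-event measures above can be called directly on the toy defaults from their signatures; no exact numbers are asserted here, the point is only the direction of each measure:

# Illustrative call of the per-event affiliation measures on the toy defaults above.
from evaluation.affiliation._single_ground_truth_event import (
    affiliation_precision_distance,
    affiliation_recall_distance,
    affiliation_precision_proba,
    affiliation_recall_proba,
)

Is = [(1, 2), (3, 4), (5, 6)]   # predictions affiliated with one ground truth event
J = (2, 5.5)                    # the ground truth event
E = (0, 8)                      # its affiliation zone

print(affiliation_precision_distance(Is, J))   # average directed distance from Is to J
print(affiliation_recall_distance(Is, J))      # average directed distance from J to Is
print(affiliation_precision_proba(Is, J, E))   # precision probability in [0, 1]
print(affiliation_recall_proba(Is, J, E))      # recall probability in [0, 1]
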
evaluation/affiliation/generics.py ADDED
@@ -0,0 +1,135 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ from itertools import groupby
4
+ from operator import itemgetter
5
+ import math
6
+ import gzip
7
+ import glob
8
+ import os
9
+
10
+ def convert_vector_to_events(vector = [0, 1, 1, 0, 0, 1, 0]):
11
+ """
12
+ Convert a binary vector (indicating 1 for the anomalous instances)
13
+ to a list of events. The events are considered as durations,
14
+ i.e. setting 1 at index i corresponds to an anomalous interval [i, i+1).
15
+
16
+ :param vector: a list of elements belonging to {0, 1}
17
+ :return: a list of couples, each couple representing the start and stop of
18
+ each event
19
+ """
20
+ positive_indexes = [idx for idx, val in enumerate(vector) if val > 0]
21
+ events = []
22
+ for k, g in groupby(enumerate(positive_indexes), lambda ix : ix[0] - ix[1]):
23
+ cur_cut = list(map(itemgetter(1), g))
24
+ events.append((cur_cut[0], cur_cut[-1]))
25
+
26
+ # Consistent conversion in case of range anomalies (for indexes):
27
+ # A positive index i is considered as the interval [i, i+1),
28
+ # so the last index should be moved by 1
29
+ events = [(x, y+1) for (x,y) in events]
30
+
31
+ return(events)
32
+
33
+ def infer_Trange(events_pred, events_gt):
34
+ """
35
+ Given the list of events events_pred and events_gt, get the
36
+ smallest possible Trange corresponding to the start and stop indexes
37
+ of the whole series.
38
+ Trange will not influence the measure of distances, but will impact the
39
+ measures of probabilities.
40
+
41
+ :param events_pred: a list of couples corresponding to predicted events
42
+ :param events_gt: a list of couples corresponding to ground truth events
43
+ :return: a couple corresponding to the smallest range containing the events
44
+ """
45
+ if len(events_gt) == 0:
46
+ raise ValueError('The gt events should contain at least one event')
47
+ if len(events_pred) == 0:
48
+ # empty prediction, base Trange only on events_gt (which is non empty)
49
+ return(infer_Trange(events_gt, events_gt))
50
+
51
+ min_pred = min([x[0] for x in events_pred])
52
+ min_gt = min([x[0] for x in events_gt])
53
+ max_pred = max([x[1] for x in events_pred])
54
+ max_gt = max([x[1] for x in events_gt])
55
+ Trange = (min(min_pred, min_gt), max(max_pred, max_gt))
56
+ return(Trange)
57
+
58
+ def has_point_anomalies(events):
59
+ """
60
+ Checking whether events contain point anomalies, i.e.
61
+ events starting and stopping at the same time.
62
+
63
+ :param events: a list of couples corresponding to predicted events
64
+ :return: True if the events have any point anomalies, False otherwise
65
+ """
66
+ if len(events) == 0:
67
+ return(False)
68
+ return(min([x[1] - x[0] for x in events]) == 0)
69
+
70
+ def _sum_wo_nan(vec):
71
+ """
72
+ Sum of elements, ignoring math.isnan ones
73
+
74
+ :param vec: vector of floating numbers
75
+ :return: sum of the elements, ignoring math.isnan ones
76
+ """
77
+ vec_wo_nan = [e for e in vec if not math.isnan(e)]
78
+ return(sum(vec_wo_nan))
79
+
80
+ def _len_wo_nan(vec):
81
+ """
82
+ Count of elements, ignoring math.isnan ones
83
+
84
+ :param vec: vector of floating numbers
85
+ :return: count of the elements, ignoring math.isnan ones
86
+ """
87
+ vec_wo_nan = [e for e in vec if not math.isnan(e)]
88
+ return(len(vec_wo_nan))
89
+
90
+ def read_gz_data(filename = 'data/machinetemp_groundtruth.gz'):
91
+ """
92
+ Load a file compressed with gz, such that each line of the
93
+ file is either 0 (representing a normal instance) or 1 (representing
94
+ an anomalous instance).
95
+ :param filename: file path to the gz compressed file
96
+ :return: list of integers with either 0 or 1
97
+ """
98
+ with gzip.open(filename, 'rb') as f:
99
+ content = f.read().splitlines()
100
+ content = [int(x) for x in content]
101
+ return(content)
102
+
103
+ def read_all_as_events():
104
+ """
105
+ Load the files contained in the folder `data/` and convert
106
+ to events. The length of the series is kept.
107
+ The convention for the file name is: `dataset_algorithm.gz`
108
+ :return: two dictionaries:
109
+ - the first containing the list of events for each dataset and algorithm,
110
+ - the second containing the range of the series for each dataset
111
+ """
112
+ filepaths = glob.glob('data/*.gz')
113
+ datasets = dict()
114
+ Tranges = dict()
115
+ for filepath in filepaths:
116
+ vector = read_gz_data(filepath)
117
+ events = convert_vector_to_events(vector)
118
+ # ad hoc cut for those files
119
+ cut_filepath = (os.path.split(filepath)[1]).split('_')
120
+ data_name = cut_filepath[0]
121
+ algo_name = (cut_filepath[1]).split('.')[0]
122
+ if not data_name in datasets:
123
+ datasets[data_name] = dict()
124
+ Tranges[data_name] = (0, len(vector))
125
+ datasets[data_name][algo_name] = events
126
+ return(datasets, Tranges)
127
+
128
+ def f1_func(p, r):
129
+ """
130
+ Compute the f1 function
131
+ :param p: precision numeric value
132
+ :param r: recall numeric value
133
+ :return: f1 numeric value
134
+ """
135
+ return(2*p*r/(p+r))
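A short sketch showing how the helpers above chain together (illustrative only; the expected values follow directly from the definitions):

    from evaluation.affiliation.generics import convert_vector_to_events, infer_Trange, f1_func

    events_pred = convert_vector_to_events([0, 1, 1, 0, 0, 1, 0])  # [(1, 3), (5, 6)]
    events_gt   = convert_vector_to_events([0, 1, 1, 1, 0, 0, 0])  # [(1, 4)]
    print(infer_Trange(events_pred, events_gt))                    # (1, 6)
    print(f1_func(0.8, 0.5))                                       # 2*0.8*0.5/1.3 ≈ 0.615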
evaluation/affiliation/metrics.py ADDED
@@ -0,0 +1,116 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ from .generics import (
4
+ infer_Trange,
5
+ has_point_anomalies,
6
+ _len_wo_nan,
7
+ _sum_wo_nan,
8
+ read_all_as_events)
9
+ from ._affiliation_zone import (
10
+ get_all_E_gt_func,
11
+ affiliation_partition)
12
+ from ._single_ground_truth_event import (
13
+ affiliation_precision_distance,
14
+ affiliation_recall_distance,
15
+ affiliation_precision_proba,
16
+ affiliation_recall_proba)
17
+
18
+ def test_events(events):
19
+ """
20
+ Verify the validity of the input events
21
+ :param events: list of events, each represented by a couple (start, stop)
22
+ :return: None. Raise an error for incorrect formed or non ordered events
23
+ """
24
+ if type(events) is not list:
25
+ raise TypeError('Input `events` should be a list of couples')
26
+ if not all([type(x) is tuple for x in events]):
27
+ raise TypeError('Input `events` should be a list of tuples')
28
+ if not all([len(x) == 2 for x in events]):
29
+ raise ValueError('Input `events` should be a list of couples (start, stop)')
30
+ if not all([x[0] <= x[1] for x in events]):
31
+ raise ValueError('Input `events` should be a list of couples (start, stop) with start <= stop')
32
+ if not all([events[i][1] < events[i+1][0] for i in range(len(events) - 1)]):
33
+ raise ValueError('Couples of input `events` should be disjoint and ordered')
34
+
35
+ def pr_from_events(events_pred, events_gt, Trange):
36
+ """
37
+ Compute the affiliation metrics including the precision/recall in [0,1],
38
+ along with the individual precision/recall distances and probabilities
39
+
40
+ :param events_pred: list of predicted events, each represented by a couple
41
+ indicating the start and the stop of the event
42
+ :param events_gt: list of ground truth events, each represented by a couple
43
+ indicating the start and the stop of the event
44
+ :param Trange: range of the series where events_pred and events_gt are included,
45
+ represented as a couple (start, stop)
46
+ :return: dictionary with precision, recall, and the individual metrics
47
+ """
48
+ # testing the inputs
49
+ test_events(events_pred)
50
+ test_events(events_gt)
51
+
52
+ # other tests
53
+ minimal_Trange = infer_Trange(events_pred, events_gt)
54
+ if not Trange[0] <= minimal_Trange[0]:
55
+ raise ValueError('`Trange` should include all the events')
56
+ if not minimal_Trange[1] <= Trange[1]:
57
+ raise ValueError('`Trange` should include all the events')
58
+
59
+ if len(events_gt) == 0:
60
+ raise ValueError('Input `events_gt` should have at least one event')
61
+
62
+ if has_point_anomalies(events_pred) or has_point_anomalies(events_gt):
63
+ raise ValueError('Cannot manage point anomalies currently')
64
+
65
+ if Trange is None:
66
+ # Set as default, but Trange should be indicated if probabilities are used
67
+ raise ValueError('Trange should be indicated (or inferred with the `infer_Trange` function')
68
+
69
+ E_gt = get_all_E_gt_func(events_gt, Trange)
70
+ aff_partition = affiliation_partition(events_pred, E_gt)
71
+
72
+ # Computing precision distance
73
+ d_precision = [affiliation_precision_distance(Is, J) for Is, J in zip(aff_partition, events_gt)]
74
+
75
+ # Computing recall distance
76
+ d_recall = [affiliation_recall_distance(Is, J) for Is, J in zip(aff_partition, events_gt)]
77
+
78
+ # Computing precision
79
+ p_precision = [affiliation_precision_proba(Is, J, E) for Is, J, E in zip(aff_partition, events_gt, E_gt)]
80
+
81
+ # Computing recall
82
+ p_recall = [affiliation_recall_proba(Is, J, E) for Is, J, E in zip(aff_partition, events_gt, E_gt)]
83
+
84
+ if _len_wo_nan(p_precision) > 0:
85
+ p_precision_average = _sum_wo_nan(p_precision) / _len_wo_nan(p_precision)
86
+ else:
87
+ p_precision_average = p_precision[0] # math.nan
88
+ p_recall_average = sum(p_recall) / len(p_recall)
89
+
90
+ dict_out = dict({'Affiliation_Precision': p_precision_average,
91
+ 'Affiliation_Recall': p_recall_average,
92
+ 'individual_precision_probabilities': p_precision,
93
+ 'individual_recall_probabilities': p_recall,
94
+ 'individual_precision_distances': d_precision,
95
+ 'individual_recall_distances': d_recall})
96
+ return(dict_out)
97
+
98
+ def produce_all_results():
99
+ """
100
+ Produce the affiliation precision/recall for all files
101
+ contained in the `data` repository
102
+ :return: a dictionary indexed by data names, each containing a dictionary
103
+ indexed by algorithm names, each containing the results of the affiliation
104
+ metrics (precision, recall, individual probabilities and distances)
105
+ """
106
+ datasets, Tranges = read_all_as_events() # read all the events in folder `data`
107
+ results = dict()
108
+ for data_name in datasets.keys():
109
+ results_data = dict()
110
+ for algo_name in datasets[data_name].keys():
111
+ if algo_name != 'groundtruth':
112
+ results_data[algo_name] = pr_from_events(datasets[data_name][algo_name],
113
+ datasets[data_name]['groundtruth'],
114
+ Tranges[data_name])
115
+ results[data_name] = results_data
116
+ return(results)
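An end-to-end sketch of pr_from_events (illustrative only; the events below are made up, and they satisfy the checks in test_events: tuples, start <= stop, disjoint, ordered, no point anomalies):

    from evaluation.affiliation.generics import infer_Trange
    from evaluation.affiliation.metrics import pr_from_events

    events_pred = [(10, 20), (40, 60)]
    events_gt = [(12, 25), (45, 50)]
    Trange = infer_Trange(events_pred, events_gt)  # or the full series range, e.g. (0, 100)

    out = pr_from_events(events_pred, events_gt, Trange)
    print(out['Affiliation_Precision'], out['Affiliation_Recall'])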
evaluation/basic_metrics.py ADDED
The diff for this file is too large to render. See raw diff
 
evaluation/metrics.py ADDED
@@ -0,0 +1,379 @@
1
+ import sys
2
+ import time
3
+ from .basic_metrics import basic_metricor, generate_curve
4
+ from statsmodels.tsa.stattools import acf
5
+ from scipy.signal import argrelextrema
6
+ import numpy as np
7
+ import multiprocessing
8
+
9
+ import multiprocessing
10
+ import numpy as np
11
+ import torch
12
+ from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
13
+ from functools import partial
14
+ from tqdm import tqdm
15
+ import time
16
+
17
+ # ============== Parallelized Affiliation ==============
18
+
19
+ def _compute_auc_roc(labels, score):
20
+ grader = basic_metricor()
21
+ try:
22
+ return grader.metric_ROC(labels, score)
23
+ except Exception:
24
+ return 0.0
25
+
26
+ def _compute_auc_pr(labels, score):
27
+ grader = basic_metricor()
28
+ try:
29
+ return grader.metric_PR(labels, score)
30
+ except Exception:
31
+ return 0.0
32
+
33
+ def _compute_vus(labels, score, slidingWindow, version):
34
+ try:
35
+ _, _, _, _, _, _, VUS_ROC, VUS_PR = generate_curve(labels.astype(int), score, slidingWindow, version)
36
+ return VUS_ROC, VUS_PR
37
+ except Exception:
38
+ return 0.0, 0.0
39
+
40
+ def _compute_pointf1(labels, score):
41
+ # print("Evaluating F1 standard...")
42
+ grader = basic_metricor()
43
+ try:
44
+ # print("Using chunked parallel F1 computation...")
45
+ return grader.metric_standard_F1_chunked(
46
+ true_labels=labels,
47
+ anomaly_scores=score,
48
+ chunk_size=25, # Process 25 thresholds per chunk
49
+ num_workers=4 # Use 4 parallel workers
50
+ )
51
+ except Exception:
52
+ # print("F1 standard computation failed, returning zeros.")
53
+ return {'F1': 0.0, 'Precision': 0.0, 'Recall': 0.0}
54
+
55
+ def _compute_pointf1pa(labels, score):
56
+ grader = basic_metricor()
57
+ try:
58
+ return grader.metric_PointF1PA_chunked(
59
+ label=labels,
60
+ score=score,
61
+ chunk_size=30, # Process 30 quantiles per chunk
62
+ num_workers=6 # Use 6 parallel workers
63
+ )
64
+ except Exception:
65
+ return {'F1_PA': 0.0, 'P_PA': 0.0, 'R_PA': 0.0}
66
+
67
+ def _compute_affiliation(labels, score):
68
+ grader = basic_metricor()
69
+ try:
70
+ return grader.metric_Affiliation(labels, score)
71
+ except Exception:
72
+ return 0.0, 0.0, 0.0
73
+
74
+ def _compute_t_score(labels, score):
75
+ grader = basic_metricor()
76
+ try:
77
+ return grader.metric_F1_T(labels, score)
78
+ except Exception:
79
+ return {'F1_T': 0.0, 'P_T': 0.0, 'R_T': 0.0}
80
+
81
+ def _compute_f1_t(labels, score):
82
+ grader = basic_metricor()
83
+ try:
84
+ # metric_F1_T is called with use_parallel=True here so it can use its own
85
+ # internal parallelization; it returns a dict with F1_T / P_T / R_T entries
86
+ return grader.metric_F1_T(labels, score, use_parallel=True)
87
+ except Exception:
88
+ # Always return a dict to keep downstream code consistent
89
+ return {'F1_T': 0.0, 'P_T': 0.0, 'R_T': 0.0}
90
+
91
+ def _run_task(func, args):
92
+ return func(*args)
93
+
94
+
95
+ def get_metrics_optimized(score, labels, slidingWindow=100, pred=None, version='opt', thre=250):
96
+ """
97
+ Fully optimized metrics computation with proper parallelization
98
+ """
99
+ metrics = {}
100
+ start_total = time.time()
101
+
102
+ # Ensure proper data types to avoid float/integer issues
103
+ labels = np.asarray(labels, dtype=int)
104
+ score = np.asarray(score, dtype=float)
105
+
106
+ # Determine optimal number of workers based on CPU count and workload
107
+ n_cores = multiprocessing.cpu_count()
108
+
109
+ # For threshold-iterating functions (affiliation and F1_T)
110
+ # Use more workers since they have heavy loops
111
+ heavy_workers = min(n_cores - 2, 8) # Leave some cores for system
112
+
113
+ # For simple metrics
114
+ light_workers = min(n_cores // 2, 8)
115
+
116
+ print(f"Using {heavy_workers} workers for heavy metrics, {light_workers} for light metrics")
117
+
118
+ # Start the heavy computations first (they take longest)
119
+ print("Starting heavy computations (Affiliation and F1_T)...")
120
+ heavy_start = time.time()
121
+ grader = basic_metricor()
122
+ with ProcessPoolExecutor(max_workers=2) as main_executor:
123
+ # Launch the two heaviest computations with their own internal parallelization
124
+ affiliation_future = main_executor.submit(
125
+ grader._compute_affiliation_parallel,
126
+ labels,
127
+ score,
128
+ num_workers=heavy_workers
129
+ )
130
+
131
+ # t_score_future = main_executor.submit(
132
+ # grader.metric_F1_T_fast,
133
+ # labels,
134
+ # score,
135
+ # num_workers=heavy_workers*2
136
+ # )
137
+ #
138
+ # While heavy computations are running, compute light metrics
139
+ print("Computing light metrics in parallel...")
140
+ light_start = time.time()
141
+
142
+ with ThreadPoolExecutor(max_workers=light_workers) as light_executor:
143
+ light_futures = {
144
+ 'auc_roc': light_executor.submit(_compute_auc_roc, labels, score),
145
+ 'auc_pr': light_executor.submit(_compute_auc_pr, labels, score),
146
+ 'vus': light_executor.submit(_compute_vus, labels, score, slidingWindow, version),
147
+ 'pointf1': light_executor.submit(_compute_pointf1, labels, score),
148
+ 'pointf1pa': light_executor.submit(_compute_pointf1pa, labels, score),
149
+ 'f1_t': light_executor.submit(_compute_f1_t, labels, score)
150
+ }
151
+
152
+ # Collect light metric results as they complete
153
+ light_results = {}
154
+ for name, future in light_futures.items():
155
+ try:
156
+ light_results[name] = future.result()
157
+ print(f" ✓ {name} completed")
158
+ except Exception as e:
159
+ print(f" ✗ {name} failed: {e}")
160
+ light_results[name] = None
161
+
162
+ print(f"Light metrics completed in {time.time() - light_start:.2f}s")
163
+
164
+ # Wait for heavy computations to complete
165
+ print("Waiting for heavy computations...")
166
+
167
+ try:
168
+ Affiliation_F, Affiliation_P, Affiliation_R = affiliation_future.result()
169
+ print(f" ✓ Affiliation completed")
170
+ except Exception as e:
171
+ print(f" ✗ Affiliation failed: {e}")
172
+ Affiliation_F, Affiliation_P, Affiliation_R = 0.0, 0.0, 0.0
173
+
174
+ # try:
175
+ # T_score = t_score_future.result()
176
+ # print(f" ✓ F1_T completed")
177
+ # except Exception as e:
178
+ # print(f" ✗ F1_T failed: {e}")
179
+ # T_score = {'F1_T': 0.0, 'P_T': 0.0, 'R_T': 0.0}
180
+
181
+ print(f"Heavy metrics completed in {time.time() - heavy_start:.2f}s")
182
+
183
+ # Unpack light results
184
+ AUC_ROC = light_results.get('auc_roc', 0.0)
185
+ AUC_PR = light_results.get('auc_pr', 0.0)
186
+ VUS_result = light_results.get('vus', (0.0, 0.0))
187
+ if isinstance(VUS_result, tuple):
188
+ VUS_ROC, VUS_PR = VUS_result
189
+ else:
190
+ VUS_ROC, VUS_PR = 0.0, 0.0
191
+ # print("HERE IS POINTF1: ")
192
+ # print(light_results.get('pointf1',))
193
+ # sys.exit()
194
+ PointF1 = light_results.get('pointf1', {'F1': 0.0, 'Precision': 0.0, 'Recall': 0.0})
195
+ PointF1PA = light_results.get('pointf1pa', {'F1_PA': 0.0, 'P_PA': 0.0, 'R_PA': 0.0})
196
+ T_score = light_results.get('f1_t', {'F1_T': 0.0, 'P_T': 0.0, 'R_T': 0.0})
197
+ # Safeguard: if upstream returned a tuple (e.g., from an older fallback), coerce to dict
198
+ if isinstance(T_score, tuple):
199
+ try:
200
+ T_score = {'F1_T': T_score[0], 'P_T': T_score[1], 'R_T': T_score[2]}
201
+ except Exception:
202
+ T_score = {'F1_T': 0.0, 'P_T': 0.0, 'R_T': 0.0}
203
+
204
+ # Build final metrics dictionary
205
+ metrics['AUC-PR'] = AUC_PR
206
+ metrics['AUC-ROC'] = AUC_ROC
207
+ metrics['VUS-PR'] = VUS_PR
208
+ metrics['VUS-ROC'] = VUS_ROC
209
+
210
+ metrics['Standard-F1'] = PointF1.get('F1', 0.0)
211
+ metrics['Standard-Precision'] = PointF1.get('Precision', 0.0)
212
+ metrics['Standard-Recall'] = PointF1.get('Recall', 0.0)
213
+
214
+ metrics['PA-F1'] = PointF1PA.get('F1_PA', 0.0)
215
+ metrics['PA-Precision'] = PointF1PA.get('P_PA', 0.0)
216
+ metrics['PA-Recall'] = PointF1PA.get('R_PA', 0.0)
217
+
218
+ metrics['Affiliation-F'] = Affiliation_F
219
+ metrics['Affiliation-P'] = Affiliation_P
220
+ metrics['Affiliation-R'] = Affiliation_R
221
+
222
+ metrics['F1_T'] = T_score.get('F1_T', 0.0)
223
+ metrics['Precision_T'] = T_score.get('P_T', 0.0)
224
+ metrics['Recall_T'] = T_score.get('R_T', 0.0)
225
+
226
+ print(f"\nTotal computation time: {time.time() - start_total:.2f}s")
227
+
228
+ return metrics
229
+
230
+
231
+ def get_metrics(score, labels, slidingWindow=100, pred=None, version='opt', thre=250):
232
+ metrics = {}
233
+
234
+ # Ensure proper data types to avoid float/integer issues
235
+ labels = np.asarray(labels, dtype=int)
236
+ score = np.asarray(score, dtype=float)
237
+
238
+ '''
239
+ Threshold Independent
240
+ '''
241
+ grader = basic_metricor()
242
+ # AUC_ROC, Precision, Recall, PointF1, PointF1PA, Rrecall, ExistenceReward, OverlapReward, Rprecision, RF, Precision_at_k = grader.metric_new(labels, score, pred, plot_ROC=False)
243
+ try:
244
+ AUC_ROC = grader.metric_ROC(labels, score)
245
+ except Exception:
246
+ AUC_ROC = 0.0
247
+ try:
248
+ AUC_PR = grader.metric_PR(labels, score)
249
+ except Exception:
250
+ AUC_PR = 0.0
251
+
252
+ # R_AUC_ROC, R_AUC_PR, _, _, _ = grader.RangeAUC(labels=labels, score=score, window=slidingWindow, plot_ROC=True)
253
+ try:
254
+ _, _, _, _, _, _,VUS_ROC, VUS_PR = generate_curve(labels.astype(int), score, slidingWindow, version, )
255
+ except Exception:
256
+ VUS_ROC, VUS_PR = 0.0, 0.0
257
+
258
+ '''
259
+ Threshold Dependent
260
+ if pred is None --> use the oracle threshold
261
+ '''
262
+
263
+ PointF1 = grader.metric_standard_F1(labels, score,)
264
+ PointF1PA = grader.metric_PointF1PA(labels, score,)
265
+ # EventF1PA = grader.metric_EventF1PA(labels, score,)
266
+ # RF1 = grader.metric_RF1(labels, score,)
267
+ try:
268
+ Affiliation_F, Affiliation_P, Affiliation_R = grader.metric_Affiliation(labels, score)
269
+ except Exception:
270
+ Affiliation_F, Affiliation_P, Affiliation_R = 0.0, 0.0, 0.0
271
+ T_score = grader.metric_F1_T(labels, score)
272
+
273
+ metrics['AUC-PR'] = AUC_PR
274
+ metrics['AUC-ROC'] = AUC_ROC
275
+ metrics['VUS-PR'] = VUS_PR
276
+ metrics['VUS-ROC'] = VUS_ROC
277
+
278
+ metrics['Standard-F1'] = PointF1['F1']
279
+ metrics['Standard-Precision'] = PointF1['Precision']
280
+ metrics['Standard-Recall'] = PointF1['Recall']
281
+ metrics['PA-F1'] = PointF1PA['F1_PA']
282
+ metrics['PA-Precision'] = PointF1PA['P_PA']
283
+ metrics['PA-Recall'] = PointF1PA['R_PA']
284
+ # metrics['Event-based-F1'] = EventF1PA
285
+ # metrics['R-based-F1'] = RF1
286
+ metrics['Affiliation-F'] = Affiliation_F
287
+ metrics['Affiliation-P'] = Affiliation_P
288
+ metrics['Affiliation-R'] = Affiliation_R
289
+
290
+ metrics['F1_T'] = T_score['F1_T']
291
+ metrics['Precision_T'] = T_score['P_T']
292
+ metrics['Recall_T'] = T_score['R_T']
293
+
294
+ return metrics
295
+
296
+
297
+ def get_metrics_pred(score, labels, pred, slidingWindow=100):
298
+ metrics = {}
299
+
300
+ # Ensure proper data types to avoid float/integer issues
301
+ labels = np.asarray(labels, dtype=int)
302
+ score = np.asarray(score, dtype=float)
303
+ pred = np.asarray(pred, dtype=int)
304
+
305
+ grader = basic_metricor()
306
+
307
+ PointF1 = grader.standard_F1(labels, score, preds=pred)
308
+ PointF1PA = grader.metric_PointF1PA(labels, score, preds=pred)
309
+ EventF1PA = grader.metric_EventF1PA(labels, score, preds=pred)
310
+ RF1 = grader.metric_RF1(labels, score, preds=pred)
311
+ Affiliation_F, Affiliation_P, Affiliation_R = grader.metric_Affiliation(labels, score, preds=pred)
312
+ VUS_R, VUS_P, VUS_F = grader.metric_VUS_pred(labels, preds=pred, windowSize=slidingWindow)
313
+
314
+ metrics['Standard-F1'] = PointF1['F1']
315
+ metrics['Standard-Precision'] = PointF1['Precision']
316
+ metrics['Standard-Recall'] = PointF1['Recall']
317
+ metrics['PA-F1'] = PointF1PA
318
+ metrics['Event-based-F1'] = EventF1PA
319
+ metrics['R-based-F1'] = RF1
320
+ metrics['Affiliation-F'] = Affiliation_F
321
+ metrics['Affiliation-P'] = Affiliation_P
322
+ metrics['Affiliation-R'] = Affiliation_R
323
+
324
+ metrics['VUS-Recall'] = VUS_R
325
+ metrics['VUS-Precision'] = VUS_P
326
+ metrics['VUS-F'] = VUS_F
327
+
328
+ return metrics
329
+
330
+ def find_length_rank(data, rank=1):
331
+ data = data.squeeze()
332
+ if len(data.shape) > 1:
333
+ return 0
334
+ if rank == 0:
335
+ return 1
336
+ data = data[: min(20000, len(data))]
337
+
338
+ base = 3
339
+ auto_corr = acf(data, nlags=400, fft=True)[base:]
340
+
341
+ # plot_acf(data, lags=400, fft=True)
342
+ # plt.xlabel('Lags')
343
+ # plt.ylabel('Autocorrelation')
344
+ # plt.title('Autocorrelation Function (ACF)')
345
+ # plt.savefig('/data/liuqinghua/code/ts/TSAD-AutoML/AutoAD_Solution/candidate_pool/cd_diagram/ts_acf.png')
346
+
347
+ local_max = argrelextrema(auto_corr, np.greater)[0]
348
+
349
+ # print('auto_corr: ', auto_corr)
350
+ # print('local_max: ', local_max)
351
+
352
+ try:
353
+ # max_local_max = np.argmax([auto_corr[lcm] for lcm in local_max])
354
+ sorted_local_max = np.argsort([auto_corr[lcm] for lcm in local_max])[::-1] # Descending order (largest autocorrelation first)
355
+ max_local_max = sorted_local_max[0] # Default
356
+ if rank == 1:
357
+ max_local_max = sorted_local_max[0]
358
+ if rank == 2:
359
+ for i in sorted_local_max[1:]:
360
+ if i > sorted_local_max[0]:
361
+ max_local_max = i
362
+ break
363
+ if rank == 3:
364
+ id_tmp = 1
365
+ for i in sorted_local_max[1:]:
366
+ if i > sorted_local_max[0]:
367
+ id_tmp = i
368
+ break
369
+ for i in sorted_local_max[id_tmp:]:
370
+ if i > sorted_local_max[id_tmp]:
371
+ max_local_max = i
372
+ break
373
+ # print('sorted_local_max: ', sorted_local_max)
374
+ # print('max_local_max: ', max_local_max)
375
+ if local_max[max_local_max] < 3 or local_max[max_local_max] > 300:
376
+ return 125
377
+ return local_max[max_local_max] + base
378
+ except Exception:
379
+ return 125
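A minimal sketch of calling get_metrics on synthetic data (illustrative only; it assumes the evaluation package and its dependencies, including basic_metrics and statsmodels, are importable):

    import numpy as np
    from evaluation.metrics import get_metrics, find_length_rank

    rng = np.random.default_rng(0)
    data = np.sin(np.linspace(0, 60, 1000)) + 0.05 * rng.standard_normal(1000)
    labels = np.zeros(1000, dtype=int)
    labels[300:320] = 1                  # one injected anomalous segment
    score = rng.random(1000)
    score[300:320] += 1.0                # higher scores inside the anomaly

    slidingWindow = find_length_rank(data, rank=1)
    results = get_metrics(score, labels, slidingWindow=slidingWindow)
    print(results['AUC-ROC'], results['VUS-ROC'], results['Affiliation-F'])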
evaluation/visualize.py ADDED
@@ -0,0 +1,99 @@
1
+ from basic_metrics import metricor
2
+ import matplotlib.pyplot as plt
3
+ import numpy as np
4
+ import matplotlib.patches as mpatches
5
+
6
+ def plotFig(data, label, score, slidingWindow, fileName, modelName, plotRange=None):
7
+ grader = metricor()
8
+
9
+ R_AUC, R_AP, R_fpr, R_tpr, R_prec = grader.RangeAUC(labels=label, score=score, window=slidingWindow, plot_ROC=True) #
10
+
11
+ L, fpr, tpr= grader.metric_new(label, score, plot_ROC=True)
12
+ precision, recall, AP = grader.metric_PR(label, score)
13
+
14
+ range_anomaly = grader.range_convers_new(label)
15
+ # print(range_anomaly)
16
+
17
+ # max_length = min(len(score),len(data), 20000)
18
+ max_length = len(score)
19
+
20
+ if plotRange is None:
21
+ plotRange = [0,max_length]
22
+
23
+ fig3 = plt.figure(figsize=(12, 10), constrained_layout=True)
24
+ gs = fig3.add_gridspec(3, 4)
25
+
26
+
27
+ f3_ax1 = fig3.add_subplot(gs[0, :-1])
28
+ plt.tick_params(labelbottom=False)
29
+
30
+ plt.plot(data[:max_length],'k')
31
+ for r in range_anomaly:
32
+ if r[0]==r[1]:
33
+ plt.plot(r[0],data[r[0]],'r.')
34
+ else:
35
+ plt.plot(range(r[0],r[1]+1),data[range(r[0],r[1]+1)],'r')
36
+ # plt.xlim([0,max_length])
37
+ plt.xlim(plotRange)
38
+
39
+
40
+ # L = [auc, precision, recall, f, Rrecall, ExistenceReward,
41
+ # OverlapReward, Rprecision, Rf, precision_at_k]
42
+ f3_ax2 = fig3.add_subplot(gs[1, :-1])
43
+ # plt.tick_params(labelbottom=False)
44
+ L1 = [ '%.2f' % elem for elem in L]
45
+ plt.plot(score[:max_length])
46
+ plt.hlines(np.mean(score)+3*np.std(score),0,max_length,linestyles='--',color='red')
47
+ plt.ylabel('score')
48
+ # plt.xlim([0,max_length])
49
+ plt.xlim(plotRange)
50
+
51
+
52
+ #plot the data
53
+ f3_ax3 = fig3.add_subplot(gs[2, :-1])
54
+ index = ( label + 2*(score > (np.mean(score)+3*np.std(score))))
55
+ cf = lambda x: 'k' if x==0 else ('r' if x == 1 else ('g' if x == 2 else 'b') )
56
+ cf = np.vectorize(cf)
57
+
58
+ color = cf(index[:max_length])
59
+ black_patch = mpatches.Patch(color = 'black', label = 'TN')
60
+ red_patch = mpatches.Patch(color = 'red', label = 'FN')
61
+ green_patch = mpatches.Patch(color = 'green', label = 'FP')
62
+ blue_patch = mpatches.Patch(color = 'blue', label = 'TP')
63
+ plt.scatter(np.arange(max_length), data[:max_length], c=color, marker='.')
64
+ plt.legend(handles = [black_patch, red_patch, green_patch, blue_patch], loc= 'best')
65
+ # plt.xlim([0,max_length])
66
+ plt.xlim(plotRange)
67
+
68
+
69
+ f3_ax4 = fig3.add_subplot(gs[0, -1])
70
+ plt.plot(fpr, tpr)
71
+ # plt.plot(R_fpr,R_tpr)
72
+ # plt.title('R_AUC='+str(round(R_AUC,3)))
73
+ plt.xlabel('FPR')
74
+ plt.ylabel('TPR')
75
+ # plt.legend(['ROC','Range-ROC'])
76
+
77
+ # f3_ax5 = fig3.add_subplot(gs[1, -1])
78
+ # plt.plot(recall, precision)
79
+ # plt.plot(R_tpr[:-1],R_prec) # I add (1,1) to (TPR, FPR) at the end !!!
80
+ # plt.xlabel('Recall')
81
+ # plt.ylabel('Precision')
82
+ # plt.legend(['PR','Range-PR'])
83
+
84
+ # print('AUC=', L1[0])
85
+ # print('F=', L1[3])
86
+
87
+ plt.suptitle(fileName + ' window='+str(slidingWindow) +' '+ modelName
88
+ +'\nAUC='+L1[0]+' R_AUC='+str(round(R_AUC,2))+' Precision='+L1[1]+ ' Recall='+L1[2]+' F='+L1[3]
89
+ + ' ExistenceReward='+L1[5]+' OverlapReward='+L1[6]
90
+ +'\nAP='+str(round(AP,2))+' R_AP='+str(round(R_AP,2))+' Precision@k='+L1[9]+' Rprecision='+L1[7] + ' Rrecall='+L1[4] +' Rf='+L1[8]
91
+ )
92
+
93
+ def printResult(data, label, score, slidingWindow, fileName, modelName):
94
+ grader = metricor()
95
+ R_AUC = grader.RangeAUC(labels=label, score=score, window=slidingWindow, plot_ROC=False) #
96
+ L= grader.metric_new(label, score, plot_ROC=False)
97
+ L.append(R_AUC)
98
+ return L
99
+
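A sketch of driving plotFig (illustrative only; since visualize.py imports basic_metrics with a top-level import, it is assumed here to be run with evaluation/ on sys.path):

    import sys
    sys.path.append('evaluation')        # so `from basic_metrics import metricor` resolves

    import numpy as np
    import matplotlib.pyplot as plt
    from visualize import plotFig

    n = 2000
    data = np.sin(np.linspace(0, 60, n)) + 0.05 * np.random.randn(n)
    label = np.zeros(n, dtype=int)
    label[800:850] = 1                   # injected anomaly segment
    score = np.abs(np.random.randn(n))   # stand-in anomaly score
    plotFig(data, label, score, slidingWindow=100, fileName='demo', modelName='toy')
    plt.show()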
model_wrapper.py ADDED
@@ -0,0 +1,532 @@
1
+ import numpy as np
2
+ import math
3
+ from utils.slidingWindows import find_length_rank
4
+
5
+ Unsupervise_AD_Pool = ['FFT', 'SR', 'NORMA', 'Series2Graph', 'Sub_IForest', 'IForest', 'LOF', 'Sub_LOF', 'POLY', 'MatrixProfile', 'Sub_PCA', 'PCA', 'HBOS',
6
+ 'Sub_HBOS', 'KNN', 'Sub_KNN','KMeansAD', 'KMeansAD_U', 'KShapeAD', 'COPOD', 'CBLOF', 'COF', 'EIF', 'RobustPCA', 'Lag_Llama',
7
+ 'TimesFM', 'Chronos', 'MOMENT_ZS', 'DADA', 'Time_MOE', 'Time_RCD', 'TSPulse']
8
+ Semisupervise_AD_Pool = ['Left_STAMPi', 'SAND', 'MCD', 'Sub_MCD', 'OCSVM', 'Sub_OCSVM', 'AutoEncoder', 'CNN', 'LSTMAD', 'TranAD', 'USAD', 'OmniAnomaly',
9
+ 'AnomalyTransformer', 'TimesNet', 'FITS', 'Donut', 'OFA', 'MOMENT_FT', 'M2N2', ]
10
+
11
+ def run_Unsupervise_AD(model_name, training_data, testing_data, **kwargs):
12
+ # Extract data_index if present, but don't pass it to all functions
13
+ data_index = kwargs.pop('data_index', None)
14
+
15
+ function_name = f'run_{model_name}'
16
+ function_to_call = globals()[function_name]
17
+
18
+
19
+ # Only pass data_index to functions that need it
20
+ if 'Reconstruction' in model_name:
21
+ results = function_to_call(testing_data, data_index, **kwargs)
22
+ else:
23
+ results = function_to_call(testing_data, **kwargs)
24
+
25
+ return results
26
+
27
+ def run_Semisupervise_AD(model_name, data_train, data_test, **kwargs):
28
+ try:
29
+ function_name = f'run_{model_name}'
30
+ function_to_call = globals()[function_name]
31
+ results = function_to_call(data_train, data_test, **kwargs)
32
+ return results
33
+ except KeyError:
34
+ error_message = f"Model function '{function_name}' is not defined."
35
+ print(error_message)
36
+ return error_message
37
+ except Exception as e:
38
+ error_message = f"An error occurred while running the model '{function_name}': {str(e)}"
39
+ print(error_message)
40
+ return error_message
41
+
42
+ def run_FFT(data, ifft_parameters=5, local_neighbor_window=21, local_outlier_threshold=0.6, max_region_size=50, max_sign_change_distance=10):
43
+ from models.FFT import FFT
44
+ clf = FFT(ifft_parameters=ifft_parameters, local_neighbor_window=local_neighbor_window, local_outlier_threshold=local_outlier_threshold, max_region_size=max_region_size, max_sign_change_distance=max_sign_change_distance)
45
+ clf.fit(data)
46
+ score = clf.decision_scores_
47
+ return score.ravel()
48
+
49
+ def run_Sub_IForest(data, periodicity=1, n_estimators=100, max_features=1, n_jobs=1):
50
+ from models.IForest import IForest
51
+ slidingWindow = find_length_rank(data, rank=periodicity)
52
+ clf = IForest(slidingWindow=slidingWindow, n_estimators=n_estimators, max_features=max_features, n_jobs=n_jobs)
53
+ clf.fit(data)
54
+ score = clf.decision_scores_
55
+ return score.ravel()
56
+
57
+ def run_IForest(train_data, test_data, slidingWindow=100, n_estimators=100, max_features=1, n_jobs=1):
58
+ from models.IForest import IForest
59
+ clf = IForest(slidingWindow=slidingWindow, n_estimators=n_estimators, max_features=max_features, n_jobs=n_jobs)
60
+ clf.fit(train_data)
61
+ score = clf.decision_function(test_data)
62
+ # score = clf.decision_scores_
63
+ return score.ravel()
64
+
65
+ def run_Sub_LOF(data, periodicity=1, n_neighbors=30, metric='minkowski', n_jobs=1):
66
+ from models.LOF import LOF
67
+ slidingWindow = find_length_rank(data, rank=periodicity)
68
+ clf = LOF(slidingWindow=slidingWindow, n_neighbors=n_neighbors, metric=metric, n_jobs=n_jobs)
69
+ clf.fit(data)
70
+ score = clf.decision_scores_
71
+ return score.ravel()
72
+
73
+ def run_LOF(train_data, test_data, slidingWindow=1, n_neighbors=30, metric='minkowski', n_jobs=1):
74
+ from models.LOF import LOF
75
+ clf = LOF(slidingWindow=slidingWindow, n_neighbors=n_neighbors, metric=metric, n_jobs=n_jobs)
76
+ clf.fit(train_data)
77
+ score = clf.decision_function(test_data)
78
+ return score.ravel()
79
+
80
+ def run_POLY(data, periodicity=1, power=3, n_jobs=1):
81
+ from models.POLY import POLY
82
+ slidingWindow = find_length_rank(data, rank=periodicity)
83
+ clf = POLY(power=power, window = slidingWindow)
84
+ clf.fit(data)
85
+ score = clf.decision_scores_
86
+ return score.ravel()
87
+
88
+ def run_MatrixProfile(data, periodicity=1, n_jobs=1):
89
+ from models.MatrixProfile import MatrixProfile
90
+ slidingWindow = find_length_rank(data, rank=periodicity)
91
+ clf = MatrixProfile(window=slidingWindow)
92
+ clf.fit(data)
93
+ score = clf.decision_scores_
94
+ return score.ravel()
95
+
96
+ def run_Left_STAMPi(data_train, data):
97
+ from models.Left_STAMPi import Left_STAMPi
98
+ clf = Left_STAMPi(n_init_train=len(data_train), window_size=100)
99
+ clf.fit(data)
100
+ score = clf.decision_function(data)
101
+ return score.ravel()
102
+
103
+ def run_SAND(data_train, data_test, periodicity=1):
104
+ from models.SAND import SAND
105
+ slidingWindow = find_length_rank(data_test, rank=periodicity)
106
+ clf = SAND(pattern_length=slidingWindow, subsequence_length=4*(slidingWindow))
107
+ clf.fit(data_test.squeeze(), online=True, overlaping_rate=int(1.5*slidingWindow), init_length=len(data_train), alpha=0.5, batch_size=max(5*(slidingWindow), int(0.1*len(data_test))))
108
+ score = clf.decision_scores_
109
+ return score.ravel()
110
+
111
+ def run_KShapeAD(data, periodicity=1):
112
+ from models.SAND import SAND
113
+ slidingWindow = find_length_rank(data, rank=periodicity)
114
+ clf = SAND(pattern_length=slidingWindow, subsequence_length=4*(slidingWindow))
115
+ clf.fit(data.squeeze(), overlaping_rate=int(1.5*slidingWindow))
116
+ score = clf.decision_scores_
117
+ return score.ravel()
118
+
119
+ def run_Series2Graph(data, periodicity=1):
120
+ from models.Series2Graph import Series2Graph
121
+ slidingWindow = find_length_rank(data, rank=periodicity)
122
+
123
+ data = data.squeeze()
124
+ s2g = Series2Graph(pattern_length=slidingWindow)
125
+ s2g.fit(data)
126
+ query_length = 2*slidingWindow
127
+ s2g.score(query_length=query_length,dataset=data)
128
+
129
+ score = s2g.decision_scores_
130
+ score = np.array([score[0]]*math.ceil(query_length//2) + list(score) + [score[-1]]*(query_length//2))
131
+ return score.ravel()
132
+
133
+ def run_Sub_PCA(train_data, test_data, periodicity=1, n_components=None, n_jobs=1):
134
+ from models.PCA import PCA
135
+ slidingWindow = find_length_rank(train_data, rank=periodicity)
136
+ clf = PCA(slidingWindow = slidingWindow, n_components=n_components)
137
+ clf.fit(train_data)
138
+ score = clf.decision_function(test_data)
139
+ return score.ravel()
140
+
141
+ def run_PCA(train_data, test_data, slidingWindow=100, n_components=None, n_jobs=1):
142
+ from models.PCA import PCA
143
+ clf = PCA(slidingWindow = slidingWindow, n_components=n_components)
144
+ clf.fit(train_data)
145
+ score = clf.decision_function(test_data)
146
+ return score.ravel()
147
+
148
+ def run_NORMA(data, periodicity=1, clustering='hierarchical', n_jobs=1):
149
+ from models.NormA import NORMA
150
+ slidingWindow = find_length_rank(data, rank=periodicity)
151
+ clf = NORMA(pattern_length=slidingWindow, nm_size=3*slidingWindow, clustering=clustering)
152
+ clf.fit(data)
153
+ score = clf.decision_scores_
154
+ score = np.array([score[0]]*math.ceil((slidingWindow-1)/2) + list(score) + [score[-1]]*((slidingWindow-1)//2))
155
+ if len(score) > len(data):
156
+ start = len(score) - len(data)
157
+ score = score[start:]
158
+ return score.ravel()
159
+
160
+ def run_Sub_HBOS(data, periodicity=1, n_bins=10, tol=0.5, n_jobs=1):
161
+ from models.HBOS import HBOS
162
+ slidingWindow = find_length_rank(data, rank=periodicity)
163
+ clf = HBOS(slidingWindow=slidingWindow, n_bins=n_bins, tol=tol)
164
+ clf.fit(data)
165
+ score = clf.decision_scores_
166
+ return score.ravel()
167
+
168
+ def run_HBOS(data, slidingWindow=1, n_bins=10, tol=0.5, n_jobs=1):
169
+ from models.HBOS import HBOS
170
+ clf = HBOS(slidingWindow=slidingWindow, n_bins=n_bins, tol=tol)
171
+ clf.fit(data)
172
+ score = clf.decision_scores_
173
+ return score.ravel()
174
+
175
+ def run_Sub_OCSVM(data_train, data_test, kernel='rbf', nu=0.5, periodicity=1, n_jobs=1):
176
+ from models.OCSVM import OCSVM
177
+ slidingWindow = find_length_rank(data_test, rank=periodicity)
178
+ clf = OCSVM(slidingWindow=slidingWindow, kernel=kernel, nu=nu)
179
+ clf.fit(data_train)
180
+ score = clf.decision_function(data_test)
181
+ return score.ravel()
182
+
183
+ def run_OCSVM(data_train, data_test, kernel='rbf', nu=0.5, slidingWindow=1, n_jobs=1):
184
+ from models.OCSVM import OCSVM
185
+ clf = OCSVM(slidingWindow=slidingWindow, kernel=kernel, nu=nu)
186
+ clf.fit(data_train)
187
+ score = clf.decision_function(data_test)
188
+ return score.ravel()
189
+
190
+ def run_Sub_MCD(data_train, data_test, support_fraction=None, periodicity=1, n_jobs=1):
191
+ from models.MCD import MCD
192
+ slidingWindow = find_length_rank(data_test, rank=periodicity)
193
+ clf = MCD(slidingWindow=slidingWindow, support_fraction=support_fraction)
194
+ clf.fit(data_train)
195
+ score = clf.decision_function(data_test)
196
+ return score.ravel()
197
+
198
+ def run_MCD(data_train, data_test, support_fraction=None, slidingWindow=1, n_jobs=1):
199
+ from models.MCD import MCD
200
+ clf = MCD(slidingWindow=slidingWindow, support_fraction=support_fraction)
201
+ clf.fit(data_train)
202
+ score = clf.decision_function(data_test)
203
+ return score.ravel()
204
+
205
+ def run_Sub_KNN(data, n_neighbors=10, method='largest', periodicity=1, n_jobs=1):
206
+ from models.KNN import KNN
207
+ slidingWindow = find_length_rank(data, rank=periodicity)
208
+ clf = KNN(slidingWindow=slidingWindow, n_neighbors=n_neighbors,method=method, n_jobs=n_jobs)
209
+ clf.fit(data)
210
+ score = clf.decision_scores_
211
+ return score.ravel()
212
+
213
+ def run_KNN(data, slidingWindow=1, n_neighbors=10, method='largest', n_jobs=1):
214
+ from models.KNN import KNN
215
+ clf = KNN(slidingWindow=slidingWindow, n_neighbors=n_neighbors, method=method, n_jobs=n_jobs)
216
+ clf.fit(data)
217
+ score = clf.decision_scores_
218
+ return score.ravel()
219
+
220
+ def run_KMeansAD(data, n_clusters=20, window_size=20, n_jobs=1):
221
+ from models.KMeansAD import KMeansAD
222
+ clf = KMeansAD(k=n_clusters, window_size=window_size, stride=1, n_jobs=n_jobs)
223
+ score = clf.fit_predict(data)
224
+ return score.ravel()
225
+
226
+ def run_KMeansAD_U(data, n_clusters=20, periodicity=1,n_jobs=1):
227
+ from models.KMeansAD import KMeansAD
228
+ slidingWindow = find_length_rank(data, rank=periodicity)
229
+ clf = KMeansAD(k=n_clusters, window_size=slidingWindow, stride=1, n_jobs=n_jobs)
230
+ score = clf.fit_predict(data)
231
+ return score.ravel()
232
+
233
+ def run_COPOD(data, n_jobs=1):
234
+ from models.COPOD import COPOD
235
+ clf = COPOD(n_jobs=n_jobs)
236
+ clf.fit(data)
237
+ score = clf.decision_scores_
238
+ return score.ravel()
239
+
240
+ def run_CBLOF(data, n_clusters=8, alpha=0.9, n_jobs=1):
241
+ from models.CBLOF import CBLOF
242
+ clf = CBLOF(n_clusters=n_clusters, alpha=alpha, n_jobs=n_jobs)
243
+ clf.fit(data)
244
+ score = clf.decision_scores_
245
+ return score.ravel()
246
+
247
+ def run_COF(data, n_neighbors=30):
248
+ from models.COF import COF
249
+ clf = COF(n_neighbors=n_neighbors)
250
+ clf.fit(data)
251
+ score = clf.decision_scores_
252
+ return score.ravel()
253
+
254
+ def run_EIF(data, n_trees=100):
255
+ from models.EIF import EIF
256
+ clf = EIF(n_trees=n_trees)
257
+ clf.fit(data)
258
+ score = clf.decision_scores_
259
+ return score.ravel()
260
+
261
+ def run_RobustPCA(data, max_iter=1000):
262
+ from models.RobustPCA import RobustPCA
263
+ clf = RobustPCA(max_iter=max_iter)
264
+ clf.fit(data)
265
+ score = clf.decision_scores_
266
+ return score.ravel()
267
+
268
+ def run_SR(data, periodicity=1):
269
+ from models.SR import SR
270
+ slidingWindow = find_length_rank(data, rank=periodicity)
271
+ return SR(data, window_size=slidingWindow)
272
+
273
+ def run_AutoEncoder(data_train, data_test, window_size=100, hidden_neurons=[64, 32], n_jobs=1):
274
+ from models.AE import AutoEncoder
275
+ clf = AutoEncoder(slidingWindow=window_size, hidden_neurons=hidden_neurons, batch_size=128, epochs=50)
276
+ clf.fit(data_train)
277
+ score = clf.decision_function(data_test)
278
+ return score.ravel()
279
+
280
+ def run_CNN(data_train, data_test, window_size=100, num_channel=[32, 32, 40], lr=0.0008, n_jobs=1):
281
+ from models.CNN import CNN
282
+ clf = CNN(window_size=window_size, num_channel=num_channel, feats=data_test.shape[1], lr=lr, batch_size=128)
283
+ clf.fit(data_train)
284
+ score = clf.decision_function(data_test)
285
+ return score.ravel()
286
+
287
+ def run_LSTMAD(data_train, data_test, window_size=100, lr=0.0008):
288
+ from models.LSTMAD import LSTMAD
289
+ clf = LSTMAD(window_size=window_size, pred_len=1, lr=lr, feats=data_test.shape[1], batch_size=128)
290
+ clf.fit(data_train)
291
+ score = clf.decision_function(data_test)
292
+ return score.ravel()
293
+
294
+ def run_TranAD(data_train, data_test, win_size=10, lr=1e-3):
295
+ from models.TranAD import TranAD
296
+ clf = TranAD(win_size=win_size, feats=data_test.shape[1], lr=lr)
297
+ clf.fit(data_train)
298
+ score = clf.decision_function(data_test)
299
+ return score.ravel()
300
+
301
+ def run_AnomalyTransformer(data_train, data_test, win_size=100, lr=1e-4, batch_size=128):
302
+ from models.AnomalyTransformer import AnomalyTransformer
303
+ clf = AnomalyTransformer(win_size=win_size, input_c=data_test.shape[1], lr=lr, batch_size=batch_size)
304
+ clf.fit(data_train)
305
+ score = clf.decision_function(data_test)
306
+ return score.ravel()
307
+
308
+ def run_OmniAnomaly(data_train, data_test, win_size=100, lr=0.002):
309
+ from models.OmniAnomaly import OmniAnomaly
310
+ clf = OmniAnomaly(win_size=win_size, feats=data_test.shape[1], lr=lr)
311
+ clf.fit(data_train)
312
+ score = clf.decision_function(data_test)
313
+ return score.ravel()
314
+
315
+ def run_USAD(data_train, data_test, win_size=5, lr=1e-4):
316
+ from models.USAD import USAD
317
+ clf = USAD(win_size=win_size, feats=data_test.shape[1], lr=lr)
318
+ clf.fit(data_train)
319
+ score = clf.decision_function(data_test)
320
+ return score.ravel()
321
+
322
+ def run_Donut(data_train, data_test, win_size=120, lr=1e-4, batch_size=128):
323
+ from models.Donut import Donut
324
+ clf = Donut(win_size=win_size, input_c=data_test.shape[1], lr=lr, batch_size=batch_size)
325
+ clf.fit(data_train)
326
+ score = clf.decision_function(data_test)
327
+ return score.ravel()
328
+
329
+ def run_TimesNet(data_train, data_test, win_size=96, lr=1e-4):
330
+ from models.TimesNet import TimesNet
331
+ clf = TimesNet(win_size=win_size, enc_in=data_test.shape[1], lr=lr, epochs=50)
332
+ clf.fit(data_train)
333
+ score = clf.decision_function(data_test)
334
+ return score.ravel()
335
+
336
+ def run_FITS(data_train, data_test, win_size=100, lr=1e-3):
337
+ from models.FITS import FITS
338
+ clf = FITS(win_size=win_size, input_c=data_test.shape[1], lr=lr, batch_size=128)
339
+ clf.fit(data_train)
340
+ score = clf.decision_function(data_test)
341
+ return score.ravel()
342
+
343
+ def run_OFA(data_train, data_test, win_size=100, batch_size = 64):
344
+ from models.OFA import OFA
345
+ clf = OFA(win_size=win_size, enc_in=data_test.shape[1], epochs=10, batch_size=batch_size)
346
+ clf.fit(data_train)
347
+ score = clf.decision_function(data_test)
348
+ return score.ravel()
349
+
350
+ def run_Lag_Llama(data, win_size=96, batch_size=64):
351
+ from models.Lag_Llama import Lag_Llama
352
+ clf = Lag_Llama(win_size=win_size, input_c=data.shape[1], batch_size=batch_size)
353
+ clf.fit(data)
354
+ score = clf.decision_scores_
355
+ return score.ravel()
356
+
357
+ def run_Chronos(data, win_size=50, batch_size=64):
358
+ from models.Chronos import Chronos
359
+ clf = Chronos(win_size=win_size, prediction_length=1, input_c=1, model_size='base', batch_size=batch_size)
360
+ data = data.reshape(-1,1)
361
+ clf.fit(data)
362
+ score = clf.decision_scores_
363
+ return score.ravel()
364
+
365
+ def run_TimesFM(data, win_size=96):
366
+ from models.TimesFM import TimesFM
367
+ clf = TimesFM(win_size=win_size)
368
+ data_normalized = (data - np.mean(data, axis=0)) / np.std(data, axis=0)
369
+ data_normalized = data_normalized.reshape(-1,1)
370
+ clf.fit(data_normalized)
371
+ #normalizd data:
372
+ score = clf.decision_scores_
373
+ return score.ravel()
374
+
375
+ def run_MOMENT_ZS(data, win_size=256):
376
+ from models.MOMENT import MOMENT
377
+ clf = MOMENT(win_size=win_size, input_c=1)
378
+ data = data.reshape(-1,1)
379
+ # Zero shot
380
+ clf.zero_shot(data)
381
+ score = clf.decision_scores_
382
+ return score.ravel()
383
+
384
+ def run_MOMENT_FT(data_train, data_test, win_size=256):
385
+ from models.MOMENT import MOMENT
386
+ clf = MOMENT(win_size=win_size, input_c=data_test.shape[1])
387
+
388
+ # Finetune
389
+ clf.fit(data_train)
390
+ score = clf.decision_function(data_test)
391
+ return score.ravel()
392
+
393
+ def run_M2N2(
394
+ data_train, data_test, win_size=12, stride=12,
395
+ batch_size=64, epochs=100, latent_dim=16,
396
+ lr=1e-3, ttlr=1e-3, normalization='Detrend',
397
+ gamma=0.99, th=0.9, valid_size=0.2, infer_mode='online'
398
+ ):
399
+ from models.M2N2 import M2N2
400
+ clf = M2N2(
401
+ win_size=win_size, stride=stride,
402
+ num_channels=data_test.shape[1],
403
+ batch_size=batch_size, epochs=epochs,
404
+ latent_dim=latent_dim,
405
+ lr=lr, ttlr=ttlr,
406
+ normalization=normalization,
407
+ gamma=gamma, th=th, valid_size=valid_size,
408
+ infer_mode=infer_mode
409
+ )
410
+ clf.fit(data_train)
411
+ score = clf.decision_function(data_test)
412
+ return score.ravel()
413
+
414
+ def run_DADA(data_test, device=0, win_size=100, batch_size=32):
415
+ from models.DADA import DADA
416
+ clf = DADA(device=device, win_size=win_size, batch_size=batch_size)
417
+ score = clf.zero_shot(data_test)
418
+ return score.ravel()
419
+
420
+ def run_Time_MOE(data, device=0, win_size=64, batch_size=32):
421
+ from models.time_moe import Time_MOE
422
+ clf = Time_MOE(device=device, win_size=win_size, batch_size=batch_size)
423
+ score = clf.zero_shot(data)
424
+ return score.ravel()
425
+
426
+ def run_Time_RCD(data, **kwargs):
427
+ Multi = kwargs.get('Multi', False)
428
+ win_size = kwargs.get('win_size', 5000)
429
+ batch_size = kwargs.get('batch_size', 64)
430
+ random_mask = kwargs.get('random_mask', 'random_mask')
431
+ size = kwargs.get('size', 'full')
432
+ device = kwargs.get('device', '2') # Extract device parameter
433
+ """
434
+ Wrapper function for Time_RCD model
435
+ """
436
+ from models.TimeRCD import TimeRCDPretrainTester
437
+ from models.time_rcd.time_rcd_config import TimeRCDConfig, default_config
438
+
439
+ config = default_config
440
+ if Multi:
441
+ if size == 'small':
442
+ if random_mask == 'random_mask':
443
+ checkpoint_path = 'checkpoints/dataset_10_20.pth'
444
+ else:
445
+ checkpoint_path = 'checkpoints/full_mask_10_20.pth'
446
+ config.ts_config.patch_size = 16
447
+ else:
448
+ if random_mask == 'random_mask':
449
+ checkpoint_path = 'checkpoints/dataset_15_56.pth'
450
+ else:
451
+ checkpoint_path = 'checkpoints/full_mask_15_56.pth'
452
+ config.ts_config.patch_size = 32
453
+ else:
454
+ checkpoint_path = 'checkpoints/full_mask_anomaly_head_pretrain_checkpoint_best.pth'
455
+ config.ts_config.patch_size = 16
456
+
457
+ config.cuda_devices = device # Use the device parameter properly
458
+ print("Using CUDA device:", config.cuda_devices)
459
+ config.win_size = win_size
460
+ config.batch_size = batch_size
461
+ config.ts_config.num_features = data.shape[1]
462
+ print(f"Checkpoint path: {checkpoint_path}")
463
+ cls = TimeRCDPretrainTester(checkpoint_path, config)
464
+ score_list, logit_list = cls.zero_shot(data)
465
+
466
+ # Concatenate across batches robustly to avoid inhomogeneous shape errors
467
+ score = np.concatenate([np.asarray(s).reshape(-1) for s in score_list], axis=0)
468
+ logit = np.concatenate([np.asarray(l).reshape(-1) for l in logit_list], axis=0)
469
+
470
+ return score, logit
471
+
472
+
473
+ def run_TSPulse(data, win_size=256, batch_size=64, prediction_mode=None, aggregation_length=64,
474
+ aggr_function="max", smoothing_length=8, least_significant_scale=0.01,
475
+ least_significant_score=0.1, device=None):
476
+ """
477
+ Wrapper function for TSPulse anomaly detection model
478
+
479
+ Parameters
480
+ ----------
481
+ data : numpy.ndarray
482
+ Time series data of shape (n_samples, n_features)
483
+ win_size : int, default=256
484
+ Window size (for compatibility, not directly used by TSPulse)
485
+ batch_size : int, default=64
486
+ Batch size for processing
487
+ prediction_mode : list, optional
488
+ List of prediction modes. If None, uses default time and frequency reconstruction
489
+ aggregation_length : int, default=64
490
+ Length for aggregation of scores
491
+ aggr_function : str, default="max"
492
+ Aggregation function ("max", "mean", "median")
493
+ smoothing_length : int, default=8
494
+ Length for smoothing the anomaly scores
495
+ least_significant_scale : float, default=0.01
496
+ Minimum scale for significance
497
+ least_significant_score : float, default=0.1
498
+ Minimum score for significance
499
+ device : str, optional
500
+ Device to use ("cuda" or "cpu"). Auto-detected if None.
501
+
502
+ Returns
503
+ -------
504
+ numpy.ndarray
505
+ Anomaly scores of shape (n_samples,)
506
+ """
507
+ from models.TSPulse import run_TSPulse as tspulse_runner
508
+
509
+ # Prepare kwargs for TSPulse
510
+ kwargs = {
511
+ 'batch_size': batch_size,
512
+ 'aggregation_length': aggregation_length,
513
+ 'aggr_function': aggr_function,
514
+ 'smoothing_length': smoothing_length,
515
+ 'least_significant_scale': least_significant_scale,
516
+ 'least_significant_score': least_significant_score,
517
+ }
518
+
519
+ if prediction_mode is not None:
520
+ kwargs['prediction_mode'] = prediction_mode
521
+ if device is not None:
522
+ kwargs['device'] = device
523
+
524
+ try:
525
+ # Run TSPulse anomaly detection
526
+ score = tspulse_runner(data, **kwargs)
527
+ return score.ravel()
528
+ except Exception as e:
529
+ print(f"Warning: TSPulse failed with error: {str(e)}")
530
+ print("Falling back to random scores")
531
+ # Return random scores as fallback
532
+ return np.random.random(len(data)) * 0.1
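A sketch of the dispatch layer above (illustrative only; model names must appear in the pools at the top of the file, and the corresponding models/ dependencies must be installed):

    import numpy as np
    from model_wrapper import run_Unsupervise_AD, run_Semisupervise_AD

    train = np.random.randn(2000, 1)     # shape (n_samples, n_features)
    test = np.random.randn(1000, 1)

    # Unsupervised: only the test series is scored.
    score_copod = run_Unsupervise_AD('COPOD', train, test)

    # Semi-supervised: fit on (presumed normal) train data, then score the test data.
    score_ocsvm = run_Semisupervise_AD('OCSVM', train, test)

    print(score_copod.shape, score_ocsvm.shape)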
models/.DS_Store ADDED
Binary file (6.15 kB). View file
 
models/AE.py ADDED
@@ -0,0 +1,407 @@
1
+ """
2
+ This function is adapted from [pyod] by [yzhao062]
3
+ Original source: [https://github.com/yzhao062/pyod]
4
+ """
5
+
6
+ from __future__ import division
7
+ from __future__ import print_function
8
+
9
+ import numpy as np
10
+ import torch, math
11
+ from sklearn.utils import check_array
12
+ from sklearn.utils.validation import check_is_fitted
13
+ from torch import nn
14
+ from sklearn.preprocessing import MinMaxScaler
15
+
16
+ from .feature import Window
17
+ from .base import BaseDetector
18
+ from ..utils.stat_models import pairwise_distances_no_broadcast
19
+ from ..utils.dataset import TSDataset
20
+ from ..utils.utility import get_activation_by_name
21
+
22
+ class InnerAutoencoder(nn.Module):
23
+ def __init__(self,
24
+ n_features,
25
+ hidden_neurons=(128, 64),
26
+ dropout_rate=0.2,
27
+ batch_norm=True,
28
+ hidden_activation='relu'):
29
+
30
+ # initialize the super class
31
+ super(InnerAutoencoder, self).__init__()
32
+
33
+ # save the default values
34
+ self.n_features = n_features
35
+ self.dropout_rate = dropout_rate
36
+ self.batch_norm = batch_norm
37
+ self.hidden_activation = hidden_activation
38
+
39
+ # create the dimensions for the input and hidden layers
40
+ self.layers_neurons_encoder_ = [self.n_features, *hidden_neurons]
41
+ self.layers_neurons_decoder_ = self.layers_neurons_encoder_[::-1]
42
+
43
+ # get the object for the activations functions
44
+ self.activation = get_activation_by_name(hidden_activation)
45
+
46
+ # initialize encoder and decoder as a sequential
47
+ self.encoder = nn.Sequential()
48
+ self.decoder = nn.Sequential()
49
+
50
+ # fill the encoder sequential with hidden layers
51
+ for idx, layer in enumerate(self.layers_neurons_encoder_[:-1]):
52
+
53
+ # create a linear layer of neurons
54
+ self.encoder.add_module(
55
+ "linear" + str(idx),
56
+ torch.nn.Linear(layer,self.layers_neurons_encoder_[idx + 1]))
57
+
58
+ # add a batch norm per layer if wanted (leave out first layer)
59
+ if batch_norm:
60
+ self.encoder.add_module("batch_norm" + str(idx),
61
+ nn.BatchNorm1d(self.layers_neurons_encoder_[idx + 1]))
62
+
63
+ # create the activation
64
+ self.encoder.add_module(self.hidden_activation + str(idx),
65
+ self.activation)
66
+
67
+ # create a dropout layer
68
+ self.encoder.add_module("dropout" + str(idx),
69
+ torch.nn.Dropout(dropout_rate))
70
+
71
+ # fill the decoder layer
72
+ for idx, layer in enumerate(self.layers_neurons_decoder_[:-1]):
73
+
74
+ # create a linear layer of neurons
75
+ self.decoder.add_module(
76
+ "linear" + str(idx),
77
+ torch.nn.Linear(layer,self.layers_neurons_decoder_[idx + 1]))
78
+
79
+ # create a batch norm per layer if wanted (only if it is not the
80
+ # last layer)
81
+ if batch_norm and idx < len(self.layers_neurons_decoder_[:-1]) - 1:
82
+ self.decoder.add_module("batch_norm" + str(idx),
83
+ nn.BatchNorm1d(self.layers_neurons_decoder_[idx + 1]))
84
+
85
+ # create the activation
86
+ self.decoder.add_module(self.hidden_activation + str(idx),
87
+ self.activation)
88
+
89
+ # create a dropout layer (only if it is not the last layer)
90
+ if idx < len(self.layers_neurons_decoder_[:-1]) - 1:
91
+ self.decoder.add_module("dropout" + str(idx),
92
+ torch.nn.Dropout(dropout_rate))
93
+
94
+ def forward(self, x):
95
+ # we could return the latent representation here after the encoder
96
+ # as the latent representation
97
+ x = self.encoder(x)
98
+ x = self.decoder(x)
99
+ return x
100
+
101
+ class AutoEncoder(BaseDetector):
102
+ """Auto Encoder (AE) is a type of neural networks for learning useful data
103
+ representations in an unsupervised manner. Similar to PCA, AE could be used
104
+ to detect outlying objects in the data by calculating the reconstruction
105
+ errors. See :cite:`aggarwal2015outlier` Chapter 3 for details.
106
+
107
+ Notes
108
+ -----
109
+ This is the PyTorch version of AutoEncoder.
110
+ The documentation is not finished!
111
+
112
+ Parameters
113
+ ----------
114
+ hidden_neurons : list, optional (default=[64, 32])
115
+ The number of neurons per hidden layers. So the network has the
116
+ structure as [n_features, 64, 32, 32, 64, n_features]
117
+
118
+ hidden_activation : str, optional (default='relu')
119
+ Activation function to use for hidden layers.
120
+ All hidden layers are forced to use the same type of activation.
121
+ See https://pytorch.org/docs/stable/nn.html for details.
122
+
123
+ batch_norm : boolean, optional (default=True)
124
+ Whether to apply Batch Normalization,
125
+ See https://pytorch.org/docs/stable/generated/torch.nn.BatchNorm1d.html
126
+
127
+ learning_rate : float, optional (default=1e-3)
128
+ Learning rate for the optimizer. This learning_rate is given to
129
+ an Adam optimizer (torch.optim.Adam).
130
+ See https://pytorch.org/docs/stable/generated/torch.optim.Adam.html
131
+
132
+ epochs : int, optional (default=100)
133
+ Number of epochs to train the model.
134
+
135
+ batch_size : int, optional (default=32)
136
+ Number of samples per gradient update.
137
+
138
+ dropout_rate : float in (0., 1), optional (default=0.2)
139
+ The dropout to be used across all layers.
140
+
141
+ weight_decay : float, optional (default=1e-5)
142
+ The weight decay for Adam optimizer.
143
+ See https://pytorch.org/docs/stable/generated/torch.optim.Adam.html
144
+
145
+ preprocessing : bool, optional (default=True)
146
+ If True, apply standardization on the data.
147
+
148
+ loss_fn : obj, optional (default=torch.nn.MSELoss)
149
+ Loss function instance which implements torch.nn._Loss.
150
+ One of https://pytorch.org/docs/stable/nn.html#loss-functions
151
+ or a custom loss. Custom losses are currently unstable.
152
+
153
+ verbose : int, optional (default=1)
154
+ Verbosity mode.
155
+
156
+ - 0 = silent
157
+ - 1 = progress bar
158
+ - 2 = one line per epoch.
159
+
160
+ For verbose >= 1, model summary may be printed.
161
+ !CURRENTLY NOT SUPPORTED.!
162
+
163
+ random_state : int, RandomState instance or None, optional
164
+ (default=None)
165
+ If int, random_state is the seed used by the random
166
+ number generator; If RandomState instance, random_state is the random
167
+ number generator; If None, the random number generator is the
168
+ RandomState instance used by `np.random`.
169
+ !CURRENTLY NOT SUPPORTED.!
170
+
171
+ contamination : float in (0., 0.5), optional (default=0.1)
172
+ The amount of contamination of the data set, i.e.
173
+ the proportion of outliers in the data set. When fitting this is used
174
+ to define the threshold on the decision function.
175
+
176
+ Attributes
177
+ ----------
178
+ encoding_dim_ : int
179
+ The number of neurons in the encoding layer.
180
+
181
+ compression_rate_ : float
182
+ The ratio between the original feature and
183
+ the number of neurons in the encoding layer.
184
+
185
+ model_ : torch.nn.Module
186
+ The underlying AutoEncoder network (PyTorch).
187
+
188
+ best_model_dict : dict
189
+ The state dict of the best model observed during training.
190
+
191
+ decision_scores_ : numpy array of shape (n_samples,)
192
+ The outlier scores of the training data.
193
+ The higher, the more abnormal. Outliers tend to have higher
194
+ scores. This value is available once the detector is
195
+ fitted.
196
+
197
+ threshold_ : float
198
+ The threshold is based on ``contamination``. It is the
199
+ ``n_samples * contamination`` most abnormal samples in
200
+ ``decision_scores_``. The threshold is calculated for generating
201
+ binary outlier labels.
202
+
203
+ labels_ : int, either 0 or 1
204
+ The binary labels of the training data. 0 stands for inliers
205
+ and 1 for outliers/anomalies. It is generated by applying
206
+ ``threshold_`` on ``decision_scores_``.
207
+ """
208
+
209
+ def __init__(self,
210
+ slidingWindow=100,
211
+ hidden_neurons=None,
212
+ hidden_activation='relu',
213
+ batch_norm=True,
214
+ learning_rate=1e-3,
215
+ epochs=100,
216
+ batch_size=32,
217
+ dropout_rate=0.2,
218
+ weight_decay=1e-5,
219
+ # validation_size=0.1,
220
+ preprocessing=True,
221
+ loss_fn=None,
222
+ verbose=False,
223
+ # random_state=None,
224
+ contamination=0.1,
225
+ device=None):
226
+ super(AutoEncoder, self).__init__(contamination=contamination)
227
+
228
+ # save the initialization values
229
+ self.slidingWindow = slidingWindow
230
+ self.hidden_neurons = hidden_neurons
231
+ self.hidden_activation = hidden_activation
232
+ self.batch_norm = batch_norm
233
+ self.learning_rate = learning_rate
234
+ self.epochs = epochs
235
+ self.batch_size = batch_size
236
+ self.dropout_rate = dropout_rate
237
+ self.weight_decay = weight_decay
238
+ self.preprocessing = preprocessing
239
+ self.loss_fn = loss_fn
240
+ self.verbose = verbose
241
+ self.device = device
242
+
243
+ # create default loss functions
244
+ if self.loss_fn is None:
245
+ self.loss_fn = torch.nn.MSELoss()
246
+
247
+ # create default calculation device (support GPU if available)
248
+ if self.device is None:
249
+ self.device = torch.device(
250
+ "cuda:0" if torch.cuda.is_available() else "cpu")
251
+
252
+ # default values for the amount of hidden neurons
253
+ if self.hidden_neurons is None:
254
+ self.hidden_neurons = [64, 32]
255
+
256
+ # noinspection PyUnresolvedReferences
257
+ def fit(self, X, y=None):
258
+ """Fit detector. y is ignored in unsupervised methods.
259
+
260
+ Parameters
261
+ ----------
262
+ X : numpy array of shape (n_samples, n_features)
263
+ The input samples.
264
+
265
+ y : Ignored
266
+ Not used, present for API consistency by convention.
267
+
268
+ Returns
269
+ -------
270
+ self : object
271
+ Fitted estimator.
272
+ """
273
+ n_samples, n_features = X.shape
274
+
275
+ if n_features == 1:
276
+ # Converting time series data into matrix format
277
+ X = Window(window = self.slidingWindow).convert(X)
278
+
279
+ # validate inputs X and y (optional)
280
+ X = check_array(X)
281
+ self._set_n_classes(y)
282
+
283
+ n_samples, n_features = X.shape[0], X.shape[1]
284
+ X = MinMaxScaler(feature_range=(0,1)).fit_transform(X.T).T
285
+
286
+ # conduct standardization if needed
287
+ if self.preprocessing:
288
+ self.mean, self.std = np.mean(X, axis=0), np.std(X, axis=0)
289
+ self.std = np.where(self.std == 0, 1e-8, self.std)
290
+ train_set = TSDataset(X=X, mean=self.mean, std=self.std)
291
+ else:
292
+ train_set = TSDataset(X=X)
293
+
294
+ train_loader = torch.utils.data.DataLoader(train_set, batch_size=self.batch_size, shuffle=True, drop_last=True)
295
+
296
+ # initialize the model
297
+ self.model = InnerAutoencoder(
298
+ n_features=n_features,
299
+ hidden_neurons=self.hidden_neurons,
300
+ dropout_rate=self.dropout_rate,
301
+ batch_norm=self.batch_norm,
302
+ hidden_activation=self.hidden_activation)
303
+
304
+ # move to device and print model information
305
+ self.model = self.model.to(self.device)
306
+ if self.verbose:
307
+ print(self.model)
308
+
309
+ # train the autoencoder to find the best one
310
+ self._train_autoencoder(train_loader)
311
+
312
+ self.model.load_state_dict(self.best_model_dict)
313
+ self.decision_scores_ = self.decision_function(X)
314
+
315
+ self._process_decision_scores()
316
+ return self
317
+
318
+ def _train_autoencoder(self, train_loader):
319
+ """Internal function to train the autoencoder
320
+
321
+ Parameters
322
+ ----------
323
+ train_loader : torch dataloader
324
+ Train data.
325
+ """
326
+ optimizer = torch.optim.Adam(
327
+ self.model.parameters(), lr=self.learning_rate,
328
+ weight_decay=self.weight_decay)
329
+
330
+ self.best_loss = float('inf')
331
+ self.best_model_dict = None
332
+
333
+ for epoch in range(self.epochs):
334
+ overall_loss = []
335
+ for data, data_idx in train_loader:
336
+ data = data.to(self.device).float()
337
+ loss = self.loss_fn(data, self.model(data))
338
+
339
+ self.model.zero_grad()
340
+ loss.backward()
341
+ optimizer.step()
342
+ overall_loss.append(loss.item())
343
+ if self.verbose:
344
+ print('epoch {epoch}: training loss {train_loss} '.format(
345
+ epoch=epoch, train_loss=np.mean(overall_loss)))
346
+
347
+ # track the best model so far
348
+ if np.mean(overall_loss) <= self.best_loss:
349
+ # print("epoch {ep} is the current best; loss={loss}".format(ep=epoch, loss=np.mean(overall_loss)))
350
+ self.best_loss = np.mean(overall_loss)
351
+ self.best_model_dict = self.model.state_dict()
352
+
353
+ def decision_function(self, X):
354
+ """Predict raw anomaly score of X using the fitted detector.
355
+
356
+ The anomaly score of an input sample is computed based on different
357
+ detector algorithms. For consistency, outliers are assigned with
358
+ larger anomaly scores.
359
+
360
+ Parameters
361
+ ----------
362
+ X : numpy array of shape (n_samples, n_features)
363
+ The training input samples. Sparse matrices are accepted only
364
+ if they are supported by the base estimator.
365
+
366
+ Returns
367
+ -------
368
+ anomaly_scores : numpy array of shape (n_samples,)
369
+ The anomaly score of the input samples.
370
+ """
371
+ check_is_fitted(self, ['model', 'best_model_dict'])
372
+
373
+ n_samples, n_features = X.shape
374
+
375
+ if n_features == 1:
376
+ # Converting time series data into matrix format
377
+ X = Window(window = self.slidingWindow).convert(X)
378
+
379
+ X = check_array(X)
380
+ X = MinMaxScaler(feature_range=(0,1)).fit_transform(X.T).T
381
+
382
+ # note: shuffle must be False here so scores align with sample indices
383
+ if self.preprocessing:
384
+ dataset = TSDataset(X=X, mean=self.mean, std=self.std)
385
+ else:
386
+ dataset = TSDataset(X=X)
387
+
388
+ dataloader = torch.utils.data.DataLoader(dataset,
389
+ batch_size=self.batch_size,
390
+ shuffle=False)
391
+ # enable the evaluation mode
392
+ self.model.eval()
393
+
394
+ # construct the vector for holding the reconstruction error
395
+ outlier_scores = np.zeros([X.shape[0], ])
396
+ with torch.no_grad():
397
+ for data, data_idx in dataloader:
398
+ data_cuda = data.to(self.device).float()
399
+ # this is the outlier score
400
+ outlier_scores[data_idx] = pairwise_distances_no_broadcast(
401
+ data, self.model(data_cuda).cpu().numpy())
402
+
403
+ if outlier_scores.shape[0] < n_samples:
404
+ outlier_scores = np.array([outlier_scores[0]]*math.ceil((self.slidingWindow-1)/2) +
405
+ list(outlier_scores) + [outlier_scores[-1]]*((self.slidingWindow-1)//2))
406
+
407
+ return outlier_scores
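A minimal usage sketch for the AutoEncoder detector above (not part of the commit). The import path is an assumption based on the package layout (models/ with relative ..utils imports, mirroring the absolute TSB_AD.* import used later in DADA.py); the constructor arguments and the decision_scores_ attribute are taken from the class definition.

import numpy as np
from TSB_AD.models.AE import AutoEncoder  # assumed import path

rng = np.random.default_rng(0)
ts = np.sin(np.linspace(0, 40 * np.pi, 4000)) + 0.1 * rng.standard_normal(4000)
ts[1500:1520] += 3.0                           # injected anomaly
X = ts.reshape(-1, 1)                          # univariate series, shape (n_samples, 1)

clf = AutoEncoder(slidingWindow=100, hidden_neurons=[64, 32],
                  epochs=10, batch_size=32, contamination=0.05)
clf.fit(X)                                     # windows the series internally when n_features == 1
scores = clf.decision_scores_                  # one score per time step, higher = more anomalous
print(scores.shape, scores[1500:1520].mean() > scores.mean())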
models/CBLOF.py ADDED
@@ -0,0 +1,332 @@
1
+ """
2
+ This function is adapted from [pyod] by [yzhao062]
3
+ Original source: [https://github.com/yzhao062/pyod]
4
+ """
5
+
6
+ from __future__ import division
7
+ from __future__ import print_function
8
+ import warnings
9
+
10
+ import numpy as np
11
+ from scipy.spatial.distance import cdist
12
+ from sklearn.cluster import KMeans
13
+ from sklearn.utils import check_array
14
+ from sklearn.utils.validation import check_is_fitted
15
+ from sklearn.utils.estimator_checks import check_estimator
16
+
17
+ from ..utils.stat_models import pairwise_distances_no_broadcast
18
+ from ..utils.utility import check_parameter
19
+ from .base import BaseDetector
20
+ from ..utils.utility import zscore
21
+
22
+
23
+ class CBLOF(BaseDetector):
24
+ r"""The CBLOF operator calculates the outlier score based on cluster-based
25
+ local outlier factor.
26
+
27
+ CBLOF takes as an input the data set and the cluster model that was
28
+ generated by a clustering algorithm. It classifies the clusters into small
29
+ clusters and large clusters using the parameters alpha and beta.
30
+ The anomaly score is then calculated based on the size of the cluster the
31
+ point belongs to as well as the distance to the nearest large cluster.
32
+
33
+ Use weighting for outlier factor based on the sizes of the clusters as
34
+ proposed in the original publication. Since this might lead to unexpected
35
+ behavior (outliers close to small clusters are not found), it is disabled
36
+ by default. Outlier scores are solely computed based on their distance to
37
+ the closest large cluster center.
38
+
39
+ By default, kMeans is used as the clustering algorithm instead of the
40
+ Squeezer algorithm mentioned in the original paper for multiple reasons.
41
+
42
+ See :cite:`he2003discovering` for details.
43
+
44
+ Parameters
45
+ ----------
46
+ n_clusters : int, optional (default=8)
47
+ The number of clusters to form as well as the number of
48
+ centroids to generate.
49
+
50
+ contamination : float in (0., 0.5), optional (default=0.1)
51
+ The amount of contamination of the data set,
52
+ i.e. the proportion of outliers in the data set. Used when fitting to
53
+ define the threshold on the decision function.
54
+
55
+ clustering_estimator : Estimator, optional (default=None)
56
+ The base clustering algorithm for performing data clustering.
57
+ A valid clustering algorithm should be passed in. The estimator should
58
+ have standard sklearn APIs, fit() and predict(). The estimator should
59
+ have attributes ``labels_`` and ``cluster_centers_``.
60
+ If ``cluster_centers_`` is not in the attributes once the model is fit,
61
+ it is calculated as the mean of the samples in a cluster.
62
+
63
+ If not set, CBLOF uses KMeans for scalability. See
64
+ https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html
65
+
66
+ alpha : float in (0.5, 1), optional (default=0.9)
67
+ Coefficient for deciding small and large clusters. The ratio
68
+ of the number of samples in large clusters to the number of samples in
69
+ small clusters.
70
+
71
+ beta : int or float in (1,), optional (default=5).
72
+ Coefficient for deciding small and large clusters. For a list of
73
+ clusters sorted by size, `|C1|, |C2|, ..., |Cn|`, beta = |Ck|/|Ck-1|.
74
+
75
+ use_weights : bool, optional (default=False)
76
+ If set to True, the size of clusters are used as weights in
77
+ outlier score calculation.
78
+
79
+ check_estimator : bool, optional (default=False)
80
+ If set to True, check whether the base estimator is consistent with
81
+ sklearn standard.
82
+
83
+ .. warning::
84
+ check_estimator may throw errors with scikit-learn 0.20 above.
85
+
86
+ random_state : int, RandomState or None, optional (default=None)
87
+ If int, random_state is the seed used by the random
88
+ number generator; If RandomState instance, random_state is the random
89
+ number generator; If None, the random number generator is the
90
+ RandomState instance used by `np.random`.
91
+
92
+
93
+ Attributes
94
+ ----------
95
+ clustering_estimator_ : Estimator, sklearn instance
96
+ Base estimator for clustering.
97
+
98
+ cluster_labels_ : list of shape (n_samples,)
99
+ Cluster assignment for the training samples.
100
+
101
+ n_clusters_ : int
102
+ Actual number of clusters (possibly different from n_clusters).
103
+
104
+ cluster_sizes_ : list of shape (n_clusters_,)
105
+ The size of each cluster once fitted with the training data.
106
+
107
+ decision_scores_ : numpy array of shape (n_samples,)
108
+ The outlier scores of the training data.
109
+ The higher, the more abnormal. Outliers tend to have higher scores.
110
+ This value is available once the detector is fitted.
111
+
112
+ cluster_centers_ : numpy array of shape (n_clusters_, n_features)
113
+ The center of each cluster.
114
+
115
+ small_cluster_labels_ : list of clusters numbers
116
+ The cluster assignments belonging to small clusters.
117
+
118
+ large_cluster_labels_ : list of clusters numbers
119
+ The cluster assignments belonging to large clusters.
120
+
121
+ threshold_ : float
122
+ The threshold is based on ``contamination``. It is the
123
+ ``n_samples * contamination`` most abnormal samples in
124
+ ``decision_scores_``. The threshold is calculated for generating
125
+ binary outlier labels.
126
+
127
+ labels_ : int, either 0 or 1
128
+ The binary labels of the training data. 0 stands for inliers
129
+ and 1 for outliers/anomalies. It is generated by applying
130
+ ``threshold_`` on ``decision_scores_``.
131
+ """
132
+
133
+ def __init__(self, n_clusters=8, contamination=0.1,
134
+ clustering_estimator=None, alpha=0.9, beta=5,
135
+ use_weights=False, check_estimator=False, random_state=0,
136
+ n_jobs=1, normalize=True):
137
+ super(CBLOF, self).__init__(contamination=contamination)
138
+ self.n_clusters = n_clusters
139
+ self.clustering_estimator = clustering_estimator
140
+ self.alpha = alpha
141
+ self.beta = beta
142
+ self.use_weights = use_weights
143
+ self.check_estimator = check_estimator
144
+ self.random_state = random_state
145
+ self.normalize = normalize
146
+
147
+ # noinspection PyIncorrectDocstring
148
+ def fit(self, X, y=None):
149
+ """Fit detector. y is ignored in unsupervised methods.
150
+
151
+ Parameters
152
+ ----------
153
+ X : numpy array of shape (n_samples, n_features)
154
+ The input samples.
155
+
156
+ y : Ignored
157
+ Not used, present for API consistency by convention.
158
+
159
+ Returns
160
+ -------
161
+ self : object
162
+ Fitted estimator.
163
+ """
164
+
165
+ # validate inputs X and y (optional)
166
+ X = check_array(X)
167
+ self._set_n_classes(y)
168
+ n_samples, n_features = X.shape
169
+ if self.normalize: X = zscore(X, axis=1, ddof=1)
170
+
171
+ # check parameters
172
+ # number of clusters are default to 8
173
+ self._validate_estimator(default=KMeans(
174
+ n_clusters=self.n_clusters,
175
+ random_state=self.random_state))
176
+
177
+ self.clustering_estimator_.fit(X=X, y=y)
178
+ # Get the labels of the clustering results
179
+ # labels_ is consistent across sklearn clustering algorithms
180
+ self.cluster_labels_ = self.clustering_estimator_.labels_
181
+ self.cluster_sizes_ = np.bincount(self.cluster_labels_)
182
+
183
+ # Get the actual number of clusters
184
+ self.n_clusters_ = self.cluster_sizes_.shape[0]
185
+
186
+ if self.n_clusters_ != self.n_clusters:
187
+ warnings.warn("The chosen clustering for CBLOF forms {0} clusters"
188
+ " which is inconsistent with n_clusters ({1}).".
189
+ format(self.n_clusters_, self.n_clusters))
190
+
191
+ self._set_cluster_centers(X, n_features)
192
+ self._set_small_large_clusters(n_samples)
193
+
194
+ self.decision_scores_ = self._decision_function(X,
195
+ self.cluster_labels_)
196
+
197
+ self._process_decision_scores()
198
+ return self
199
+
200
+ def decision_function(self, X):
201
+ """Predict raw anomaly score of X using the fitted detector.
202
+
203
+ The anomaly score of an input sample is computed based on different
204
+ detector algorithms. For consistency, outliers are assigned with
205
+ larger anomaly scores.
206
+
207
+ Parameters
208
+ ----------
209
+ X : numpy array of shape (n_samples, n_features)
210
+ The training input samples. Sparse matrices are accepted only
211
+ if they are supported by the base estimator.
212
+
213
+ Returns
214
+ -------
215
+ anomaly_scores : numpy array of shape (n_samples,)
216
+ The anomaly score of the input samples.
217
+ """
218
+ check_is_fitted(self, ['decision_scores_', 'threshold_', 'labels_'])
219
+ X = check_array(X)
220
+ labels = self.clustering_estimator_.predict(X)
221
+ return self._decision_function(X, labels)
222
+
223
+ def _validate_estimator(self, default=None):
224
+ """Check the value of alpha and beta and clustering algorithm.
225
+ """
226
+ check_parameter(self.alpha, low=0, high=1, param_name='alpha',
227
+ include_left=False, include_right=False)
228
+
229
+ check_parameter(self.beta, low=1, param_name='beta',
230
+ include_left=False)
231
+
232
+ if self.clustering_estimator is not None:
233
+ self.clustering_estimator_ = self.clustering_estimator
234
+ else:
235
+ self.clustering_estimator_ = default
236
+
237
+ # make sure the base clustering algorithm is valid
238
+ if self.clustering_estimator_ is None:
239
+ raise ValueError("clustering algorithm cannot be None")
240
+
241
+ if self.check_estimator:
242
+ check_estimator(self.clustering_estimator_)
243
+
244
+ def _set_cluster_centers(self, X, n_features):
245
+ # Noted not all clustering algorithms have cluster_centers_
246
+ if hasattr(self.clustering_estimator_, 'cluster_centers_'):
247
+ self.cluster_centers_ = self.clustering_estimator_.cluster_centers_
248
+ else:
249
+ # Set the cluster center as the mean of all the samples within
250
+ # the cluster
251
+ warnings.warn("The chosen clustering for CBLOF does not have"
252
+ " the center of clusters. Calculate the center"
253
+ " as the mean of the clusters.")
254
+ self.cluster_centers_ = np.zeros([self.n_clusters_, n_features])
255
+ for i in range(self.n_clusters_):
256
+ self.cluster_centers_[i, :] = np.mean(
257
+ X[np.where(self.cluster_labels_ == i)], axis=0)
258
+
259
+ def _set_small_large_clusters(self, n_samples):
260
+ # Sort the index of clusters by the number of samples belonging to it
261
+ size_clusters = np.bincount(self.cluster_labels_)
262
+
263
+ # Sort the order from the largest to the smallest
264
+ sorted_cluster_indices = np.argsort(size_clusters * -1)
265
+
266
+ # Initialize the lists of index that fulfill the requirements by
267
+ # either alpha or beta
268
+ alpha_list = []
269
+ beta_list = []
270
+
271
+ for i in range(1, self.n_clusters_):
272
+ temp_sum = np.sum(size_clusters[sorted_cluster_indices[:i]])
273
+ if temp_sum >= n_samples * self.alpha:
274
+ alpha_list.append(i)
275
+
276
+ if size_clusters[sorted_cluster_indices[i - 1]] / size_clusters[
277
+ sorted_cluster_indices[i]] >= self.beta:
278
+ beta_list.append(i)
279
+
280
+ # Find the separation index fulfills both alpha and beta
281
+ intersection = np.intersect1d(alpha_list, beta_list)
282
+
283
+ if len(intersection) > 0:
284
+ self._clustering_threshold = intersection[0]
285
+ elif len(alpha_list) > 0:
286
+ self._clustering_threshold = alpha_list[0]
287
+ elif len(beta_list) > 0:
288
+ self._clustering_threshold = beta_list[0]
289
+ else:
290
+ raise ValueError("Could not form valid cluster separation. Please "
291
+ "change n_clusters or change clustering method")
292
+
293
+ self.small_cluster_labels_ = sorted_cluster_indices[
294
+ self._clustering_threshold:]
295
+ self.large_cluster_labels_ = sorted_cluster_indices[
296
+ 0:self._clustering_threshold]
297
+
298
+ # No need to calculate small cluster center
299
+ # self.small_cluster_centers_ = self.cluster_centers_[
300
+ # self.small_cluster_labels_]
301
+
302
+ self._large_cluster_centers = self.cluster_centers_[
303
+ self.large_cluster_labels_]
304
+
305
+ def _decision_function(self, X, labels):
306
+ # Initialize the score array
307
+ scores = np.zeros([X.shape[0], ])
308
+
309
+ small_indices = np.where(
310
+ np.isin(labels, self.small_cluster_labels_))[0]
311
+ large_indices = np.where(
312
+ np.isin(labels, self.large_cluster_labels_))[0]
313
+
314
+ if small_indices.shape[0] != 0:
315
+ # Calculate the outlier factor for the samples in small clusters
316
+ dist_to_large_center = cdist(X[small_indices, :],
317
+ self._large_cluster_centers)
318
+
319
+ scores[small_indices] = np.min(dist_to_large_center, axis=1)
320
+
321
+ if large_indices.shape[0] != 0:
322
+ # Calculate the outlier factor for the samples in large clusters
323
+ large_centers = self.cluster_centers_[labels[large_indices]]
324
+
325
+ scores[large_indices] = pairwise_distances_no_broadcast(
326
+ X[large_indices, :], large_centers)
327
+
328
+ if self.use_weights:
329
+ # Weights are calculated as the number of elements in the cluster
330
+ scores = scores * self.cluster_sizes_[labels]
331
+
332
+ return scores.ravel()
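A standalone sketch (not part of the commit) of the alpha/beta split performed in _set_small_large_clusters above, replayed on a hypothetical list of cluster sizes; only numpy is needed.

import numpy as np

size_clusters = np.array([120, 60, 8, 7, 5])      # hypothetical cluster sizes
n_samples, alpha, beta = size_clusters.sum(), 0.9, 5

sorted_idx = np.argsort(-size_clusters)           # largest cluster first
alpha_list, beta_list = [], []
for i in range(1, len(size_clusters)):
    # alpha rule: the i largest clusters already hold >= alpha of all samples
    if size_clusters[sorted_idx[:i]].sum() >= n_samples * alpha:
        alpha_list.append(i)
    # beta rule: a sharp size drop between consecutive clusters
    if size_clusters[sorted_idx[i - 1]] / size_clusters[sorted_idx[i]] >= beta:
        beta_list.append(i)

split = np.intersect1d(alpha_list, beta_list)[0]  # -> 2 for these sizes
print("large clusters:", sorted_idx[:split], "small clusters:", sorted_idx[split:])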
models/CNN.py ADDED
@@ -0,0 +1,273 @@
1
+ from typing import Dict
2
+ import torchinfo
3
+ import tqdm, math
4
+ import numpy as np
5
+ import torch
6
+ from torch import nn, optim
7
+ from torch.utils.data import DataLoader
8
+
9
+ from ..utils.utility import get_activation_by_name
10
+ from ..utils.torch_utility import EarlyStoppingTorch, get_gpu
11
+ from ..utils.dataset import ForecastDataset
12
+
13
+ class AdaptiveConcatPool1d(nn.Module):
14
+ def __init__(self):
15
+ super().__init__()
16
+ self.ap = torch.nn.AdaptiveAvgPool1d(1)
17
+ self.mp = torch.nn.AdaptiveMaxPool1d(1)  # max pool, so avg- and max-pooled features are concatenated
18
+
19
+ def forward(self, x):
20
+ return torch.cat([self.ap(x), self.mp(x)], 1)
21
+
22
+ class CNNModel(nn.Module):
23
+ def __init__(self,
24
+ n_features,
25
+ num_channel=[32, 32, 40],
26
+ kernel_size=3,
27
+ stride=1,
28
+ predict_time_steps=1,
29
+ dropout_rate=0.25,
30
+ hidden_activation='relu',
31
+ device='cpu'):
32
+
33
+ # initialize the super class
34
+ super(CNNModel, self).__init__()
35
+
36
+ # save the default values
37
+ self.n_features = n_features
38
+ self.dropout_rate = dropout_rate
39
+ self.hidden_activation = hidden_activation
40
+ self.kernel_size = kernel_size
41
+ self.stride = stride
42
+ self.predict_time_steps = predict_time_steps
43
+ self.num_channel = num_channel
44
+ self.device = device
45
+
46
+ # get the object for the activations functions
47
+ self.activation = get_activation_by_name(hidden_activation)
48
+
49
+ # build the convolutional feature extractor as a sequential
50
+ self.conv_layers = nn.Sequential()
51
+ prev_channels = self.n_features
52
+
53
+ for idx, out_channels in enumerate(self.num_channel[:-1]):
54
+ self.conv_layers.add_module(
55
+ "conv" + str(idx),
56
+ torch.nn.Conv1d(prev_channels, self.num_channel[idx + 1],
57
+ self.kernel_size, self.stride))
58
+ self.conv_layers.add_module(self.hidden_activation + str(idx),
59
+ self.activation)
60
+ self.conv_layers.add_module("pool" + str(idx), nn.MaxPool1d(kernel_size=2))
61
+ prev_channels = self.num_channel[idx + 1]  # input channels for the next conv layer
62
+
63
+ self.fc = nn.Sequential(
64
+ AdaptiveConcatPool1d(),
65
+ torch.nn.Flatten(),
66
+ torch.nn.Linear(2*self.num_channel[-1], self.num_channel[-1]),
67
+ torch.nn.ReLU(),
68
+ torch.nn.Dropout(dropout_rate),
69
+ torch.nn.Linear(self.num_channel[-1], self.n_features)
70
+ )
71
+
72
+ def forward(self, x):
73
+ b, l, c = x.shape
74
+ x = x.view(b, c, l)
75
+ x = self.conv_layers(x) # [128, feature, 23]
76
+
77
+ outputs = torch.zeros(self.predict_time_steps, b, self.n_features).to(self.device)
78
+ for t in range(self.predict_time_steps):
79
+ decoder_input = self.fc(x)
80
+ outputs[t] = torch.squeeze(decoder_input, dim=-2)
81
+
82
+ return outputs
83
+
84
+ class CNN():
85
+ def __init__(self,
86
+ window_size=100,
87
+ pred_len=1,
88
+ batch_size=128,
89
+ epochs=50,
90
+ lr=0.0008,
91
+ feats=1,
92
+ num_channel=[32, 32, 40],
93
+ validation_size=0.2):
94
+ super().__init__()
95
+ self.__anomaly_score = None
96
+
97
+ cuda = True
98
+ self.y_hats = None
99
+
100
+ self.cuda = cuda
101
+ self.device = get_gpu(self.cuda)
102
+
103
+ self.window_size = window_size
104
+ self.pred_len = pred_len
105
+ self.batch_size = batch_size
106
+ self.epochs = epochs
107
+
108
+ self.feats = feats
109
+ self.num_channel = num_channel
110
+ self.lr = lr
111
+ self.validation_size = validation_size
112
+
113
+ self.model = CNNModel(n_features=feats, num_channel=num_channel, predict_time_steps=self.pred_len, device=self.device).to(self.device)
114
+
115
+ self.optimizer = optim.Adam(self.model.parameters(), lr=lr)
116
+ self.scheduler = optim.lr_scheduler.StepLR(self.optimizer, step_size=5, gamma=0.75)
117
+ self.loss = nn.MSELoss()
118
+ self.save_path = None
119
+ self.early_stopping = EarlyStoppingTorch(save_path=self.save_path, patience=3)
120
+
121
+ self.mu = None
122
+ self.sigma = None
123
+ self.eps = 1e-10
124
+
125
+ def fit(self, data):
126
+ tsTrain = data[:int((1-self.validation_size)*len(data))]
127
+ tsValid = data[int((1-self.validation_size)*len(data)):]
128
+
129
+ train_loader = DataLoader(
130
+ ForecastDataset(tsTrain, window_size=self.window_size, pred_len=self.pred_len),
131
+ batch_size=self.batch_size,
132
+ shuffle=True)
133
+
134
+ valid_loader = DataLoader(
135
+ ForecastDataset(tsValid, window_size=self.window_size, pred_len=self.pred_len),
136
+ batch_size=self.batch_size,
137
+ shuffle=False)
138
+
139
+ for epoch in range(1, self.epochs + 1):
140
+ self.model.train(mode=True)
141
+ avg_loss = 0
142
+ loop = tqdm.tqdm(enumerate(train_loader),total=len(train_loader),leave=True)
143
+ for idx, (x, target) in loop:
144
+ x, target = x.to(self.device), target.to(self.device)
145
+
146
+ # print('x: ', x.shape) # (bs, win, feat)
147
+ # print('target: ', target.shape) # # (bs, pred_len, feat)
148
+ # print('len(tsTrain): ', len(tsTrain))
149
+ # print('len(train_loader): ', len(train_loader))
150
+
151
+ self.optimizer.zero_grad()
152
+
153
+ output = self.model(x)
154
+ output = output.view(-1, self.feats*self.pred_len)
155
+ target = target.view(-1, self.feats*self.pred_len)
156
+
157
+ loss = self.loss(output, target)
158
+ loss.backward()
159
+
160
+ self.optimizer.step()
161
+
162
+ avg_loss += loss.cpu().item()
163
+ loop.set_description(f'Training Epoch [{epoch}/{self.epochs}]')
164
+ loop.set_postfix(loss=loss.item(), avg_loss=avg_loss/(idx+1))
165
+
166
+
167
+ self.model.eval()
168
+ scores = []
169
+ avg_loss = 0
170
+ loop = tqdm.tqdm(enumerate(valid_loader),total=len(valid_loader),leave=True)
171
+ with torch.no_grad():
172
+ for idx, (x, target) in loop:
173
+ x, target = x.to(self.device), target.to(self.device)
174
+
175
+ output = self.model(x)
176
+
177
+ output = output.view(-1, self.feats*self.pred_len)
178
+ target = target.view(-1, self.feats*self.pred_len)
179
+
180
+ loss = self.loss(output, target)
181
+ avg_loss += loss.cpu().item()
182
+ loop.set_description(f'Validation Epoch [{epoch}/{self.epochs}]')
183
+ loop.set_postfix(loss=loss.item(), avg_loss=avg_loss/(idx+1))
184
+
185
+ mse = torch.sub(output, target).pow(2)
186
+ scores.append(mse.cpu())
187
+
188
+
189
+ valid_loss = avg_loss/max(len(valid_loader), 1)
190
+ self.scheduler.step()
191
+
192
+ self.early_stopping(valid_loss, self.model)
193
+ if self.early_stopping.early_stop or epoch == self.epochs - 1:
194
+ # fitting Gaussian Distribution
195
+ if len(scores) > 0:
196
+ scores = torch.cat(scores, dim=0)
197
+ self.mu = torch.mean(scores)
198
+ self.sigma = torch.var(scores)
199
+ print(self.mu.size(), self.sigma.size())
200
+ if self.early_stopping.early_stop:
201
+ print(" Early stopping<<<")
202
+ break
203
+
204
+ def decision_function(self, data):
205
+ test_loader = DataLoader(
206
+ ForecastDataset(data, window_size=self.window_size, pred_len=self.pred_len),
207
+ batch_size=self.batch_size,
208
+ shuffle=False
209
+ )
210
+
211
+ self.model.eval()
212
+ scores = []
213
+ y_hats = []
214
+ loop = tqdm.tqdm(enumerate(test_loader),total=len(test_loader),leave=True)
215
+ with torch.no_grad():
216
+ for idx, (x, target) in loop:
217
+ x, target = x.to(self.device), target.to(self.device)
218
+ output = self.model(x)
219
+
220
+ output = output.view(-1, self.feats*self.pred_len)
221
+ target = target.view(-1, self.feats*self.pred_len)
222
+
223
+ mse = torch.sub(output, target).pow(2)
224
+
225
+ y_hats.append(output.cpu())
226
+ scores.append(mse.cpu())
227
+ loop.set_description(f'Testing: ')
228
+
229
+ scores = torch.cat(scores, dim=0)
230
+ # scores = 0.5 * (torch.log(self.sigma + self.eps) + (scores - self.mu)**2 / (self.sigma+self.eps))
231
+
232
+ scores = scores.numpy()
233
+ scores = np.mean(scores, axis=1)
234
+
235
+ y_hats = torch.cat(y_hats, dim=0)
236
+ y_hats = y_hats.numpy()
237
+
238
+ l, w = y_hats.shape
239
+
240
+ # new_scores = np.zeros((l - self.pred_len, w))
241
+ # for i in range(w):
242
+ # new_scores[:, i] = scores[self.pred_len - i:l-i, i]
243
+ # scores = np.mean(new_scores, axis=1)
244
+ # scores = np.pad(scores, (0, self.pred_len - 1), 'constant', constant_values=(0,0))
245
+
246
+ # new_y_hats = np.zeros((l - self.pred_len, w))
247
+ # for i in range(w):
248
+ # new_y_hats[:, i] = y_hats[self.pred_len - i:l-i, i]
249
+ # y_hats = np.mean(new_y_hats, axis=1)
250
+ # y_hats = np.pad(y_hats, (0, self.pred_len - 1), 'constant',constant_values=(0,0))
251
+
252
+ assert scores.ndim == 1
253
+ # self.y_hats = y_hats
254
+
255
+ print('scores: ', scores.shape)
256
+ if scores.shape[0] < len(data):
257
+ padded_decision_scores_ = np.zeros(len(data))
258
+ padded_decision_scores_[: self.window_size+self.pred_len-1] = scores[0]
259
+ padded_decision_scores_[self.window_size+self.pred_len-1 : ] = scores
260
+
261
+ self.__anomaly_score = padded_decision_scores_
262
+ return padded_decision_scores_
263
+
264
+ def anomaly_score(self) -> np.ndarray:
265
+ return self.__anomaly_score
266
+
267
+ def get_y_hat(self) -> np.ndarray:
268
+ return self.y_hats
269
+
270
+ def param_statistic(self, save_file):
271
+ model_stats = torchinfo.summary(self.model, (self.batch_size, self.window_size), verbose=0)
272
+ with open(save_file, 'w') as f:
273
+ f.write(str(model_stats))
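A minimal usage sketch for the CNN forecaster above (not part of the commit), assuming the class is importable as TSB_AD.models.CNN.CNN and that the package's ForecastDataset/get_gpu utilities are available; the hyperparameter names come from the constructor.

import numpy as np
from TSB_AD.models.CNN import CNN  # assumed import path

rng = np.random.default_rng(1)
data = (np.sin(np.linspace(0, 60 * np.pi, 6000))
        + 0.1 * rng.standard_normal(6000)).reshape(-1, 1).astype(np.float32)

clf = CNN(window_size=100, pred_len=1, feats=1, epochs=3, batch_size=128)
clf.fit(data)                          # trains on the first 80%, validates on the rest
scores = clf.decision_function(data)   # per-time-step forecast-error scores, padded to len(data)
print(scores.shape)                    # (6000,)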
models/COF.py ADDED
@@ -0,0 +1,211 @@
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ This function is adapted from [pyod] by [yzhao062]
4
+ Original source: [https://github.com/yzhao062/pyod]
5
+ """
6
+
7
+ from __future__ import division
8
+ from __future__ import print_function
9
+
10
+ import warnings
11
+ from operator import itemgetter
12
+
13
+ import numpy as np
14
+ from scipy.spatial import distance_matrix
15
+ from scipy.spatial import minkowski_distance
16
+ from sklearn.utils import check_array
17
+
18
+ from .base import BaseDetector
19
+ from ..utils.utility import check_parameter
20
+
21
+
22
+ class COF(BaseDetector):
23
+ """Connectivity-Based Outlier Factor (COF). COF uses the ratio of the average
24
+ chaining distance of a data point to the average of the average chaining
25
+ distances of its k nearest neighbors as the outlier score
26
+ for observations.
27
+
28
+ See :cite:`tang2002enhancing` for details.
29
+
30
+ Two versions of COF are supported:
31
+
32
+ - Fast COF: computes the entire pairwise distance matrix at the cost of an
33
+ O(n^2) memory requirement.
34
+ - Memory efficient COF: calculates pairwise distances incrementally.
35
+ Use this implementation when it is not feasible to fit the n-by-n
36
+ distance matrix in memory. This leads to a linear overhead because many
37
+ distances will have to be recalculated.
38
+
39
+ Parameters
40
+ ----------
41
+ contamination : float in (0., 0.5), optional (default=0.1)
42
+ The amount of contamination of the data set, i.e.
43
+ the proportion of outliers in the data set. Used when fitting to
44
+ define the threshold on the decision function.
45
+
46
+ n_neighbors : int, optional (default=20)
47
+ Number of neighbors to use by default for k neighbors queries.
48
+ Note that n_neighbors should be less than the number of samples.
49
+ If n_neighbors is larger than the number of samples provided,
50
+ all samples will be used.
51
+
52
+ method : string, optional (default='fast')
53
+ Valid values for method are:
54
+
55
+ - 'fast' Fast COF, computes the full pairwise distance matrix up front.
56
+ - 'memory' Memory-efficient COF, computes pairwise distances only when
57
+ needed at the cost of computational speed.
58
+
59
+ Attributes
60
+ ----------
61
+ decision_scores_ : numpy array of shape (n_samples,)
62
+ The outlier scores of the training data.
63
+ The higher, the more abnormal. Outliers tend to have higher
64
+ scores. This value is available once the detector is
65
+ fitted.
66
+
67
+ threshold_ : float
68
+ The threshold is based on ``contamination``. It is the
69
+ ``n_samples * contamination`` most abnormal samples in
70
+ ``decision_scores_``. The threshold is calculated for generating
71
+ binary outlier labels.
72
+
73
+ labels_ : int, either 0 or 1
74
+ The binary labels of the training data. 0 stands for inliers
75
+ and 1 for outliers/anomalies. It is generated by applying
76
+ ``threshold_`` on ``decision_scores_``.
77
+
78
+ n_neighbors_: int
79
+ Number of neighbors to use by default for k neighbors queries.
80
+ """
81
+
82
+ def __init__(self, contamination=0.1, n_neighbors=20, method="fast"):
83
+ super(COF, self).__init__(contamination=contamination)
84
+ if isinstance(n_neighbors, int):
85
+ check_parameter(n_neighbors, low=1, param_name='n_neighbors')
86
+ else:
87
+ raise TypeError(
88
+ "n_neighbors should be int. Got %s" % type(n_neighbors))
89
+ self.n_neighbors = n_neighbors
90
+ self.method = method
91
+
92
+ def fit(self, X, y=None):
93
+ """Fit detector. y is ignored in unsupervised methods.
94
+
95
+ Parameters
96
+ ----------
97
+ X : numpy array of shape (n_samples, n_features)
98
+ The input samples.
99
+
100
+ y : Ignored
101
+ Not used, present for API consistency by convention.
102
+
103
+ Returns
104
+ -------
105
+ self : object
106
+ Fitted estimator.
107
+ """
108
+ X = check_array(X)
109
+ self.n_train_ = X.shape[0]
110
+ self.n_neighbors_ = self.n_neighbors
111
+
112
+ if self.n_neighbors_ >= self.n_train_:
113
+ self.n_neighbors_ = self.n_train_ - 1
114
+ warnings.warn(
115
+ "n_neighbors is set to the number of training points "
116
+ "minus 1: {0}".format(self.n_neighbors_))
117
+
118
+ check_parameter(self.n_neighbors_, 1, self.n_train_,
119
+ include_left=True, include_right=True)
120
+
121
+ self._set_n_classes(y)
122
+ self.decision_scores_ = self.decision_function(X)
123
+ self._process_decision_scores()
124
+
125
+ return self
126
+
127
+ def decision_function(self, X):
128
+ """Predict raw anomaly score of X using the fitted detector.
129
+ The anomaly score of an input sample is computed based on different
130
+ detector algorithms. For consistency, outliers are assigned with
131
+ larger anomaly scores.
132
+
133
+ Parameters
134
+ ----------
135
+ X : numpy array of shape (n_samples, n_features)
136
+ The training input samples. Sparse matrices are accepted only
137
+ if they are supported by the base estimator.
138
+
139
+ Returns
140
+ -------
141
+ anomaly_scores : numpy array of shape (n_samples,)
142
+ The anomaly score of the input samples.
143
+ """
144
+ if self.method.lower() == "fast":
145
+ return self._cof_fast(X)
146
+ elif self.method.lower() == "memory":
147
+ return self._cof_memory(X)
148
+ else:
149
+ raise ValueError("method should be set to either \'fast\' or \'memory\'. Got %s" % self.method)
150
+
151
+ def _cof_memory(self, X):
152
+ """
153
+ Connectivity-Based Outlier Factor (COF) Algorithm
154
+ This function is called internally to calculate the
155
+ Connectivity-Based Outlier Factor (COF) as an outlier
156
+ score for observations.
157
+ This function uses a memory efficient implementation at the cost of
158
+ speed.
159
+ :return: numpy array containing COF scores for observations.
160
+ The greater the COF, the greater the outlierness.
161
+ """
162
+ #dist_matrix = np.array(distance_matrix(X, X))
163
+ sbn_path_index = np.zeros((X.shape[0],self.n_neighbors_), dtype=np.int64)
164
+ ac_dist, cof_ = np.zeros((X.shape[0])), np.zeros((X.shape[0]))
165
+ for i in range(X.shape[0]):
166
+ #sbn_path = np.argsort(dist_matrix[i])
167
+ sbn_path = np.argsort(minkowski_distance(X[i,:],X,p=2))
168
+ sbn_path_index[i,:] = sbn_path[1: self.n_neighbors_ + 1]
169
+ cost_desc = np.zeros((self.n_neighbors_))
170
+ for j in range(self.n_neighbors_):
171
+ #cost_desc.append(
172
+ # np.min(dist_matrix[sbn_path[j + 1]][sbn_path][:j + 1]))
173
+ cost_desc[j] = np.min(minkowski_distance(X[sbn_path[j + 1]],X,p=2)[sbn_path][:j + 1])
174
+ acd = np.zeros((self.n_neighbors_))
175
+ for _h, cost_ in enumerate(cost_desc):
176
+ neighbor_add1 = self.n_neighbors_ + 1
177
+ acd[_h] = ((2. * (neighbor_add1 - (_h + 1))) / (neighbor_add1 * self.n_neighbors_)) * cost_
178
+ ac_dist[i] = np.sum(acd)
179
+ for _g in range(X.shape[0]):
180
+ cof_[_g] = (ac_dist[_g] * self.n_neighbors_) / np.sum(ac_dist[sbn_path_index[_g]])
181
+ return np.nan_to_num(cof_)
182
+
183
+ def _cof_fast(self, X):
184
+ """
185
+ Connectivity-Based Outlier Factor (COF) Algorithm
186
+ This function is called internally to calculate the
187
+ Connectivity-Based Outlier Factor (COF) as an outlier
188
+ score for observations.
189
+ This function uses a fast implementation at the cost of memory.
190
+ :return: numpy array containing COF scores for observations.
191
+ The greater the COF, the greater the outlierness.
192
+ """
193
+ dist_matrix = np.array(distance_matrix(X, X))
194
+ sbn_path_index, ac_dist, cof_ = [], [], []
195
+ for i in range(X.shape[0]):
196
+ sbn_path = np.argsort(dist_matrix[i])
197
+ sbn_path_index.append(sbn_path[1: self.n_neighbors_ + 1])
198
+ cost_desc = []
199
+ for j in range(self.n_neighbors_):
200
+ cost_desc.append(
201
+ np.min(dist_matrix[sbn_path[j + 1]][sbn_path][:j + 1]))
202
+ acd = []
203
+ for _h, cost_ in enumerate(cost_desc):
204
+ neighbor_add1 = self.n_neighbors_ + 1
205
+ acd.append(((2. * (neighbor_add1 - (_h + 1))) / (
206
+ neighbor_add1 * self.n_neighbors_)) * cost_)
207
+ ac_dist.append(np.sum(acd))
208
+ for _g in range(X.shape[0]):
209
+ cof_.append((ac_dist[_g] * self.n_neighbors_) /
210
+ np.sum(itemgetter(*sbn_path_index[_g])(ac_dist)))
211
+ return np.nan_to_num(cof_)
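A standalone sketch (not part of the commit) of the average chaining distance weighting used in both _cof_fast and _cof_memory: with k = n_neighbors_, the h-th edge cost on the SBN path is weighted by 2*(k+1-h)/((k+1)*k), so early edges contribute more and the weights sum to one. The edge costs below are hypothetical.

import numpy as np

k = 5                                              # n_neighbors_
cost_desc = np.array([0.2, 0.5, 0.4, 0.9, 0.3])    # hypothetical SBN edge costs
weights = np.array([2.0 * (k + 1 - (h + 1)) / ((k + 1) * k) for h in range(k)])
ac_dist = float(np.sum(weights * cost_desc))       # average chaining distance of the point
print(weights, weights.sum())                      # weights sum to 1.0
print(ac_dist)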
models/COPOD.py ADDED
@@ -0,0 +1,205 @@
1
+ """
2
+ This function is adapted from [pyod] by [yzhao062]
3
+ Original source: [https://github.com/yzhao062/pyod]
4
+ """
5
+
6
+ from __future__ import division
7
+ from __future__ import print_function
8
+ import warnings
9
+
10
+ import numpy as np
11
+
12
+ from joblib import Parallel, delayed
13
+ from scipy.stats import skew as skew_sp
14
+ from sklearn.utils.validation import check_is_fitted
15
+ from sklearn.utils import check_array
16
+
17
+ from .base import BaseDetector
18
+ from ..utils.stat_models import column_ecdf
19
+ from ..utils.utility import _partition_estimators
20
+ from ..utils.utility import zscore
21
+
22
+ def skew(X, axis=0):
23
+ return np.nan_to_num(skew_sp(X, axis=axis))
24
+
25
+ def _parallel_ecdf(n_dims, X):
26
+ """Private method to calculate ecdf in parallel.
27
+ Parameters
28
+ ----------
29
+ n_dims : int
30
+ The number of dimensions of the current input matrix
31
+
32
+ X : numpy array
33
+ The subarray for building the ECDF
34
+
35
+ Returns
36
+ -------
37
+ U_l_mat : numpy array
38
+ ECDF subarray.
39
+
40
+ U_r_mat : numpy array
41
+ ECDF subarray.
42
+ """
43
+ U_l_mat = np.zeros([X.shape[0], n_dims])
44
+ U_r_mat = np.zeros([X.shape[0], n_dims])
45
+
46
+ for i in range(n_dims):
47
+ U_l_mat[:, i: i + 1] = column_ecdf(X[:, i: i + 1])
48
+ U_r_mat[:, i: i + 1] = column_ecdf(X[:, i: i + 1] * -1)
49
+ return U_l_mat, U_r_mat
50
+
51
+ class COPOD(BaseDetector):
52
+ """COPOD class for Copula Based Outlier Detector.
53
+ COPOD is a parameter-free, highly interpretable outlier detection algorithm
54
+ based on empirical copula models.
55
+ See :cite:`li2020copod` for details.
56
+
57
+ Parameters
58
+ ----------
59
+ contamination : float in (0., 0.5), optional (default=0.1)
60
+ The amount of contamination of the data set, i.e.
61
+ the proportion of outliers in the data set. Used when fitting to
62
+ define the threshold on the decision function.
63
+
64
+ n_jobs : optional (default=1)
65
+ The number of jobs to run in parallel for both `fit` and
66
+ `predict`. If -1, then the number of jobs is set to the
67
+ number of cores.
68
+
69
+ Attributes
70
+ ----------
71
+ decision_scores_ : numpy array of shape (n_samples,)
72
+ The outlier scores of the training data.
73
+ The higher, the more abnormal. Outliers tend to have higher
74
+ scores. This value is available once the detector is
75
+ fitted.
76
+ threshold_ : float
77
+ The threshold is based on ``contamination``. It is the
78
+ ``n_samples * contamination`` most abnormal samples in
79
+ ``decision_scores_``. The threshold is calculated for generating
80
+ binary outlier labels.
81
+ labels_ : int, either 0 or 1
82
+ The binary labels of the training data. 0 stands for inliers
83
+ and 1 for outliers/anomalies. It is generated by applying
84
+ ``threshold_`` on ``decision_scores_``.
85
+ """
86
+
87
+ def __init__(self, contamination=0.1, n_jobs=1, normalize=True):
88
+ super(COPOD, self).__init__(contamination=contamination)
89
+
90
+ #TODO: Make it parameterized for n_jobs
91
+ self.n_jobs = n_jobs
92
+ self.normalize = normalize
93
+
94
+ def fit(self, X, y=None):
95
+ """Fit detector. y is ignored in unsupervised methods.
96
+ Parameters
97
+ ----------
98
+ X : numpy array of shape (n_samples, n_features)
99
+ The input samples.
100
+ y : Ignored
101
+ Not used, present for API consistency by convention.
102
+ Returns
103
+ -------
104
+ self : object
105
+ Fitted estimator.
106
+ """
107
+ X = check_array(X)
108
+ if self.normalize: X = zscore(X, axis=1, ddof=1)
109
+
110
+ self._set_n_classes(y)
111
+ self.decision_scores_ = self.decision_function(X)
112
+ self.X_train = X
113
+ self._process_decision_scores()
114
+ return self
115
+
116
+ def decision_function(self, X):
117
+ """Predict raw anomaly score of X using the fitted detector.
118
+ For consistency, outliers are assigned with larger anomaly scores.
119
+ Parameters
120
+ ----------
121
+ X : numpy array of shape (n_samples, n_features)
122
+ The training input samples. Sparse matrices are accepted only
123
+ if they are supported by the base estimator.
124
+ Returns
125
+ -------
126
+ anomaly_scores : numpy array of shape (n_samples,)
127
+ The anomaly score of the input samples.
128
+ """
129
+ # use multi-thread execution
130
+ if self.n_jobs != 1:
131
+ return self._decision_function_parallel(X)
132
+ if hasattr(self, 'X_train'):
133
+ original_size = X.shape[0]
134
+ X = np.concatenate((self.X_train, X), axis=0)
135
+ self.U_l = -1 * np.log(column_ecdf(X))
136
+ self.U_r = -1 * np.log(column_ecdf(-X))
137
+
138
+ skewness = np.sign(skew(X, axis=0))
139
+ self.U_skew = self.U_l * -1 * np.sign(
140
+ skewness - 1) + self.U_r * np.sign(skewness + 1)
141
+ self.O = np.maximum(self.U_skew, np.add(self.U_l, self.U_r) / 2)
142
+ if hasattr(self, 'X_train'):
143
+ decision_scores_ = self.O.sum(axis=1)[-original_size:]
144
+ else:
145
+ decision_scores_ = self.O.sum(axis=1)
146
+ return decision_scores_.ravel()
147
+
148
+ def _decision_function_parallel(self, X):
149
+ """Predict raw anomaly score of X using the fitted detector.
150
+ For consistency, outliers are assigned with larger anomaly scores.
151
+ Parameters
152
+ ----------
153
+ X : numpy array of shape (n_samples, n_features)
154
+ The training input samples. Sparse matrices are accepted only
155
+ if they are supported by the base estimator.
156
+ Returns
157
+ -------
158
+ anomaly_scores : numpy array of shape (n_samples,)
159
+ The anomaly score of the input samples.
160
+ """
161
+ if hasattr(self, 'X_train'):
162
+ original_size = X.shape[0]
163
+ X = np.concatenate((self.X_train, X), axis=0)
164
+
165
+ n_samples, n_features = X.shape[0], X.shape[1]
166
+
167
+ if n_features < 2:
168
+ raise ValueError(
169
+ 'n_jobs should not be used on one dimensional dataset')
170
+
171
+ if n_features <= self.n_jobs:
172
+ self.n_jobs = n_features
173
+ warnings.warn("n_features <= n_jobs; setting them equal instead.")
174
+
175
+ n_jobs, n_dims_list, starts = _partition_estimators(n_features,
176
+ self.n_jobs)
177
+
178
+ all_results = Parallel(n_jobs=n_jobs, max_nbytes=None,
179
+ verbose=True)(
180
+ delayed(_parallel_ecdf)(
181
+ n_dims_list[i],
182
+ X[:, starts[i]:starts[i + 1]],
183
+ )
184
+ for i in range(n_jobs))
185
+
186
+ # recover the results
187
+ self.U_l = np.zeros([n_samples, n_features])
188
+ self.U_r = np.zeros([n_samples, n_features])
189
+
190
+ for i in range(n_jobs):
191
+ self.U_l[:, starts[i]:starts[i + 1]] = all_results[i][0]
192
+ self.U_r[:, starts[i]:starts[i + 1]] = all_results[i][1]
193
+
194
+ self.U_l = -1 * np.log(self.U_l)
195
+ self.U_r = -1 * np.log(self.U_r)
196
+
197
+ skewness = np.sign(skew(X, axis=0))
198
+ self.U_skew = self.U_l * -1 * np.sign(
199
+ skewness - 1) + self.U_r * np.sign(skewness + 1)
200
+ self.O = np.maximum(self.U_skew, np.add(self.U_l, self.U_r) / 2)
201
+ if hasattr(self, 'X_train'):
202
+ decision_scores_ = self.O.sum(axis=1)[-original_size:]
203
+ else:
204
+ decision_scores_ = self.O.sum(axis=1)
205
+ return decision_scores_.ravel()
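A standalone sketch (not part of the commit) of the COPOD scoring idea used in decision_function above: per feature, take -log of the left- and right-tail empirical CDFs, keep the tail indicated by the column skewness, and sum across features. column_ecdf is approximated here with a plain rank/n estimate.

import numpy as np
from scipy.stats import rankdata, skew

X = np.random.default_rng(2).normal(size=(500, 3))
X[0] += 6.0                                   # make the first row an outlier

ecdf = lambda A: np.apply_along_axis(lambda c: rankdata(c) / c.size, 0, A)
U_l = -np.log(ecdf(X))                        # left-tail probabilities
U_r = -np.log(ecdf(-X))                       # right-tail probabilities
sgn = np.sign(skew(X, axis=0))
U_skew = U_l * -1 * np.sign(sgn - 1) + U_r * np.sign(sgn + 1)
scores = np.maximum(U_skew, (U_l + U_r) / 2).sum(axis=1)
print(scores[0], scores[1:].mean())           # the shifted row scores far above average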
models/Chronos.py ADDED
@@ -0,0 +1,94 @@
1
+ """
2
+ This function is adapted from [chronos-forecasting] by [lostella et al.]
3
+ Original source: [https://github.com/amazon-science/chronos-forecasting]
4
+ """
5
+
6
+ from autogluon.timeseries import TimeSeriesPredictor
7
+ from sklearn.preprocessing import MinMaxScaler
8
+ import numpy as np
9
+ import pandas as pd
10
+ import tempfile
11
+
12
+ from .base import BaseDetector
13
+
14
+
15
+ class Chronos(BaseDetector):
16
+ def __init__(self,
17
+ win_size=100,
18
+ model_size = 'base', # [tiny, small, base]
19
+ prediction_length=1,
20
+ input_c=1,
21
+ batch_size=128):
22
+
23
+ self.model_name = 'Chronos'
24
+ self.model_size = model_size
25
+ self.win_size = win_size
26
+ self.prediction_length = prediction_length
27
+ self.input_c = input_c
28
+ self.batch_size = batch_size
29
+ self.score_list = []
30
+
31
+ def fit(self, data):
32
+
33
+ for channel in range(self.input_c):
34
+
35
+ data_channel = data[:, channel].reshape(-1, 1)
36
+ data_win, data_target = self.create_dataset(data_channel, slidingWindow=self.win_size, predict_time_steps=self.prediction_length)
37
+ # print('data_win: ', data_win.shape) # (2330, 100)
38
+ # print('data_target: ', data_target.shape) # (2330, 1)
39
+
40
+ train_data = []
41
+ count = 0
42
+ for id in range(data_win.shape[0]):
43
+ for tt in range(data_win.shape[1]):
44
+ train_data.append([id, count, data_win[id, tt]])
45
+ count += 1
46
+ train_data = pd.DataFrame(train_data, columns=['item_id', 'timestamp', 'target'])
47
+
48
+ with tempfile.TemporaryDirectory() as temp_dir:
49
+
50
+ predictor = TimeSeriesPredictor(prediction_length=self.prediction_length, path=temp_dir).fit(
51
+ train_data,
52
+ hyperparameters={
53
+ "Chronos": {
54
+ "model_path": self.model_size, # base
55
+ "device": "cuda",
56
+ "batch_size": self.batch_size}},
57
+ skip_model_selection=True,
58
+ verbosity=0)
59
+
60
+ predictions = predictor.predict(train_data)['mean'].to_numpy().reshape(-1, self.prediction_length)
61
+ print('predictions: ', predictions.shape)
62
+
63
+ ### using mse as the anomaly score
64
+ scores = (data_target.squeeze() - predictions.squeeze()) ** 2
65
+ self.score_list.append(scores)
66
+
67
+ scores_merge = np.mean(np.array(self.score_list), axis=0)
68
+ # print('scores_merge: ', scores_merge.shape)
69
+
70
+ padded_decision_scores = np.zeros(len(data))
71
+ padded_decision_scores[: self.win_size+self.prediction_length-1] = scores_merge[0]
72
+ padded_decision_scores[self.win_size+self.prediction_length-1 : ]=scores_merge
73
+
74
+ self.decision_scores_ = padded_decision_scores
75
+
76
+
77
+ def decision_function(self, X):
78
+ """
79
+ Not used, present for API consistency by convention.
80
+ """
81
+ pass
82
+
83
+ def create_dataset(self, X, slidingWindow, predict_time_steps=1):
84
+ Xs, ys = [], []
85
+ for i in range(len(X) - slidingWindow - predict_time_steps+1):
86
+
87
+ tmp = X[i : i + slidingWindow + predict_time_steps].ravel()
88
+ # tmp= MinMaxScaler(feature_range=(0,1)).fit_transform(tmp.reshape(-1,1)).ravel()
89
+
90
+ x = tmp[:slidingWindow]
91
+ y = tmp[slidingWindow:]
92
+ Xs.append(x)
93
+ ys.append(y)
94
+ return np.array(Xs), np.array(ys)
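A short, self-contained sketch (not part of the commit) of what create_dataset above produces before the windows are reshaped into the long-format frame for AutoGluon; the toy series is hypothetical.

import numpy as np

def create_dataset(X, slidingWindow, predict_time_steps=1):
    # same sliding-window split as Chronos.create_dataset above
    Xs, ys = [], []
    for i in range(len(X) - slidingWindow - predict_time_steps + 1):
        tmp = X[i: i + slidingWindow + predict_time_steps].ravel()
        Xs.append(tmp[:slidingWindow])
        ys.append(tmp[slidingWindow:])
    return np.array(Xs), np.array(ys)

series = np.arange(10, dtype=float).reshape(-1, 1)
wins, targets = create_dataset(series, slidingWindow=4, predict_time_steps=1)
print(wins.shape, targets.shape)   # (6, 4) (6, 1)
print(wins[0], targets[0])         # [0. 1. 2. 3.] [4.]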
models/DADA.py ADDED
@@ -0,0 +1,141 @@
1
+ import numpy as np
2
+ import torch
3
+ import torch.nn as nn
4
+ import torch.nn.functional as F
5
+ import torch.optim as optim
6
+ from torch.utils.data import DataLoader, TensorDataset
7
+ import math
8
+ import tqdm
9
+ import os
10
+ from transformers import AutoTokenizer
11
+ from typing import Optional, Tuple
12
+
13
+ # Add debugging prints to understand the import issue
14
+ import sys
15
+ # print(f"Python path: {sys.path}")
16
+ # print(f"Current working directory: {os.getcwd()}")
17
+ # print(f"Current file location: {__file__}")
18
+ # print(f"Current file directory: {os.path.dirname(__file__)}")
19
+ #
20
+ # # Check if the utils directory exists
21
+ # utils_path = os.path.join(os.path.basename(os.path.dirname(__file__)), "utils")
22
+ # print(f"Utils path: {utils_path}")
23
+ # print(f"Utils directory exists: {os.path.exists(utils_path)}")
24
+ # print(f"Utils directory contents: {os.listdir(utils_path) if os.path.exists(utils_path) else 'Directory not found'}")
25
+ #
26
+ # # Check if dataset.py exists
27
+ # dataset_path = os.path.join(utils_path, "dataset.py")
28
+ # print(f"Dataset file path: {dataset_path}")
29
+ # print(f"Dataset file exists: {os.path.exists(dataset_path)}")
30
+
31
+ # Try different import approaches
32
+
33
+ os.chdir("/home/lihaoyang/Huawei/TSB-AD/TSB_AD")
34
+
35
+ try:
36
+ from utils.dataset import ReconstructDataset
37
+ print("Relative import successful")
38
+ except ImportError as e:
39
+ print(f"Relative import failed: {e}")
40
+
41
+ # Try absolute import
42
+ try:
43
+ from TSB_AD.utils.dataset import ReconstructDataset
44
+ print("Absolute import successful")
45
+ except ImportError as e2:
46
+ print(f"Absolute import failed: {e2}")
47
+
48
+ # Try adding parent directory to path
49
+ try:
50
+ parent_dir = os.path.dirname(os.path.dirname(__file__))
51
+ if parent_dir not in sys.path:
52
+ sys.path.insert(0, parent_dir)
53
+ from utils.dataset import ReconstructDataset
54
+ print("Import with modified path successful")
55
+ except ImportError as e3:
56
+ print(f"Import with modified path failed: {e3}")
57
+
58
+ from .base import BaseDetector
59
+
60
+ # ...existing code...
61
+
62
+ class DADA(BaseDetector):
63
+ def __init__(self, device, args=None, win_size=64, batch_size=32):
64
+ self.win_size = win_size
65
+ self.batch_size = batch_size
66
+ self.device = torch.device(f'cuda:{device}' if torch.cuda.is_available() else 'cpu')
67
+ self.model = self._build_model().to(self.device)
68
+
69
+ def _build_model(self):
70
+ from transformers import AutoModel, AutoConfig
71
+ import os
72
+
73
+ # Try multiple possible paths
74
+ possible_paths = [
75
+ os.environ.get("DADA_MODEL_PATH"), # Environment variable
76
+ "/home/lihaoyang/Huawei/DADA/DADA/", # Original Linux path
77
+ "./DADA", # Relative path
78
+ "DADA" # Hugging Face model name
79
+ ]
80
+
81
+ for path in possible_paths:
82
+ if path is None:
83
+ continue
84
+ try:
85
+ # Try loading config first
86
+ config = AutoConfig.from_pretrained(path, trust_remote_code=True)
87
+ model = AutoModel.from_pretrained(path, config=config, trust_remote_code=True)
88
+ print(f"Successfully loaded DADA model from: {path}")
89
+ return model
90
+ except Exception as e:
91
+ print(f"Failed to load from {path}: {e}")
92
+ continue
93
+
94
+ raise ValueError("DADA model not found. Please set DADA_MODEL_PATH environment variable or ensure the model is available at one of the expected locations.")
95
+
96
+ # def _acquire_device(self):
97
+ # if True:
98
+ # os.environ["CUDA_VISIBLE_DEVICES"] = str(
99
+ # self.args.gpu) if not self.args.use_multi_gpu else self.args.devices
100
+ # device = torch.device('cuda:{}'.format(self.args.gpu))
101
+ # print('Use GPU: cuda:{}'.format(self.args.gpu))
102
+ # else:
103
+ # device = torch.device('cpu')
104
+ # print('Use CPU')
105
+ # return device
106
+
107
+ def decision_function(self, x: torch.Tensor) -> torch.Tensor:
108
+ pass
109
+
110
+ def fit(self, data: torch.Tensor, labels: Optional[torch.Tensor] = None) -> None:
111
+ pass
112
+
113
+ def zero_shot(self, data):
114
+
115
+ test_loader = DataLoader(
116
+ dataset= ReconstructDataset(data, window_size=self.win_size, stride=self.win_size, normalize=True),
117
+ batch_size=self.batch_size,
118
+ shuffle=False)
119
+
120
+ loop = tqdm.tqdm(enumerate(test_loader),total=len(test_loader),leave=True)
121
+
122
+ test_scores = []
123
+ test_labels = []
124
+ self.model.eval()
125
+ self.model.to(self.device)
126
+
127
+ with torch.no_grad():
128
+ for i, (batch_x, batch_y) in loop:
129
+ batch_x = batch_x.float().to(self.device)
130
+ score = self.model.infer(batch_x, norm=0)
131
+ score = score.detach().cpu().numpy()
132
+ test_scores.append(score)
133
+ test_labels.append(batch_y)
134
+
135
+ test_scores = np.concatenate(test_scores, axis=0).reshape(-1, 1)
136
+ test_labels = np.concatenate(test_labels, axis=0).reshape(-1, 1)
137
+
138
+ print("Test scores shape:", test_scores.shape)
139
+ print("Test labels shape:", test_labels.shape)
140
+
141
+ return test_scores.reshape(-1)
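A minimal usage sketch for the zero-shot wrapper above, assuming the DADA weights are reachable through the DADA_MODEL_PATH environment variable and that this module is importable as TSB_AD.models.DADA; the path and data below are illustrative, not part of the commit.

import os
import numpy as np

os.environ["DADA_MODEL_PATH"] = "/path/to/DADA"   # assumed local checkpoint directory
from TSB_AD.models.DADA import DADA               # assumed package import path

series = np.sin(np.linspace(0, 50, 2000)).reshape(-1, 1).astype(np.float32)
series[700:720] += 3.0                            # inject a synthetic anomaly
detector = DADA(device=0, win_size=64, batch_size=32)
scores = detector.zero_shot(series)               # flattened anomaly scores (see zero_shot above)
print(scores.shape, scores.argmax())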
models/Donut.py ADDED
@@ -0,0 +1,419 @@
1
+ """
2
+ This function is adapted from [donut] by [haowen-xu]
3
+ Original source: [https://github.com/NetManAIOps/donut]
4
+ """
5
+
6
+ from typing import Dict
7
+ import numpy as np
8
+ import torchinfo
9
+ import torch
10
+ from torch import nn, optim
11
+ import tqdm
12
+ import os, math
13
+ import torch.nn.functional as F
14
+ from torch.utils.data import DataLoader
15
+ from typing import Tuple, Sequence, Union, Callable
16
+
17
+ from ..utils.torch_utility import EarlyStoppingTorch, get_gpu
18
+ from ..utils.dataset import ReconstructDataset
19
+
20
+ class DonutModel(nn.Module):
21
+ def __init__(self, input_dim, hidden_dim, latent_dim, mask_prob) -> None:
22
+ super().__init__()
23
+
24
+ """
25
+ Donut VAE (Xu et al., 2018)
26
+
27
+ :param input_dim: Should be window_size * features
28
+ :param hidden_dim: width of the hidden layers in the encoder/decoder MLPs
29
+ :param latent_dim: dimensionality of the latent variable z
30
+ """
31
+
32
+ self.latent_dim = latent_dim
33
+ self.mask_prob = mask_prob
34
+
35
+ encoder = VaeEncoder(input_dim, hidden_dim, latent_dim)
36
+ decoder = VaeEncoder(latent_dim, hidden_dim, input_dim)
37
+
38
+ self.vae = VAE(encoder=encoder, decoder=decoder, logvar_out=False)
39
+
40
+ def forward(self, inputs: torch.Tensor) -> Tuple[torch.Tensor, ...]:
41
+ # x: (B, T, D)
42
+ x = inputs
43
+ B, T, D = x.shape
44
+
45
+ if self.training:
46
+ # Randomly mask some inputs
47
+ mask = torch.empty_like(x)
48
+ mask.bernoulli_(1 - self.mask_prob)
49
+ x = x * mask
50
+ else:
51
+ mask = None
52
+
53
+ # Run the VAE
54
+ x = x.view(x.shape[0], -1)
55
+ mean_z, std_z, mean_x, std_x, sample_z = self.vae(x, return_latent_sample=True)
56
+
57
+ # Reshape the outputs
58
+ mean_x = mean_x.view(B, T, D)
59
+ std_x = std_x.view(B, T, D)
60
+ return mean_z, std_z, mean_x, std_x, sample_z, mask
61
+
62
+ def sample_normal(mu: torch.Tensor, std_or_log_var: torch.Tensor, log_var: bool = False, num_samples: int = 1):
63
+ # ln(σ) = 0.5 * ln(σ^2) -> σ = e^(0.5 * ln(σ^2))
64
+ if log_var:
65
+ sigma = std_or_log_var.mul(0.5).exp_()
66
+ else:
67
+ sigma = std_or_log_var
68
+
69
+ if num_samples == 1:
70
+ eps = torch.randn_like(mu) # also copies device from mu
71
+ else:
72
+ eps = torch.randn((num_samples,) + mu.shape, dtype=mu.dtype, device=mu.device)  # randn, not rand: eps must be standard normal
73
+ mu = mu.unsqueeze(0)
74
+ sigma = sigma.unsqueeze(0)
75
+ # z = μ + σ * ϵ, with ϵ ~ N(0,I)
76
+ return eps.mul(sigma).add_(mu)
77
+
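For reference, sample_normal implements the usual reparameterisation trick; the multi-sample branch only adds a leading sample axis before broadcasting:

    z = \mu + \sigma \odot \epsilon, \qquad \epsilon \sim \mathcal{N}(0, I)
    \;\Rightarrow\; z \sim \mathcal{N}\!\bigl(\mu, \operatorname{diag}(\sigma^{2})\bigr)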
78
+ def normal_standard_normal_kl(mean: torch.Tensor, std_or_log_var: torch.Tensor, log_var: bool = False) -> torch.Tensor:
79
+ if log_var:
80
+ kl_loss = torch.sum(1 + std_or_log_var - mean.pow(2) - std_or_log_var.exp(), dim=-1)
81
+ else:
82
+ kl_loss = torch.sum(1 + torch.log(std_or_log_var.pow(2)) - mean.pow(2) - std_or_log_var.pow(2), dim=-1)
83
+ return -0.5 * kl_loss
84
+
85
+
86
+ def normal_normal_kl(mean_1: torch.Tensor, std_or_log_var_1: torch.Tensor, mean_2: torch.Tensor,
87
+ std_or_log_var_2: torch.Tensor, log_var: bool = False) -> torch.Tensor:
88
+ if log_var:
89
+ return 0.5 * torch.sum(std_or_log_var_2 - std_or_log_var_1 + (torch.exp(std_or_log_var_1)
90
+ + (mean_1 - mean_2)**2) / torch.exp(std_or_log_var_2) - 1, dim=-1)
91
+
92
+ return torch.sum(torch.log(std_or_log_var_2) - torch.log(std_or_log_var_1) \
93
+ + 0.5 * (std_or_log_var_1**2 + (mean_1 - mean_2)**2) / std_or_log_var_2**2 - 0.5, dim=-1)
94
+
95
+
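As a reading aid, the two helpers above implement the standard closed-form KL divergences for diagonal Gaussians (written per latent dimension; the code sums over the last axis):

    \mathrm{KL}\bigl(\mathcal{N}(\mu,\sigma^{2})\,\|\,\mathcal{N}(0,1)\bigr) = -\tfrac{1}{2}\bigl(1 + \log\sigma^{2} - \mu^{2} - \sigma^{2}\bigr)

    \mathrm{KL}\bigl(\mathcal{N}(\mu_{1},\sigma_{1}^{2})\,\|\,\mathcal{N}(\mu_{2},\sigma_{2}^{2})\bigr) = \log\frac{\sigma_{2}}{\sigma_{1}} + \frac{\sigma_{1}^{2} + (\mu_{1}-\mu_{2})^{2}}{2\sigma_{2}^{2}} - \frac{1}{2}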
96
+ class VAELoss(torch.nn.modules.loss._Loss):
97
+ def __init__(self, size_average=None, reduce=None, reduction: str = 'mean', logvar_out: bool = True):
98
+ super(VAELoss, self).__init__(size_average, reduce, reduction)
99
+ self.logvar_out = logvar_out
100
+
101
+ def forward(self, predictions: Tuple[torch.Tensor, ...], targets: Tuple[torch.Tensor, ...], *args, **kwargs) \
102
+ -> torch.Tensor:
103
+ z_mean, z_std_or_log_var, x_dec_mean, x_dec_std = predictions[:4]
104
+ if len(predictions) > 4:
105
+ z_prior_mean, z_prior_std_or_logvar = predictions[4:]
106
+ else:
107
+ z_prior_mean, z_prior_std_or_logvar = None, None
108
+
109
+ y, = targets
110
+
111
+ # The Gaussian NLL loss assumes a multivariate normal with diagonal covariance.
112
+ # Alternatively we could use torch.distributions.Normal(x_dec_mean, x_dec_std).log_prob(y).sum(-1)
113
+ # or torch.distributions.MultivariateNormal(mean, cov).log_prob(y).sum(-1)
114
+ # with cov = torch.eye(feat_dim).repeat([1,bz,1,1])*std.pow(2).unsqueeze(-1).
115
+ # Building a Distribution object just for the log-probability adds needless overhead,
116
+ # so we call the functional API directly; note that F.gaussian_nll_loss requires a sufficiently recent PyTorch (> 1.9).
117
+ nll_gauss = F.gaussian_nll_loss(x_dec_mean, y, x_dec_std.pow(2), reduction='none').sum(-1)
118
+ # For pytorch version < 1.9 use:
119
+ # nll_gauss = -torch.distributions.Normal(x_dec_mean, x_dec_std).log_prob(y).sum(-1)
120
+
121
+ # get KL loss
122
+ if z_prior_mean is None and z_prior_std_or_logvar is None:
123
+ # If a prior is not given, we assume standard normal
124
+ kl_loss = normal_standard_normal_kl(z_mean, z_std_or_log_var, log_var=self.logvar_out)
125
+ else:
126
+ if z_prior_mean is None:
127
+ z_prior_mean = torch.tensor(0, dtype=z_mean.dtype, device=z_mean.device)
128
+ if z_prior_std_or_logvar is None:
129
+ value = 0 if self.logvar_out else 1
130
+ z_prior_std_or_logvar = torch.tensor(value, dtype=z_std_or_log_var.dtype, device=z_std_or_log_var.device)
131
+
132
+ kl_loss = normal_normal_kl(z_mean, z_std_or_log_var, z_prior_mean, z_prior_std_or_logvar,
133
+ log_var=self.logvar_out)
134
+
135
+ # Combine
136
+ final_loss = nll_gauss + kl_loss
137
+
138
+ if self.reduction == 'none':
139
+ return final_loss
140
+ elif self.reduction == 'mean':
141
+ return torch.mean(final_loss)
142
+ elif self.reduction == 'sum':
143
+ return torch.sum(final_loss)
144
+
145
+
146
+ class MaskedVAELoss(VAELoss):
147
+ def __init__(self, size_average=None, reduce=None, reduction: str = 'mean'):
148
+ super(MaskedVAELoss, self).__init__(size_average, reduce, reduction, logvar_out=False)
149
+
150
+ def forward(self, predictions: Tuple[torch.Tensor, ...], targets: Tuple[torch.Tensor, ...], *args, **kwargs) \
151
+ -> torch.Tensor:
152
+ mean_z, std_z, mean_x, std_x, sample_z, mask = predictions
153
+ actual_x, = targets
154
+
155
+ if mask is None:
156
+ mean_z = mean_z.unsqueeze(1)
157
+ std_z = std_z.unsqueeze(1)
158
+ return super(MaskedVAELoss, self).forward((mean_z, std_z, mean_x, std_x), (actual_x,), *args, **kwargs)
159
+
160
+ # If the loss is masked, one of the terms in the kl loss is weighted, so we can't compute it exactly
161
+ # anymore and have to use a MC approximation like for the output likelihood
162
+ nll_output = torch.sum(mask * F.gaussian_nll_loss(mean_x, actual_x, std_x**2, reduction='none'), dim=-1)
163
+
164
+ # This is p(z), i.e., the prior likelihood of Z. The paper assumes p(z) = N(z| 0, I), we drop constants
165
+ beta = torch.mean(mask, dim=(1, 2)).unsqueeze(-1)
166
+ nll_prior = beta * 0.5 * torch.sum(sample_z * sample_z, dim=-1, keepdim=True)
167
+
168
+ nll_approx = torch.sum(F.gaussian_nll_loss(mean_z, sample_z, std_z**2, reduction='none'), dim=-1, keepdim=True)
169
+
170
+ final_loss = nll_output + nll_prior - nll_approx
171
+
172
+ if self.reduction == 'none':
173
+ return final_loss
174
+ elif self.reduction == 'mean':
175
+ return torch.mean(final_loss)
176
+ elif self.reduction == 'sum':
177
+ return torch.sum(final_loss)
178
+
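Spelling out what the masked branch computes per window (up to the additive constants that F.gaussian_nll_loss drops): with mask m over the flattened window, beta the fraction of unmasked entries, and a single latent sample z, the loss is a one-sample Monte-Carlo estimate of the negative modified ELBO,

    \mathcal{L} = -\sum_{w} m_{w}\,\log \mathcal{N}\!\bigl(x_{w}\mid\mu_{x,w},\sigma_{x,w}^{2}\bigr) \;-\; \beta\,\log \mathcal{N}(z\mid 0, I) \;+\; \log \mathcal{N}\!\bigl(z\mid\mu_{z},\sigma_{z}^{2}\bigr), \qquad \beta = \tfrac{1}{|w|}\sum_{w} m_{w}

which mirrors the M-ELBO used for missing/injected points in the Donut paper.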
179
+ class MLP(torch.nn.Module):
180
+ def __init__(self, input_features: int, hidden_layers: Union[int, Sequence[int]], output_features: int,
181
+ activation: Callable = torch.nn.Identity(), activation_after_last_layer: bool = False):
182
+ super(MLP, self).__init__()
183
+
184
+ self.activation = activation
185
+ self.activation_after_last_layer = activation_after_last_layer
186
+
187
+ if isinstance(hidden_layers, int):
188
+ hidden_layers = [hidden_layers]
189
+
190
+ layers = [input_features] + list(hidden_layers) + [output_features]
191
+ self.layers = torch.nn.ModuleList([torch.nn.Linear(inp, out) for inp, out in zip(layers[:-1], layers[1:])])
192
+
193
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
194
+ out = x
195
+ for layer in self.layers[:-1]:
196
+ out = layer(out)
197
+ out = self.activation(out)
198
+
199
+ out = self.layers[-1](out)
200
+ if self.activation_after_last_layer:
201
+ out = self.activation(out)
202
+
203
+ return out
204
+
205
+ class VaeEncoder(nn.Module):
206
+ def __init__(self, input_dim: int, hidden_dim: int, latent_dim: int):
207
+ super(VaeEncoder, self).__init__()
208
+
209
+ self.latent_dim = latent_dim
210
+
211
+ self.mlp = MLP(input_dim, hidden_dim, 2*latent_dim, activation=torch.nn.ReLU(), activation_after_last_layer=False)
212
+ self.softplus = torch.nn.Softplus()
213
+
214
+ def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
215
+ # x: (B, T, D)
216
+ mlp_out = self.mlp(x)
217
+
218
+ mean, std = mlp_out.tensor_split(2, dim=-1)
219
+ std = self.softplus(std)
220
+
221
+ return mean, std
222
+
223
+ class VAE(torch.nn.Module):
224
+ """
225
+ VAE Implementation that supports normal distribution with diagonal cov matrix in the latent space
226
+ and the output
227
+ """
228
+
229
+ def __init__(self, encoder: torch.nn.Module, decoder: torch.nn.Module, logvar_out: bool = True):
230
+ super(VAE, self).__init__()
231
+
232
+ self.encoder = encoder
233
+ self.decoder = decoder
234
+ self.log_var = logvar_out
235
+
236
+ def forward(self, x: torch.Tensor, return_latent_sample: bool = False, num_samples: int = 1,
237
+ force_sample: bool = False) -> Tuple[torch.Tensor, ...]:
238
+ z_mu, z_std_or_log_var = self.encoder(x)
239
+
240
+ if self.training or num_samples > 1 or force_sample:
241
+ z_sample = sample_normal(z_mu, z_std_or_log_var, log_var=self.log_var, num_samples=num_samples)
242
+ else:
243
+ z_sample = z_mu
244
+
245
+ x_dec_mean, x_dec_std = self.decoder(z_sample)
246
+
247
+ if not return_latent_sample:
248
+ return z_mu, z_std_or_log_var, x_dec_mean, x_dec_std
249
+
250
+ return z_mu, z_std_or_log_var, x_dec_mean, x_dec_std, z_sample
251
+
252
+
253
+
254
+ class Donut():
255
+ def __init__(self,
256
+ win_size=120,
257
+ input_c=1,
258
+ batch_size=128, # 32, 128
259
+ grad_clip=10.0,
260
+ num_epochs=50,
261
+ mc_samples=1024,
262
+ hidden_dim=100,
263
+ latent_dim=8,
264
+ inject_ratio=0.01,
265
+ lr=1e-4,
266
+ l2_coff=1e-3,
267
+ patience=3,
268
+ validation_size=0):
269
+ super().__init__()
270
+ self.__anomaly_score = None
271
+
272
+ self.cuda = True
273
+ self.device = get_gpu(self.cuda)
274
+
275
+ self.win_size = win_size
276
+ self.input_c = input_c
277
+ self.batch_size = batch_size
278
+ self.grad_clip = grad_clip
279
+ self.num_epochs = num_epochs
280
+ self.mc_samples = mc_samples
281
+ self.validation_size = validation_size
282
+
283
+ input_dim = self.win_size*self.input_c
284
+
285
+ self.model = DonutModel(input_dim=input_dim, hidden_dim=hidden_dim, latent_dim=latent_dim, mask_prob=inject_ratio).to(self.device)
286
+ self.optimizer = optim.AdamW(self.model.parameters(), lr=lr, weight_decay=l2_coff)
287
+ self.scheduler = optim.lr_scheduler.StepLR(self.optimizer, step_size=10, gamma=0.75)
288
+ self.vaeloss = MaskedVAELoss()
289
+
290
+ self.save_path = None
291
+ self.early_stopping = EarlyStoppingTorch(save_path=self.save_path, patience=patience)
292
+
293
+ def train(self, train_loader, epoch):
294
+ self.model.train(mode=True)
295
+ avg_loss = 0
296
+ loop = tqdm.tqdm(enumerate(train_loader),total=len(train_loader),leave=True)
297
+ for idx, (x, target) in loop:
298
+ x, target = x.to(self.device), target.to(self.device)
299
+ self.optimizer.zero_grad()
300
+
301
+ # print('x: ', x.shape)
302
+
303
+ output = self.model(x)
304
+ loss = self.vaeloss(output, (target,))
305
+ loss.backward()
306
+
307
+ torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.grad_clip)
308
+ self.optimizer.step()
309
+
310
+ avg_loss += loss.cpu().item()
311
+ loop.set_description(f'Training Epoch [{epoch}/{self.num_epochs}]')
312
+ loop.set_postfix(loss=loss.item(), avg_loss=avg_loss/(idx+1))
313
+
314
+ return avg_loss/max(len(train_loader), 1)
315
+
316
+ def valid(self, valid_loader, epoch):
317
+ self.model.eval()
318
+ avg_loss = 0
319
+ loop = tqdm.tqdm(enumerate(valid_loader),total=len(valid_loader),leave=True)
320
+ with torch.no_grad():
321
+ for idx, (x, target) in loop:
322
+ x, target = x.to(self.device), target.to(self.device)
323
+ output = self.model(x)
324
+ loss = self.vaeloss(output, (target,))
325
+ avg_loss += loss.cpu().item()
326
+ loop.set_description(f'Validation Epoch [{epoch}/{self.num_epochs}]')
327
+ loop.set_postfix(loss=loss.item(), avg_loss=avg_loss/(idx+1))
328
+
329
+ return avg_loss/max(len(valid_loader), 1)
330
+
331
+ def fit(self, data):
332
+ tsTrain = data[:int((1-self.validation_size)*len(data))]
333
+ tsValid = data[int((1-self.validation_size)*len(data)):]
334
+
335
+ train_loader = DataLoader(
336
+ dataset=ReconstructDataset(tsTrain, window_size=self.win_size),
337
+ batch_size=self.batch_size,
338
+ shuffle=True
339
+ )
340
+
341
+ valid_loader = DataLoader(
342
+ dataset=ReconstructDataset(tsValid, window_size=self.win_size),
343
+ batch_size=self.batch_size,
344
+ shuffle=False
345
+ )
346
+
347
+ for epoch in range(1, self.num_epochs + 1):
348
+ train_loss = self.train(train_loader, epoch)
349
+ if len(valid_loader) > 0:
350
+ valid_loss = self.valid(valid_loader, epoch)
351
+ self.scheduler.step()
352
+
353
+ if len(valid_loader) > 0:
354
+ self.early_stopping(valid_loss, self.model)
355
+ else:
356
+ self.early_stopping(train_loss, self.model)
357
+ if self.early_stopping.early_stop:
358
+ print(" Early stopping<<<")
359
+ break
360
+
361
+
362
+ def decision_function(self, data):
363
+
364
+ test_loader = DataLoader(
365
+ dataset=ReconstructDataset(data, window_size=self.win_size),
366
+ batch_size=self.batch_size,
367
+ shuffle=False
368
+ )
369
+
370
+ self.model.eval()
371
+ scores = []
372
+ loop = tqdm.tqdm(enumerate(test_loader),total=len(test_loader),leave=True)
373
+ with torch.no_grad():
374
+ for idx, (x, _) in loop:
375
+ x = x.to(self.device)
376
+ x_vae = x.view(x.shape[0], -1)
377
+ B, T, D = x.shape
378
+
379
+ res = self.model.vae(x_vae, return_latent_sample=False, num_samples=self.mc_samples)
380
+ z_mu, z_std, x_dec_mean, x_dec_std = res
381
+
382
+ x_dec_mean = x_dec_mean.view(self.mc_samples, B, T, D)
383
+ x_dec_std = x_dec_std.view(self.mc_samples, B, T, D)
384
+ nll_output = torch.sum(F.gaussian_nll_loss(x_dec_mean[:, :, -1, :], x[:, -1, :].unsqueeze(0),
385
+ x_dec_std[:, :, -1, :]**2, reduction='none'), dim=(0, 2))
386
+ nll_output /= self.mc_samples
387
+
388
+
389
+ scores.append(nll_output.cpu())
390
+ loop.set_description('Testing')
391
+
392
+ scores = torch.cat(scores, dim=0)
393
+ scores = scores.numpy()
394
+
395
+ assert scores.ndim == 1
396
+
397
+ import shutil
398
+ if self.save_path and os.path.exists(self.save_path):
399
+ shutil.rmtree(self.save_path)
400
+
401
+ self.__anomaly_score = scores
402
+
403
+ if self.__anomaly_score.shape[0] < len(data):
404
+ self.__anomaly_score = np.array([self.__anomaly_score[0]]*math.ceil((self.win_size-1)/2) +
405
+ list(self.__anomaly_score) + [self.__anomaly_score[-1]]*((self.win_size-1)//2))
406
+
407
+ return self.__anomaly_score
408
+
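For each window the loop above scores only the last time step, averaging the Gaussian negative log-likelihood over L = mc_samples decoder draws (constants dropped as before); higher values mean a less likely, i.e. more anomalous, observation:

    s(x_{T}) = \frac{1}{L}\sum_{\ell=1}^{L}\sum_{d=1}^{D} -\log \mathcal{N}\!\Bigl(x_{T,d}\;\Big|\;\mu^{(\ell)}_{x,T,d},\,\bigl(\sigma^{(\ell)}_{x,T,d}\bigr)^{2}\Bigr)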
409
+ def anomaly_score(self) -> np.ndarray:
410
+ return self.__anomaly_score
411
+
412
+ def get_y_hat(self) -> np.ndarray:
413
+ raise NotImplementedError("Donut does not produce point predictions (y_hat)")
414
+
415
+ def param_statistic(self, save_file):
416
+ model_stats = torchinfo.summary(self.model, (self.batch_size, self.win_size, self.input_c), verbose=0)
417
+ with open(save_file, 'w') as f:
418
+ f.write(str(model_stats))
419
+
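Closing the file with a minimal end-to-end sketch of the intended call pattern; the import path TSB_AD.models.Donut follows the relative imports above but is an assumption, and the data below is synthetic and illustrative only.

import numpy as np
from TSB_AD.models.Donut import Donut    # assumed package layout

rng = np.random.default_rng(0)
train = (np.sin(np.linspace(0, 100, 5000)) + 0.05 * rng.standard_normal(5000)).reshape(-1, 1).astype(np.float32)
test = np.sin(np.linspace(0, 40, 2000)).reshape(-1, 1).astype(np.float32)
test[900:920] += 2.5                     # inject a synthetic anomaly

clf = Donut(win_size=120, input_c=1, num_epochs=5, validation_size=0.2)
clf.fit(train)
scores = clf.decision_function(test)     # one score per time step, padded to len(test)
print(scores.shape, scores.argmax())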