Spaces: Running
Oliver Le committed · commit d03866e
Parent(s): Initial commit
This view is limited to 50 files because the commit contains too many changes; see the raw diff for the full set of changes.
- .gitattributes +35 -0
- .gitignore +1 -0
- HP_list.py +283 -0
- README.md +13 -0
- app.py +237 -0
- evaluation/.DS_Store +0 -0
- evaluation/__init__.py +1 -0
- evaluation/affiliation/__init__.py +1 -0
- evaluation/affiliation/__pycache__/__init__.cpython-310.pyc +0 -0
- evaluation/affiliation/__pycache__/__init__.cpython-311.pyc +0 -0
- evaluation/affiliation/__pycache__/__init__.cpython-38.pyc +0 -0
- evaluation/affiliation/__pycache__/__init__.cpython-39.pyc +0 -0
- evaluation/affiliation/__pycache__/_affiliation_zone.cpython-310.pyc +0 -0
- evaluation/affiliation/__pycache__/_affiliation_zone.cpython-311.pyc +0 -0
- evaluation/affiliation/__pycache__/_affiliation_zone.cpython-38.pyc +0 -0
- evaluation/affiliation/__pycache__/_affiliation_zone.cpython-39.pyc +0 -0
- evaluation/affiliation/__pycache__/_integral_interval.cpython-310.pyc +0 -0
- evaluation/affiliation/__pycache__/_integral_interval.cpython-311.pyc +0 -0
- evaluation/affiliation/__pycache__/_integral_interval.cpython-38.pyc +0 -0
- evaluation/affiliation/__pycache__/_integral_interval.cpython-39.pyc +0 -0
- evaluation/affiliation/__pycache__/_single_ground_truth_event.cpython-310.pyc +0 -0
- evaluation/affiliation/__pycache__/_single_ground_truth_event.cpython-311.pyc +0 -0
- evaluation/affiliation/__pycache__/_single_ground_truth_event.cpython-38.pyc +0 -0
- evaluation/affiliation/__pycache__/_single_ground_truth_event.cpython-39.pyc +0 -0
- evaluation/affiliation/__pycache__/generics.cpython-310.pyc +0 -0
- evaluation/affiliation/__pycache__/generics.cpython-311.pyc +0 -0
- evaluation/affiliation/__pycache__/generics.cpython-38.pyc +0 -0
- evaluation/affiliation/__pycache__/generics.cpython-39.pyc +0 -0
- evaluation/affiliation/__pycache__/metrics.cpython-310.pyc +0 -0
- evaluation/affiliation/__pycache__/metrics.cpython-311.pyc +0 -0
- evaluation/affiliation/__pycache__/metrics.cpython-38.pyc +0 -0
- evaluation/affiliation/__pycache__/metrics.cpython-39.pyc +0 -0
- evaluation/affiliation/_affiliation_zone.py +86 -0
- evaluation/affiliation/_integral_interval.py +464 -0
- evaluation/affiliation/_single_ground_truth_event.py +68 -0
- evaluation/affiliation/generics.py +135 -0
- evaluation/affiliation/metrics.py +116 -0
- evaluation/basic_metrics.py +0 -0
- evaluation/metrics.py +379 -0
- evaluation/visualize.py +99 -0
- model_wrapper.py +532 -0
- models/.DS_Store +0 -0
- models/AE.py +407 -0
- models/CBLOF.py +332 -0
- models/CNN.py +273 -0
- models/COF.py +211 -0
- models/COPOD.py +205 -0
- models/Chronos.py +94 -0
- models/DADA.py +141 -0
- models/Donut.py +419 -0
.gitattributes
ADDED
@@ -0,0 +1,35 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1 @@
+models/granite_tsfm
HP_list.py
ADDED
@@ -0,0 +1,283 @@
+Multi_algo_HP_dict = {
+    'IForest': {
+        'n_estimators': [25, 50, 100, 150, 200],
+        'max_features': [0.2, 0.4, 0.6, 0.8, 1.0]
+    },
+    'LOF': {
+        'n_neighbors': [10, 20, 30, 40, 50],
+        'metric': ['minkowski', 'manhattan', 'euclidean']
+    },
+    'PCA': {
+        'n_components': [0.25, 0.5, 0.75, None]
+    },
+    'HBOS': {
+        'n_bins': [5, 10, 20, 30, 40],
+        'tol': [0.1, 0.3, 0.5, 0.7]
+    },
+    'OCSVM': {
+        'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
+        'nu': [0.1, 0.3, 0.5, 0.7]
+    },
+    'MCD': {
+        'support_fraction': [0.2, 0.4, 0.6, 0.8, None]
+    },
+    'KNN': {
+        'n_neighbors': [10, 20, 30, 40, 50],
+        'method': ['largest', 'mean', 'median']
+    },
+    'KMeansAD': {
+        'n_clusters': [10, 20, 30, 40],
+        'window_size': [10, 20, 30, 40]
+    },
+    'COPOD': {
+        'HP': [None]
+    },
+    'CBLOF': {
+        'n_clusters': [4, 8, 16, 32],
+        'alpha': [0.6, 0.7, 0.8, 0.9]
+    },
+    'EIF': {
+        'n_trees': [25, 50, 100, 200]
+    },
+    'RobustPCA': {
+        'max_iter': [500, 1000, 1500]
+    },
+    'AutoEncoder': {
+        'hidden_neurons': [[64, 32], [32, 16], [128, 64]]
+    },
+    'CNN': {
+        'window_size': [50, 100, 150],
+        'num_channel': [[32, 32, 40], [16, 32, 64]]
+    },
+    'LSTMAD': {
+        'window_size': [50, 100, 150],
+        'lr': [0.0004, 0.0008]
+    },
+    'TranAD': {
+        'win_size': [5, 10, 50],
+        'lr': [1e-3, 1e-4]
+    },
+    'AnomalyTransformer': {
+        'win_size': [50, 100, 150],
+        'lr': [1e-3, 1e-4, 1e-5]
+    },
+    'OmniAnomaly': {
+        'win_size': [5, 50, 100],
+        'lr': [0.002, 0.0002]
+    },
+    'USAD': {
+        'win_size': [5, 50, 100],
+        'lr': [1e-3, 1e-4, 1e-5]
+    },
+    'Donut': {
+        'win_size': [60, 90, 120],
+        'lr': [1e-3, 1e-4, 1e-5]
+    },
+    'TimesNet': {
+        'win_size': [32, 96, 192],
+        'lr': [1e-3, 1e-4, 1e-5]
+    },
+    'FITS': {
+        'win_size': [100, 200],
+        'lr': [1e-3, 1e-4, 1e-5]
+    },
+    'OFA': {
+        'win_size': [50, 100, 150]
+    },
+    'Time_RCD': {
+        'win_size': 7000
+    },
+    'TSPulse': {
+        'win_size': [64, 128, 256],
+        'batch_size': [32, 64, 128],
+        'aggregation_length': [32, 64, 128],
+        'aggr_function': ['max', 'mean'],
+        'smoothing_length': [4, 8, 16]
+    }
+}
+
+
+Optimal_Multi_algo_HP_dict = {
+    'IForest': {'n_estimators': 25, 'max_features': 0.8},
+    'LOF': {'n_neighbors': 50, 'metric': 'euclidean'},
+    'PCA': {'n_components': 0.25},
+    'HBOS': {'n_bins': 30, 'tol': 0.5},
+    'OCSVM': {'kernel': 'rbf', 'nu': 0.1},
+    'MCD': {'support_fraction': 0.8},
+    'KNN': {'n_neighbors': 50, 'method': 'mean'},
+    'KMeansAD': {'n_clusters': 10, 'window_size': 40},
+    'KShapeAD': {'n_clusters': 20, 'window_size': 40},
+    'COPOD': {'n_jobs': 1},
+    'CBLOF': {'n_clusters': 4, 'alpha': 0.6},
+    'EIF': {'n_trees': 50},
+    'RobustPCA': {'max_iter': 1000},
+    'AutoEncoder': {'hidden_neurons': [128, 64]},
+    'CNN': {'window_size': 50, 'num_channel': [32, 32, 40]},
+    'LSTMAD': {'window_size': 150, 'lr': 0.0008},
+    'TranAD': {'win_size': 10, 'lr': 0.001},
+    'AnomalyTransformer': {'win_size': 50, 'lr': 0.001},
+    'OmniAnomaly': {'win_size': 100, 'lr': 0.002},
+    'USAD': {'win_size': 100, 'lr': 0.001},
+    'Donut': {'win_size': 60, 'lr': 0.001},
+    'TimesNet': {'win_size': 96, 'lr': 0.0001},
+    'FITS': {'win_size': 100, 'lr': 0.001},
+    'OFA': {'win_size': 50},
+    'Time_RCD': {'win_size': 5000, 'batch_size': 1},
+    'DADA': {'win_size': 100, 'batch_size': 64},
+    'TSPulse': {'win_size': 96, 'batch_size': 64, 'aggregation_length': 64, 'aggr_function': 'max', 'smoothing_length': 8}
+}
+
+
+Uni_algo_HP_dict = {
+    'Sub_IForest': {
+        'periodicity': [1, 2, 3],
+        'n_estimators': [25, 50, 100, 150, 200]
+    },
+    'IForest': {
+        'n_estimators': [25, 50, 100, 150, 200]
+    },
+    'Sub_LOF': {
+        'periodicity': [1, 2, 3],
+        'n_neighbors': [10, 20, 30, 40, 50]
+    },
+    'LOF': {
+        'n_neighbors': [10, 20, 30, 40, 50]
+    },
+    'POLY': {
+        'periodicity': [1, 2, 3],
+        'power': [1, 2, 3, 4]
+    },
+    'MatrixProfile': {
+        'periodicity': [1, 2, 3]
+    },
+    'NORMA': {
+        'periodicity': [1, 2, 3],
+        'clustering': ['hierarchical', 'kshape']
+    },
+    'SAND': {
+        'periodicity': [1, 2, 3]
+    },
+    'Series2Graph': {
+        'periodicity': [1, 2, 3]
+    },
+    'Sub_PCA': {
+        'periodicity': [1, 2, 3],
+        'n_components': [0.25, 0.5, 0.75, None]
+    },
+    'Sub_HBOS': {
+        'periodicity': [1, 2, 3],
+        'n_bins': [5, 10, 20, 30, 40]
+    },
+    'Sub_OCSVM': {
+        'periodicity': [1, 2, 3],
+        'kernel': ['linear', 'poly', 'rbf', 'sigmoid']
+    },
+    'Sub_MCD': {
+        'periodicity': [1, 2, 3],
+        'support_fraction': [0.2, 0.4, 0.6, 0.8, None]
+    },
+    'Sub_KNN': {
+        'periodicity': [1, 2, 3],
+        'n_neighbors': [10, 20, 30, 40, 50],
+    },
+    'KMeansAD_U': {
+        'periodicity': [1, 2, 3],
+        'n_clusters': [10, 20, 30, 40],
+    },
+    'KShapeAD': {
+        'periodicity': [1, 2, 3]
+    },
+    'AutoEncoder': {
+        'window_size': [50, 100, 150],
+        'hidden_neurons': [[64, 32], [32, 16], [128, 64]]
+    },
+    'CNN': {
+        'window_size': [50, 100, 150],
+        'num_channel': [[32, 32, 40], [16, 32, 64]]
+    },
+    'LSTMAD': {
+        'window_size': [50, 100, 150],
+        'lr': [0.0004, 0.0008]
+    },
+    'TranAD': {
+        'win_size': [5, 10, 50],
+        'lr': [1e-3, 1e-4]
+    },
+    'AnomalyTransformer': {
+        'win_size': [50, 100, 150],
+        'lr': [1e-3, 1e-4, 1e-5]
+    },
+    'OmniAnomaly': {
+        'win_size': [5, 50, 100],
+        'lr': [0.002, 0.0002]
+    },
+    'USAD': {
+        'win_size': [5, 50, 100],
+        'lr': [1e-3, 1e-4, 1e-5]
+    },
+    'Donut': {
+        'win_size': [60, 90, 120],
+        'lr': [1e-3, 1e-4, 1e-5]
+    },
+    'TimesNet': {
+        'win_size': [32, 96, 192],
+        'lr': [1e-3, 1e-4, 1e-5]
+    },
+    'FITS': {
+        'win_size': [100, 200],
+        'lr': [1e-3, 1e-4, 1e-5]
+    },
+    'OFA': {
+        'win_size': [50, 100, 150]
+    },
+    # 'Time_RCD': {
+    #     'win_size': [1000, 2000, 3000, 4000, 5000, 6000, 8000, 10000],
+    #     'batch_size': [32, 64, 128]
+    # }
+}
+
+Optimal_Uni_algo_HP_dict = {
+    'Sub_IForest': {'periodicity': 1, 'n_estimators': 150},
+    'IForest': {'n_estimators': 200},
+    'Sub_LOF': {'periodicity': 2, 'n_neighbors': 30},
+    'LOF': {'n_neighbors': 50},
+    'POLY': {'periodicity': 1, 'power': 4},
+    'MatrixProfile': {'periodicity': 1},
+    'NORMA': {'periodicity': 1, 'clustering': 'kshape'},
+    'SAND': {'periodicity': 1},
+    'Series2Graph': {'periodicity': 1},
+    'SR': {'periodicity': 1},
+    'Sub_PCA': {'periodicity': 1, 'n_components': None},
+    'Sub_HBOS': {'periodicity': 1, 'n_bins': 10},
+    'Sub_OCSVM': {'periodicity': 2, 'kernel': 'rbf'},
+    'Sub_MCD': {'periodicity': 3, 'support_fraction': None},
+    'Sub_KNN': {'periodicity': 2, 'n_neighbors': 50},
+    'KMeansAD_U': {'periodicity': 2, 'n_clusters': 10},
+    'KShapeAD': {'periodicity': 1},
+    'FFT': {},
+    'Left_STAMPi': {},
+    'AutoEncoder': {'window_size': 100, 'hidden_neurons': [128, 64]},
+    'CNN': {'window_size': 50, 'num_channel': [32, 32, 40]},
+    'LSTMAD': {'window_size': 100, 'lr': 0.0008},
+    'TranAD': {'win_size': 10, 'lr': 0.0001},
+    'AnomalyTransformer': {'win_size': 50, 'lr': 0.001},
+    'OmniAnomaly': {'win_size': 5, 'lr': 0.002},
+    'USAD': {'win_size': 100, 'lr': 0.001},
+    'Donut': {'win_size': 60, 'lr': 0.0001},
+    'TimesNet': {'win_size': 32, 'lr': 0.0001},
+    'FITS': {'win_size': 100, 'lr': 0.0001},
+    'OFA': {'win_size': 50},
+    'Lag_Llama': {'win_size': 96},
+    'Chronos': {'win_size': 100},
+    'TimesFM': {'win_size': 96},
+    'MOMENT_ZS': {'win_size': 64},
+    'MOMENT_FT': {'win_size': 64},
+    'M2N2': {},
+    'DADA': {'win_size': 100},
+    'Time_MOE': {'win_size': 96},
+    'Time_RCD': {'win_size': 5000, 'batch_size': 64},
+    'Time_RCD_Reconstruction': {'win_size': 5000, 'batch_size': 128},
+    'Time_RCD_Reconstruction_Anomaly_Head': {'win_size': 5000, 'batch_size': 128},
+    'Time_RCD_Reconstruction_Random_Mask_Anomaly_Head': {'win_size': 5000, 'batch_size': 128},
+    'TSPulse': {'win_size': 96, 'batch_size': 64, 'aggregation_length': 64, 'aggr_function': 'max', 'smoothing_length': 8}
+}
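HP_list.py only declares these search spaces and tuned defaults; a caller still has to expand each per-algorithm dictionary of lists into concrete configurations. A minimal sketch of that expansion follows (the expand_grid helper is illustrative and not part of this commit; only itertools.product from the standard library is assumed):

from itertools import product

from HP_list import Multi_algo_HP_dict, Optimal_Multi_algo_HP_dict


def expand_grid(hp_lists):
    """Expand {'param': [v1, v2], ...} into a list of concrete {'param': v} configs."""
    keys = list(hp_lists.keys())
    # Scalar entries (e.g. Time_RCD's 'win_size': 7000) are treated as one-element lists.
    values = [v if isinstance(v, list) else [v] for v in hp_lists.values()]
    return [dict(zip(keys, combo)) for combo in product(*values)]


for config in expand_grid(Multi_algo_HP_dict['IForest']):
    print(config)                              # 5 x 5 = 25 candidate settings
print(Optimal_Multi_algo_HP_dict['IForest'])   # the single tuned setting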
README.md
ADDED
@@ -0,0 +1,13 @@
+---
+title: Time RCD
+emoji: 🐠
+colorFrom: purple
+colorTo: blue
+sdk: gradio
+sdk_version: 5.49.1
+app_file: app.py
+pinned: false
+license: mit
+---
+
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py
ADDED
@@ -0,0 +1,237 @@
+import io
+import zipfile
+from pathlib import Path
+from typing import List, Tuple
+
+import gradio as gr
+import matplotlib
+
+matplotlib.use("Agg")
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+from huggingface_hub import HfHubHTTPError, hf_hub_download
+
+from model_wrapper import run_Time_RCD
+
+REPO_ID = "thu-sail-lab/Time-RCD"
+
+CHECKPOINT_FILES = [
+    "checkpoints/full_mask_anomaly_head_pretrain_checkpoint_best.pth",
+    "checkpoints/dataset_10_20.pth",
+    "checkpoints/full_mask_10_20.pth",
+    "checkpoints/dataset_15_56.pth",
+    "checkpoints/full_mask_15_56.pth",
+]
+
+
+def ensure_checkpoints() -> None:
+    """Ensure that the required checkpoint files are present locally."""
+    missing = [path for path in CHECKPOINT_FILES if not Path(path).exists()]
+    if not missing:
+        return
+
+    try:
+        zip_path = hf_hub_download(
+            repo_id=REPO_ID,
+            filename="checkpoints.zip",
+            repo_type="model",
+            cache_dir=".cache/hf",
+        )
+    except HfHubHTTPError:
+        zip_path = hf_hub_download(
+            repo_id=REPO_ID,
+            filename="checkpoints.zip",
+            repo_type="dataset",
+            cache_dir=".cache/hf",
+        )
+
+    with zipfile.ZipFile(zip_path, "r") as zf:
+        zf.extractall(".")
+
+
+def load_timeseries(file_obj, feature_columns: List[str] | None = None) -> Tuple[pd.DataFrame, np.ndarray]:
+    """Load the uploaded file into a numeric dataframe and numpy array."""
+    path = Path(file_obj.name)
+    if path.suffix.lower() == ".npy":
+        data = np.load(path, allow_pickle=False)
+        if data.ndim == 1:
+            data = data.reshape(-1, 1)
+        if not isinstance(data, np.ndarray):
+            raise ValueError("Loaded data is not a numpy array.")
+        df = pd.DataFrame(data)
+        return df, data.astype(np.float32)
+
+    if path.suffix.lower() not in {".csv", ".txt"}:
+        raise ValueError("Unsupported file type. Please upload a .csv, .txt, or .npy file.")
+
+    df = pd.read_csv(path)
+    numeric_df = df.select_dtypes(include=np.number)
+    if numeric_df.empty:
+        raise ValueError("No numeric columns detected. Ensure your file contains numeric values.")
+
+    if feature_columns:
+        missing = [col for col in feature_columns if col not in numeric_df.columns]
+        if missing:
+            raise ValueError(f"Selected columns not found in the file: {', '.join(missing)}")
+        numeric_df = numeric_df[feature_columns]
+
+    array = numeric_df.to_numpy(dtype=np.float32)
+    if array.ndim == 1:
+        array = array.reshape(-1, 1)
+
+    return numeric_df, array
+
+
+def infer(
+    file_obj,
+    is_multivariate: bool,
+    window_size: int,
+    batch_size: int,
+    mask_type: str,
+    multi_size: str,
+    feature_columns: List[str],
+) -> Tuple[str, pd.DataFrame, plt.Figure]:
+    """Run Time-RCD inference and produce outputs for the Gradio UI."""
+    ensure_checkpoints()
+    numeric_df, array = load_timeseries(file_obj, feature_columns or None)
+
+    kwargs = {
+        "Multi": is_multivariate,
+        "win_size": window_size,
+        "batch_size": batch_size,
+        "random_mask": mask_type,
+        "size": multi_size,
+        "device": "cpu",
+    }
+
+    scores, logits = run_Time_RCD(array, **kwargs)
+    score_vector = np.asarray(scores).reshape(-1)
+    logit_vector = np.asarray(logits).reshape(-1)
+
+    valid_length = min(len(score_vector), len(numeric_df))
+    score_series = pd.Series(score_vector[:valid_length], index=numeric_df.index[:valid_length], name="anomaly_score")
+    logit_series = pd.Series(logit_vector[:valid_length], index=numeric_df.index[:valid_length], name="anomaly_logit")
+
+    result_df = numeric_df.iloc[:valid_length, :].copy()
+    result_df["anomaly_score"] = score_series
+    result_df["anomaly_logit"] = logit_series
+
+    top_indices = score_series.nlargest(5).index.tolist()
+    highlight_message = (
+        "Top anomaly indices (by score): " + ", ".join(str(idx) for idx in top_indices)
+        if len(top_indices) > 0
+        else "No anomalies detected."
+    )
+
+    figure = build_plot(result_df)
+
+    return highlight_message, result_df, figure
+
+
+def build_plot(result_df: pd.DataFrame) -> plt.Figure:
+    """Create a matplotlib plot of the first feature vs. anomaly score."""
+    fig, ax_primary = plt.subplots(figsize=(10, 4))
+    index = result_df.index
+    feature_cols = [col for col in result_df.columns if col not in {"anomaly_score", "anomaly_logit"}]
+
+    primary_col = feature_cols[0]
+    ax_primary.plot(index, result_df[primary_col], label=f"{primary_col}", color="#1f77b4", linewidth=1.0)
+    ax_primary.set_xlabel("Index")
+    ax_primary.set_ylabel("Value")
+    ax_primary.grid(alpha=0.2)
+
+    ax_secondary = ax_primary.twinx()
+    ax_secondary.plot(index, result_df["anomaly_score"], label="Anomaly Score", color="#d62728", linewidth=1.0)
+    ax_secondary.set_ylabel("Anomaly Score")
+
+    fig.tight_layout()
+    return fig
+
+
+def build_interface() -> gr.Blocks:
+    """Define the Gradio UI."""
+    with gr.Blocks(title="Time-RCD Zero-Shot Anomaly Detection") as demo:
+        gr.Markdown(
+            "# Time-RCD Zero-Shot Anomaly Detection\n"
+            "Upload a time series to run zero-shot anomaly detection with the pretrained Time-RCD checkpoints. "
+            "You can choose univariate or multivariate mode, adjust the window size, and configure mask settings."
+        )
+
+        with gr.Row():
+            file_input = gr.File(label="Upload time series file (.csv, .txt, .npy)", file_types=[".csv", ".txt", ".npy"])
+            column_selector = gr.Textbox(
+                label="Columns to use (comma-separated, optional)",
+                placeholder="e.g. value,feature_1,feature_2",
+            )
+
+        with gr.Row():
+            multivariate = gr.Radio(
+                choices=["Univariate", "Multivariate"],
+                value="Univariate",
+                label="Data type",
+            )
+            window_size_in = gr.Slider(
+                minimum=128,
+                maximum=8192,
+                value=2048,
+                step=128,
+                label="Window size",
+            )
+            batch_size_in = gr.Slider(
+                minimum=1,
+                maximum=128,
+                value=16,
+                step=1,
+                label="Batch size",
+            )
+
+        with gr.Row():
+            mask_type_in = gr.Radio(
+                choices=["random_mask", "full_mask"],
+                value="random_mask",
+                label="Mask type (multivariate only)",
+            )
+            multi_size_in = gr.Radio(
+                choices=["full", "small"],
+                value="full",
+                label="Multivariate model size",
+            )
+
+        run_button = gr.Button("Run Inference", variant="primary")
+
+        result_message = gr.Textbox(label="Summary", interactive=False)
+        result_dataframe = gr.DataFrame(label="Anomaly Scores", interactive=False)
+        plot_output = gr.Plot(label="Series vs. Anomaly Score")
+
+        def _submit(file_obj, multivariate_choice, win, batch, mask, size, columns_text):
+            if file_obj is None:
+                raise gr.Error("Please upload a time series file.")
+
+            feature_columns = [col.strip() for col in columns_text.split(",") if col.strip()] if columns_text else []
+            is_multi = multivariate_choice == "Multivariate"
+            summary, df, fig = infer(
+                file_obj=file_obj,
+                is_multivariate=is_multi,
+                window_size=int(win),
+                batch_size=int(batch),
+                mask_type=mask,
+                multi_size=size,
+                feature_columns=feature_columns,
+            )
+            return summary, df, fig
+
+        run_button.click(
+            fn=_submit,
+            inputs=[file_input, multivariate, window_size_in, batch_size_in, mask_type_in, multi_size_in, column_selector],
+            outputs=[result_message, result_dataframe, plot_output],
+        )
+
+    return demo
+
+
+demo = build_interface()
+
+if __name__ == "__main__":
+    demo.launch()
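app.py builds the Gradio Blocks UI at import time and calls demo.launch() only when executed directly, so the Space can also be started locally with `python app.py`. Below is a minimal sketch that exercises the same inference path without the UI, mirroring infer(); it assumes the run_Time_RCD keyword arguments used above (Multi, win_size, batch_size, random_mask, size, device), and the sine series is made up purely for illustration:

import numpy as np
import pandas as pd

from app import ensure_checkpoints, build_plot
from model_wrapper import run_Time_RCD

# Toy univariate series with one injected spike (illustrative data only).
values = np.sin(np.linspace(0, 20 * np.pi, 2000)).astype(np.float32)
values[1500] += 5.0

ensure_checkpoints()  # downloads and unpacks checkpoints.zip on first use
scores, logits = run_Time_RCD(
    values.reshape(-1, 1),
    Multi=False, win_size=2048, batch_size=16,
    random_mask="random_mask", size="full", device="cpu",
)

# Align the scores with the input, as infer() does with valid_length.
score_vector = np.asarray(scores).reshape(-1)
n = min(len(values), len(score_vector))
result_df = pd.DataFrame({"value": values[:n]})
result_df["anomaly_score"] = score_vector[:n]
result_df["anomaly_logit"] = np.asarray(logits).reshape(-1)[:n]
build_plot(result_df).savefig("toy_series_scores.png")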
evaluation/.DS_Store
ADDED
Binary file (6.15 kB).

evaluation/__init__.py
ADDED
@@ -0,0 +1 @@
+

evaluation/affiliation/__init__.py
ADDED
@@ -0,0 +1 @@
+

evaluation/affiliation/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (150 Bytes).

evaluation/affiliation/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (180 Bytes).

evaluation/affiliation/__pycache__/__init__.cpython-38.pyc
ADDED
Binary file (162 Bytes).

evaluation/affiliation/__pycache__/__init__.cpython-39.pyc
ADDED
Binary file (162 Bytes).

evaluation/affiliation/__pycache__/_affiliation_zone.cpython-310.pyc
ADDED
Binary file (4.27 kB).

evaluation/affiliation/__pycache__/_affiliation_zone.cpython-311.pyc
ADDED
Binary file (5.92 kB).

evaluation/affiliation/__pycache__/_affiliation_zone.cpython-38.pyc
ADDED
Binary file (4.31 kB).

evaluation/affiliation/__pycache__/_affiliation_zone.cpython-39.pyc
ADDED
Binary file (4.28 kB).

evaluation/affiliation/__pycache__/_integral_interval.cpython-310.pyc
ADDED
Binary file (12.2 kB).

evaluation/affiliation/__pycache__/_integral_interval.cpython-311.pyc
ADDED
Binary file (17.9 kB).

evaluation/affiliation/__pycache__/_integral_interval.cpython-38.pyc
ADDED
Binary file (12.2 kB).

evaluation/affiliation/__pycache__/_integral_interval.cpython-39.pyc
ADDED
Binary file (12.2 kB).

evaluation/affiliation/__pycache__/_single_ground_truth_event.cpython-310.pyc
ADDED
Binary file (3.99 kB).

evaluation/affiliation/__pycache__/_single_ground_truth_event.cpython-311.pyc
ADDED
Binary file (5.88 kB).

evaluation/affiliation/__pycache__/_single_ground_truth_event.cpython-38.pyc
ADDED
Binary file (4.07 kB).

evaluation/affiliation/__pycache__/_single_ground_truth_event.cpython-39.pyc
ADDED
Binary file (4.03 kB).

evaluation/affiliation/__pycache__/generics.cpython-310.pyc
ADDED
Binary file (5.93 kB).

evaluation/affiliation/__pycache__/generics.cpython-311.pyc
ADDED
Binary file (8.67 kB).

evaluation/affiliation/__pycache__/generics.cpython-38.pyc
ADDED
Binary file (6.05 kB).

evaluation/affiliation/__pycache__/generics.cpython-39.pyc
ADDED
Binary file (6.05 kB).

evaluation/affiliation/__pycache__/metrics.cpython-310.pyc
ADDED
Binary file (4.7 kB).

evaluation/affiliation/__pycache__/metrics.cpython-311.pyc
ADDED
Binary file (7.73 kB).

evaluation/affiliation/__pycache__/metrics.cpython-38.pyc
ADDED
Binary file (4.79 kB).

evaluation/affiliation/__pycache__/metrics.cpython-39.pyc
ADDED
Binary file (4.76 kB).
evaluation/affiliation/_affiliation_zone.py
ADDED
@@ -0,0 +1,86 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+from ._integral_interval import interval_intersection
+
+def t_start(j, Js = [(1,2),(3,4),(5,6)], Trange = (1,10)):
+    """
+    Helper for `E_gt_func`
+
+    :param j: index from 0 to len(Js) (included) on which to get the start
+    :param Js: ground truth events, as a list of couples
+    :param Trange: range of the series where Js is included
+    :return: generalized start such that the middle of t_start and t_stop
+    always gives the affiliation zone
+    """
+    b = max(Trange)
+    n = len(Js)
+    if j == n:
+        return(2*b - t_stop(n-1, Js, Trange))
+    else:
+        return(Js[j][0])
+
+def t_stop(j, Js = [(1,2),(3,4),(5,6)], Trange = (1,10)):
+    """
+    Helper for `E_gt_func`
+
+    :param j: index from 0 to len(Js) (included) on which to get the stop
+    :param Js: ground truth events, as a list of couples
+    :param Trange: range of the series where Js is included
+    :return: generalized stop such that the middle of t_start and t_stop
+    always gives the affiliation zone
+    """
+    if j == -1:
+        a = min(Trange)
+        return(2*a - t_start(0, Js, Trange))
+    else:
+        return(Js[j][1])
+
+def E_gt_func(j, Js, Trange):
+    """
+    Get the affiliation zone of element j of the ground truth
+
+    :param j: index from 0 to len(Js) (excluded) on which to get the zone
+    :param Js: ground truth events, as a list of couples
+    :param Trange: range of the series where Js is included, can
+    be (-math.inf, math.inf) for distance measures
+    :return: affiliation zone of element j of the ground truth represented
+    as a couple
+    """
+    range_left = (t_stop(j-1, Js, Trange) + t_start(j, Js, Trange))/2
+    range_right = (t_stop(j, Js, Trange) + t_start(j+1, Js, Trange))/2
+    return((range_left, range_right))
+
+def get_all_E_gt_func(Js, Trange):
+    """
+    Get the affiliation partition from the ground truth point of view
+
+    :param Js: ground truth events, as a list of couples
+    :param Trange: range of the series where Js is included, can
+    be (-math.inf, math.inf) for distance measures
+    :return: affiliation partition of the events
+    """
+    # E_gt is the limit of affiliation/attraction for each ground truth event
+    E_gt = [E_gt_func(j, Js, Trange) for j in range(len(Js))]
+    return(E_gt)
+
+def affiliation_partition(Is = [(1,1.5),(2,5),(5,6),(8,9)], E_gt = [(1,2.5),(2.5,4.5),(4.5,10)]):
+    """
+    Cut the events into the affiliation zones
+    The presentation given here is from the ground truth point of view,
+    but it is also used in the reversed direction in the main function.
+
+    :param Is: events as a list of couples
+    :param E_gt: range of the affiliation zones
+    :return: a list of list of intervals (each interval represented by either
+    a couple or None for empty interval). The outer list is indexed by each
+    affiliation zone of `E_gt`. The inner list is indexed by the events of `Is`.
+    """
+    out = [None] * len(E_gt)
+    for j in range(len(E_gt)):
+        E_gt_j = E_gt[j]
+        discarded_idx_before = [I[1] < E_gt_j[0] for I in Is]  # end point of predicted I is before the begin of E
+        discarded_idx_after = [I[0] > E_gt_j[1] for I in Is]  # start of predicted I is after the end of E
+        kept_index = [not(a or b) for a, b in zip(discarded_idx_before, discarded_idx_after)]
+        Is_j = [x for x, y in zip(Is, kept_index) if y]  # keep only the predictions overlapping this affiliation zone
+        out[j] = [interval_intersection(I, E_gt[j]) for I in Is_j]
+    return(out)
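A small worked example of the two entry points above, get_all_E_gt_func and affiliation_partition; the events and observation range below are made up for illustration. With two ground-truth events on a series observed over (0, 10), each event's affiliation zone extends halfway to its neighbour, and predicted events are clipped to the zones they overlap:

from evaluation.affiliation._affiliation_zone import (
    affiliation_partition,
    get_all_E_gt_func,
)

Js = [(2, 3), (6, 7)]          # ground-truth events
Trange = (0, 10)               # observation range of the series

E_gt = get_all_E_gt_func(Js, Trange)
print(E_gt)                    # [(0.0, 4.5), (4.5, 10.0)]: zones split halfway between events

Is = [(2.5, 5.0), (8.0, 9.0)]  # predicted events
print(affiliation_partition(Is, E_gt))
# [[(2.5, 4.5)], [(4.5, 5.0), (8.0, 9.0)]]: each prediction clipped to the zone it overlaps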
evaluation/affiliation/_integral_interval.py
ADDED
@@ -0,0 +1,464 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+import math
+from .generics import _sum_wo_nan
+"""
+In order to shorten the length of the variables,
+the general convention in this file is to let:
+    - I for a predicted event (start, stop),
+    - Is for a list of predicted events,
+    - J for a ground truth event,
+    - Js for a list of ground truth events.
+"""
+
+def interval_length(J = (1,2)):
+    """
+    Length of an interval
+
+    :param J: couple representating the start and stop of an interval, or None
+    :return: length of the interval, and 0 for a None interval
+    """
+    if J is None:
+        return(0)
+    return(J[1] - J[0])
+
+def sum_interval_lengths(Is = [(1,2),(3,4),(5,6)]):
+    """
+    Sum of length of the intervals
+
+    :param Is: list of intervals represented by starts and stops
+    :return: sum of the interval length
+    """
+    return(sum([interval_length(I) for I in Is]))
+
+def interval_intersection(I = (1, 3), J = (2, 4)):
+    """
+    Intersection between two intervals I and J
+    I and J should be either empty or represent a positive interval (no point)
+
+    :param I: an interval represented by start and stop
+    :param J: a second interval of the same form
+    :return: an interval representing the start and stop of the intersection (or None if empty)
+    """
+    if I is None:
+        return(None)
+    if J is None:
+        return(None)
+
+    I_inter_J = (max(I[0], J[0]), min(I[1], J[1]))
+    if I_inter_J[0] >= I_inter_J[1]:
+        return(None)
+    else:
+        return(I_inter_J)
+
+def interval_subset(I = (1, 3), J = (0, 6)):
+    """
+    Checks whether I is a subset of J
+
+    :param I: an non empty interval represented by start and stop
+    :param J: a second non empty interval of the same form
+    :return: True if I is a subset of J
+    """
+    if (I[0] >= J[0]) and (I[1] <= J[1]):
+        return True
+    else:
+        return False
+
+def cut_into_three_func(I, J):
+    """
+    Cut an interval I into a partition of 3 subsets:
+        the elements before J,
+        the elements belonging to J,
+        and the elements after J
+
+    :param I: an interval represented by start and stop, or None for an empty one
+    :param J: a non empty interval
+    :return: a triplet of three intervals, each represented by either (start, stop) or None
+    """
+    if I is None:
+        return((None, None, None))
+
+    I_inter_J = interval_intersection(I, J)
+    if I == I_inter_J:
+        I_before = None
+        I_after = None
+    elif I[1] <= J[0]:
+        I_before = I
+        I_after = None
+    elif I[0] >= J[1]:
+        I_before = None
+        I_after = I
+    elif (I[0] <= J[0]) and (I[1] >= J[1]):
+        I_before = (I[0], I_inter_J[0])
+        I_after = (I_inter_J[1], I[1])
+    elif I[0] <= J[0]:
+        I_before = (I[0], I_inter_J[0])
+        I_after = None
+    elif I[1] >= J[1]:
+        I_before = None
+        I_after = (I_inter_J[1], I[1])
+    else:
+        raise ValueError('unexpected unconsidered case')
+    return(I_before, I_inter_J, I_after)
+
+def get_pivot_j(I, J):
+    """
+    Get the single point of J that is the closest to I, called 'pivot' here,
+    with the requirement that I should be outside J
+
+    :param I: a non empty interval (start, stop)
+    :param J: another non empty interval, with empty intersection with I
+    :return: the element j of J that is the closest to I
+    """
+    if interval_intersection(I, J) is not None:
+        raise ValueError('I and J should have a void intersection')
+
+    j_pivot = None  # j_pivot is a border of J
+    if max(I) <= min(J):
+        j_pivot = min(J)
+    elif min(I) >= max(J):
+        j_pivot = max(J)
+    else:
+        raise ValueError('I should be outside J')
+    return(j_pivot)
+
+def integral_mini_interval(I, J):
+    """
+    In the specific case where interval I is located outside J,
+    integral of distance from x to J over the interval x \in I.
+    This is the *integral* i.e. the sum.
+    It's not the mean (not divided by the length of I yet)
+
+    :param I: a interval (start, stop), or None
+    :param J: a non empty interval, with empty intersection with I
+    :return: the integral of distances d(x, J) over x \in I
+    """
+    if I is None:
+        return(0)
+
+    j_pivot = get_pivot_j(I, J)
+    a = min(I)
+    b = max(I)
+    return((b-a)*abs((j_pivot - (a+b)/2)))
+
+def integral_interval_distance(I, J):
+    """
+    For any non empty intervals I, J, compute the
+    integral of distance from x to J over the interval x \in I.
+    This is the *integral* i.e. the sum.
+    It's not the mean (not divided by the length of I yet)
+    The interval I can intersect J or not
+
+    :param I: a interval (start, stop), or None
+    :param J: a non empty interval
+    :return: the integral of distances d(x, J) over x \in I
+    """
+    # I and J are single intervals (not generic sets)
+    # I is a predicted interval in the range of affiliation of J
+
+    def f(I_cut):
+        return(integral_mini_interval(I_cut, J))
+    # If I_middle is fully included into J, it is
+    # the distance to J is always 0
+    def f0(I_middle):
+        return(0)
+
+    cut_into_three = cut_into_three_func(I, J)
+    # Distance for now, not the mean:
+    # Distance left: Between cut_into_three[0] and the point min(J)
+    d_left = f(cut_into_three[0])
+    # Distance middle: Between cut_into_three[1] = I inter J, and J
+    d_middle = f0(cut_into_three[1])
+    # Distance right: Between cut_into_three[2] and the point max(J)
+    d_right = f(cut_into_three[2])
+    # It's an integral so summable
+    return(d_left + d_middle + d_right)
+
+def integral_mini_interval_P_CDFmethod__min_piece(I, J, E):
+    """
+    Helper of `integral_mini_interval_Pprecision_CDFmethod`
+    In the specific case where interval I is located outside J,
+    compute the integral $\int_{d_min}^{d_max} \min(m, x) dx$, with:
+    - m the smallest distance from J to E,
+    - d_min the smallest distance d(x, J) from x \in I to J
+    - d_max the largest distance d(x, J) from x \in I to J
+
+    :param I: a single predicted interval, a non empty interval (start, stop)
+    :param J: ground truth interval, a non empty interval, with empty intersection with I
+    :param E: the affiliation/influence zone for J, represented as a couple (start, stop)
+    :return: the integral $\int_{d_min}^{d_max} \min(m, x) dx$
+    """
+    if interval_intersection(I, J) is not None:
+        raise ValueError('I and J should have a void intersection')
+    if not interval_subset(J, E):
+        raise ValueError('J should be included in E')
+    if not interval_subset(I, E):
+        raise ValueError('I should be included in E')
+
+    e_min = min(E)
+    j_min = min(J)
+    j_max = max(J)
+    e_max = max(E)
+    i_min = min(I)
+    i_max = max(I)
+
+    d_min = max(i_min - j_max, j_min - i_max)
+    d_max = max(i_max - j_max, j_min - i_min)
+    m = min(j_min - e_min, e_max - j_max)
+    A = min(d_max, m)**2 - min(d_min, m)**2
+    B = max(d_max, m) - max(d_min, m)
+    C = (1/2)*A + m*B
+    return(C)
+
+def integral_mini_interval_Pprecision_CDFmethod(I, J, E):
+    """
+    Integral of the probability of distances over the interval I.
+    In the specific case where interval I is located outside J,
+    compute the integral $\int_{x \in I} Fbar(dist(x,J)) dx$.
+    This is the *integral* i.e. the sum (not the mean)
+
+    :param I: a single predicted interval, a non empty interval (start, stop)
+    :param J: ground truth interval, a non empty interval, with empty intersection with I
+    :param E: the affiliation/influence zone for J, represented as a couple (start, stop)
+    :return: the integral $\int_{x \in I} Fbar(dist(x,J)) dx$
+    """
+    integral_min_piece = integral_mini_interval_P_CDFmethod__min_piece(I, J, E)
+
+    e_min = min(E)
+    j_min = min(J)
+    j_max = max(J)
+    e_max = max(E)
+    i_min = min(I)
+    i_max = max(I)
+    d_min = max(i_min - j_max, j_min - i_max)
+    d_max = max(i_max - j_max, j_min - i_min)
+    integral_linear_piece = (1/2)*(d_max**2 - d_min**2)
+    integral_remaining_piece = (j_max - j_min)*(i_max - i_min)
+
+    DeltaI = i_max - i_min
+    DeltaE = e_max - e_min
+
+    output = DeltaI - (1/DeltaE)*(integral_min_piece + integral_linear_piece + integral_remaining_piece)
+    return(output)
+
+def integral_interval_probaCDF_precision(I, J, E):
+    """
+    Integral of the probability of distances over the interval I.
+    Compute the integral $\int_{x \in I} Fbar(dist(x,J)) dx$.
+    This is the *integral* i.e. the sum (not the mean)
+
+    :param I: a single (non empty) predicted interval in the zone of affiliation of J
+    :param J: ground truth interval
+    :param E: affiliation/influence zone for J
+    :return: the integral $\int_{x \in I} Fbar(dist(x,J)) dx$
+    """
+    # I and J are single intervals (not generic sets)
+    def f(I_cut):
+        if I_cut is None:
+            return(0)
+        else:
+            return(integral_mini_interval_Pprecision_CDFmethod(I_cut, J, E))
+
+    # If I_middle is fully included into J, it is
+    # integral of 1 on the interval I_middle, so it's |I_middle|
+    def f0(I_middle):
+        if I_middle is None:
+            return(0)
+        else:
+            return(max(I_middle) - min(I_middle))
+
+    cut_into_three = cut_into_three_func(I, J)
+    # Distance for now, not the mean:
+    # Distance left: Between cut_into_three[0] and the point min(J)
+    d_left = f(cut_into_three[0])
+    # Distance middle: Between cut_into_three[1] = I inter J, and J
+    d_middle = f0(cut_into_three[1])
+    # Distance right: Between cut_into_three[2] and the point max(J)
+    d_right = f(cut_into_three[2])
+    # It's an integral so summable
+    return(d_left + d_middle + d_right)
+
+def cut_J_based_on_mean_func(J, e_mean):
+    """
+    Helper function for the recall.
+    Partition J into two intervals: before and after e_mean
+    (e_mean represents the center element of E the zone of affiliation)
+
+    :param J: ground truth interval
+    :param e_mean: a float number (center value of E)
+    :return: a couple partitionning J into (J_before, J_after)
+    """
+    if J is None:
+        J_before = None
+        J_after = None
+    elif e_mean >= max(J):
+        J_before = J
+        J_after = None
+    elif e_mean <= min(J):
+        J_before = None
+        J_after = J
+    else:  # e_mean is across J
+        J_before = (min(J), e_mean)
+        J_after = (e_mean, max(J))
+
+    return((J_before, J_after))
+
+def integral_mini_interval_Precall_CDFmethod(I, J, E):
+    """
+    Integral of the probability of distances over the interval J.
+    In the specific case where interval J is located outside I,
+    compute the integral $\int_{y \in J} Fbar_y(dist(y,I)) dy$.
+    This is the *integral* i.e. the sum (not the mean)
+
+    :param I: a single (non empty) predicted interval
+    :param J: ground truth (non empty) interval, with empty intersection with I
+    :param E: the affiliation/influence zone for J, represented as a couple (start, stop)
+    :return: the integral $\int_{y \in J} Fbar_y(dist(y,I)) dy$
+    """
+    # The interval J should be located outside I
+    # (so it's either the left piece or the right piece w.r.t I)
+    i_pivot = get_pivot_j(J, I)
+    e_min = min(E)
+    e_max = max(E)
+    e_mean = (e_min + e_max) / 2
+
+    # If i_pivot is outside E (it's possible), then
+    # the distance is worst that any random element within E,
+    # so we set the recall to 0
+    if i_pivot <= min(E):
+        return(0)
+    elif i_pivot >= max(E):
+        return(0)
+    # Otherwise, we have at least i_pivot in E and so d < M so min(d,M)=d
+
+    cut_J_based_on_e_mean = cut_J_based_on_mean_func(J, e_mean)
+    J_before = cut_J_based_on_e_mean[0]
+    J_after = cut_J_based_on_e_mean[1]
+
+    iemin_mean = (e_min + i_pivot)/2
+    cut_Jbefore_based_on_iemin_mean = cut_J_based_on_mean_func(J_before, iemin_mean)
+    J_before_closeE = cut_Jbefore_based_on_iemin_mean[0]  # before e_mean and closer to e_min than i_pivot ~ J_before_before
+    J_before_closeI = cut_Jbefore_based_on_iemin_mean[1]  # before e_mean and closer to i_pivot than e_min ~ J_before_after
+
+    iemax_mean = (e_max + i_pivot)/2
+    cut_Jafter_based_on_iemax_mean = cut_J_based_on_mean_func(J_after, iemax_mean)
+    J_after_closeI = cut_Jafter_based_on_iemax_mean[0]  # after e_mean and closer to i_pivot than e_max ~ J_after_before
+    J_after_closeE = cut_Jafter_based_on_iemax_mean[1]  # after e_mean and closer to e_max than i_pivot ~ J_after_after
+
+    if J_before_closeE is not None:
+        j_before_before_min = min(J_before_closeE)  # == min(J)
+        j_before_before_max = max(J_before_closeE)
+    else:
+        j_before_before_min = math.nan
+        j_before_before_max = math.nan
+
+    if J_before_closeI is not None:
+        j_before_after_min = min(J_before_closeI)  # == j_before_before_max if existing
+        j_before_after_max = max(J_before_closeI)  # == max(J_before)
+    else:
+        j_before_after_min = math.nan
+        j_before_after_max = math.nan
+
+    if J_after_closeI is not None:
+        j_after_before_min = min(J_after_closeI)  # == min(J_after)
+        j_after_before_max = max(J_after_closeI)
+    else:
+        j_after_before_min = math.nan
+        j_after_before_max = math.nan
+
+    if J_after_closeE is not None:
+        j_after_after_min = min(J_after_closeE)  # == j_after_before_max if existing
+        j_after_after_max = max(J_after_closeE)  # == max(J)
+    else:
+        j_after_after_min = math.nan
+        j_after_after_max = math.nan
+
+    # <-- J_before_closeE --> <-- J_before_closeI --> <-- J_after_closeI --> <-- J_after_closeE -->
+    # j_bb_min j_bb_max j_ba_min j_ba_max j_ab_min j_ab_max j_aa_min j_aa_max
+    # (with `b` for before and `a` for after in the previous variable names)
+
+    # vs e_mean m = min(t-e_min, e_max-t) d=|i_pivot-t| min(d,m) \int min(d,m)dt \int d dt \int_(min(d,m)+d)dt \int_{t \in J}(min(d,m)+d)dt
+    # Case J_before_closeE & i_pivot after J: before, t-e_min, i_pivot-t, min(i_pivot-t,t-e_min) = t-e_min, t^2/2-e_min*t, i_pivot*t-t^2/2, t^2/2-e_min*t+i_pivot*t-t^2/2 = (i_pivot-e_min)*t, (i_pivot-e_min)*tB - (i_pivot-e_min)*tA = (i_pivot-e_min)*(tB-tA)
+    # Case J_before_closeI & i_pivot after J: before, t-e_min, i_pivot-t, min(i_pivot-t,t-e_min) = i_pivot-t, i_pivot*t-t^2/2, i_pivot*t-t^2/2, i_pivot*t-t^2/2+i_pivot*t-t^2/2 = 2*i_pivot*t-t^2, 2*i_pivot*tB-tB^2 - 2*i_pivot*tA + tA^2 = 2*i_pivot*(tB-tA) - (tB^2 - tA^2)
+    # Case J_after_closeI & i_pivot after J: after, e_max-t, i_pivot-t, min(i_pivot-t,e_max-t) = i_pivot-t, i_pivot*t-t^2/2, i_pivot*t-t^2/2, i_pivot*t-t^2/2+i_pivot*t-t^2/2 = 2*i_pivot*t-t^2, 2*i_pivot*tB-tB^2 - 2*i_pivot*tA + tA^2 = 2*i_pivot*(tB-tA) - (tB^2 - tA^2)
+    # Case J_after_closeE & i_pivot after J: after, e_max-t, i_pivot-t, min(i_pivot-t,e_max-t) = e_max-t, e_max*t-t^2/2, i_pivot*t-t^2/2, e_max*t-t^2/2+i_pivot*t-t^2/2 = (e_max+i_pivot)*t-t^2, (e_max+i_pivot)*tB-tB^2 - (e_max+i_pivot)*tA + tA^2 = (e_max+i_pivot)*(tB-tA) - (tB^2 - tA^2)
+    #
+    # Case J_before_closeE & i_pivot before J: before, t-e_min, t-i_pivot, min(t-i_pivot,t-e_min) = t-e_min, t^2/2-e_min*t, t^2/2-i_pivot*t, t^2/2-e_min*t+t^2/2-i_pivot*t = t^2-(e_min+i_pivot)*t, tB^2-(e_min+i_pivot)*tB - tA^2 + (e_min+i_pivot)*tA = (tB^2 - tA^2) - (e_min+i_pivot)*(tB-tA)
+    # Case J_before_closeI & i_pivot before J: before, t-e_min, t-i_pivot, min(t-i_pivot,t-e_min) = t-i_pivot, t^2/2-i_pivot*t, t^2/2-i_pivot*t, t^2/2-i_pivot*t+t^2/2-i_pivot*t = t^2-2*i_pivot*t, tB^2-2*i_pivot*tB - tA^2 + 2*i_pivot*tA = (tB^2 - tA^2) - 2*i_pivot*(tB-tA)
+    # Case J_after_closeI & i_pivot before J: after, e_max-t, t-i_pivot, min(t-i_pivot,e_max-t) = t-i_pivot, t^2/2-i_pivot*t, t^2/2-i_pivot*t, t^2/2-i_pivot*t+t^2/2-i_pivot*t = t^2-2*i_pivot*t, tB^2-2*i_pivot*tB - tA^2 + 2*i_pivot*tA = (tB^2 - tA^2) - 2*i_pivot*(tB-tA)
+    # Case J_after_closeE & i_pivot before J: after, e_max-t, t-i_pivot, min(t-i_pivot,e_max-t) = e_max-t, e_max*t-t^2/2, t^2/2-i_pivot*t, e_max*t-t^2/2+t^2/2-i_pivot*t = (e_max-i_pivot)*t, (e_max-i_pivot)*tB - (e_max-i_pivot)*tA = (e_max-i_pivot)*(tB-tA)
+
+    if i_pivot >= max(J):
+        part1_before_closeE = (i_pivot-e_min)*(j_before_before_max - j_before_before_min)  # (i_pivot-e_min)*(tB-tA)
+        part2_before_closeI = 2*i_pivot*(j_before_after_max-j_before_after_min) - (j_before_after_max**2 - j_before_after_min**2)  # 2*i_pivot*(tB-tA) - (tB^2 - tA^2)
+        part3_after_closeI = 2*i_pivot*(j_after_before_max-j_after_before_min) - (j_after_before_max**2 - j_after_before_min**2)  # 2*i_pivot*(tB-tA) - (tB^2 - tA^2)
+        part4_after_closeE = (e_max+i_pivot)*(j_after_after_max-j_after_after_min) - (j_after_after_max**2 - j_after_after_min**2)  # (e_max+i_pivot)*(tB-tA) - (tB^2 - tA^2)
+        out_parts = [part1_before_closeE, part2_before_closeI, part3_after_closeI, part4_after_closeE]
+    elif i_pivot <= min(J):
+        part1_before_closeE = (j_before_before_max**2 - j_before_before_min**2) - (e_min+i_pivot)*(j_before_before_max-j_before_before_min)  # (tB^2 - tA^2) - (e_min+i_pivot)*(tB-tA)
+        part2_before_closeI = (j_before_after_max**2 - j_before_after_min**2) - 2*i_pivot*(j_before_after_max-j_before_after_min)  # (tB^2 - tA^2) - 2*i_pivot*(tB-tA)
+        part3_after_closeI = (j_after_before_max**2 - j_after_before_min**2) - 2*i_pivot*(j_after_before_max - j_after_before_min)  # (tB^2 - tA^2) - 2*i_pivot*(tB-tA)
+        part4_after_closeE = (e_max-i_pivot)*(j_after_after_max - j_after_after_min)  # (e_max-i_pivot)*(tB-tA)
+        out_parts = [part1_before_closeE, part2_before_closeI, part3_after_closeI, part4_after_closeE]
+    else:
+        raise ValueError('The i_pivot should be outside J')
+
+    out_integral_min_dm_plus_d = _sum_wo_nan(out_parts)  # integral on all J, i.e. sum of the disjoint parts
+
+    # We have for each point t of J:
+    # \bar{F}_{t, recall}(d) = 1 - (1/|E|) * (min(d,m) + d)
+    # Since t is a single-point here, and we are in the case where i_pivot is inside E.
+    # The integral is then given by:
+    # C = \int_{t \in J} \bar{F}_{t, recall}(D(t)) dt
+    #   = \int_{t \in J} 1 - (1/|E|) * (min(d,m) + d) dt
+    #   = |J| - (1/|E|) * [\int_{t \in J} (min(d,m) + d) dt]
+    #   = |J| - (1/|E|) * out_integral_min_dm_plus_d
+    DeltaJ = max(J) - min(J)
+    DeltaE = max(E) - min(E)
+    C = DeltaJ - (1/DeltaE) * out_integral_min_dm_plus_d
+
+    return(C)
+
+def integral_interval_probaCDF_recall(I, J, E):
+    """
+    Integral of the probability of distances over the interval J.
+    Compute the integral $\int_{y \in J} Fbar_y(dist(y,I)) dy$.
|
| 426 |
+
This is the *integral* i.e. the sum (not the mean)
|
| 427 |
+
|
| 428 |
+
:param I: a single (non empty) predicted interval
|
| 429 |
+
:param J: ground truth (non empty) interval
|
| 430 |
+
:param E: the affiliation/influence zone for J
|
| 431 |
+
:return: the integral $\int_{y \in J} Fbar_y(dist(y,I)) dy$
|
| 432 |
+
"""
|
| 433 |
+
# I and J are single intervals (not generic sets)
|
| 434 |
+
# E is the outside affiliation interval of J (even for recall!)
|
| 435 |
+
# (in particular J \subset E)
|
| 436 |
+
#
|
| 437 |
+
# J is the portion of the ground truth affiliated to I
|
| 438 |
+
# I is a predicted interval (can be outside E possibly since it's recall)
|
| 439 |
+
def f(J_cut):
|
| 440 |
+
if J_cut is None:
|
| 441 |
+
return(0)
|
| 442 |
+
else:
|
| 443 |
+
return integral_mini_interval_Precall_CDFmethod(I, J_cut, E)
|
| 444 |
+
|
| 445 |
+
# If J_middle is fully included into I, it is
|
| 446 |
+
# integral of 1 on the interval J_middle, so it's |J_middle|
|
| 447 |
+
def f0(J_middle):
|
| 448 |
+
if J_middle is None:
|
| 449 |
+
return(0)
|
| 450 |
+
else:
|
| 451 |
+
return(max(J_middle) - min(J_middle))
|
| 452 |
+
|
| 453 |
+
cut_into_three = cut_into_three_func(J, I) # it's J that we cut into 3, depending on the position w.r.t I
|
| 454 |
+
# since we integrate over J this time.
|
| 455 |
+
#
|
| 456 |
+
# Distance for now, not the mean:
|
| 457 |
+
# Distance left: Between cut_into_three[0] and the point min(I)
|
| 458 |
+
d_left = f(cut_into_three[0])
|
| 459 |
+
# Distance middle: Between cut_into_three[1] = J inter I, and I
|
| 460 |
+
d_middle = f0(cut_into_three[1])
|
| 461 |
+
# Distance right: Between cut_into_three[2] and the point max(I)
|
| 462 |
+
d_right = f(cut_into_three[2])
|
| 463 |
+
# It's an integral so summable
|
| 464 |
+
return(d_left + d_middle + d_right)
|
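To make the "cut into three" step above concrete, here is a minimal illustrative sketch of splitting a ground-truth interval J into the parts that lie before, inside, and after a predicted interval I. The helper name `split_into_three` is hypothetical and written only for this note; the module's actual logic lives in `cut_into_three_func`.

def split_into_three(J, I):
    # Return (before, inside, after): the pieces of J located before I,
    # overlapping I, and after I; each piece is None when empty.
    before = (min(J), min(max(J), min(I))) if min(J) < min(I) else None
    lo, hi = max(min(J), min(I)), min(max(J), max(I))
    inside = (lo, hi) if lo < hi else None
    after = (max(max(I), min(J)), max(J)) if max(J) > max(I) else None
    return before, inside, after

# J = (2, 9) cut against I = (4, 6) gives (2, 4), (4, 6) and (6, 9):
print(split_into_three((2, 9), (4, 6)))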
evaluation/affiliation/_single_ground_truth_event.py
ADDED
|
@@ -0,0 +1,68 @@
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
import math
|
| 4 |
+
from ._affiliation_zone import (
|
| 5 |
+
get_all_E_gt_func,
|
| 6 |
+
affiliation_partition)
|
| 7 |
+
from ._integral_interval import (
|
| 8 |
+
integral_interval_distance,
|
| 9 |
+
integral_interval_probaCDF_precision,
|
| 10 |
+
integral_interval_probaCDF_recall,
|
| 11 |
+
interval_length,
|
| 12 |
+
sum_interval_lengths)
|
| 13 |
+
|
| 14 |
+
def affiliation_precision_distance(Is = [(1,2),(3,4),(5,6)], J = (2,5.5)):
|
| 15 |
+
"""
|
| 16 |
+
Compute the individual average distance from Is to a single ground truth J
|
| 17 |
+
|
| 18 |
+
:param Is: list of predicted events within the affiliation zone of J
|
| 19 |
+
:param J: couple representing the start and stop of a ground truth interval
|
| 20 |
+
:return: individual average precision directed distance number
|
| 21 |
+
"""
|
| 22 |
+
if all([I is None for I in Is]): # no prediction in the current area
|
| 23 |
+
return(math.nan) # undefined
|
| 24 |
+
return(sum([integral_interval_distance(I, J) for I in Is]) / sum_interval_lengths(Is))
|
| 25 |
+
|
| 26 |
+
def affiliation_precision_proba(Is = [(1,2),(3,4),(5,6)], J = (2,5.5), E = (0,8)):
|
| 27 |
+
"""
|
| 28 |
+
Compute the individual precision probability from Is to a single ground truth J
|
| 29 |
+
|
| 30 |
+
:param Is: list of predicted events within the affiliation zone of J
|
| 31 |
+
:param J: couple representing the start and stop of a ground truth interval
|
| 32 |
+
:param E: couple representing the start and stop of the zone of affiliation of J
|
| 33 |
+
:return: individual precision probability in [0, 1], or math.nan if undefined
|
| 34 |
+
"""
|
| 35 |
+
if all([I is None for I in Is]): # no prediction in the current area
|
| 36 |
+
return(math.nan) # undefined
|
| 37 |
+
return(sum([integral_interval_probaCDF_precision(I, J, E) for I in Is]) / sum_interval_lengths(Is))
|
| 38 |
+
|
| 39 |
+
def affiliation_recall_distance(Is = [(1,2),(3,4),(5,6)], J = (2,5.5)):
|
| 40 |
+
"""
|
| 41 |
+
Compute the individual average distance from a single J to the predictions Is
|
| 42 |
+
|
| 43 |
+
:param Is: list of predicted events within the affiliation zone of J
|
| 44 |
+
:param J: couple representing the start and stop of a ground truth interval
|
| 45 |
+
:return: individual average recall directed distance number
|
| 46 |
+
"""
|
| 47 |
+
Is = [I for I in Is if I is not None] # filter possible None in Is
|
| 48 |
+
if len(Is) == 0: # there is no prediction in the current area
|
| 49 |
+
return(math.inf)
|
| 50 |
+
E_gt_recall = get_all_E_gt_func(Is, (-math.inf, math.inf)) # here from the point of view of the predictions
|
| 51 |
+
Js = affiliation_partition([J], E_gt_recall) # partition of J depending on its proximity to Is
|
| 52 |
+
return(sum([integral_interval_distance(J[0], I) for I, J in zip(Is, Js)]) / interval_length(J))
|
| 53 |
+
|
| 54 |
+
def affiliation_recall_proba(Is = [(1,2),(3,4),(5,6)], J = (2,5.5), E = (0,8)):
|
| 55 |
+
"""
|
| 56 |
+
Compute the individual recall probability from a single ground truth J to Is
|
| 57 |
+
|
| 58 |
+
:param Is: list of predicted events within the affiliation zone of J
|
| 59 |
+
:param J: couple representing the start and stop of a ground truth interval
|
| 60 |
+
:param E: couple representing the start and stop of the zone of affiliation of J
|
| 61 |
+
:return: individual recall probability in [0, 1]
|
| 62 |
+
"""
|
| 63 |
+
Is = [I for I in Is if I is not None] # filter possible None in Is
|
| 64 |
+
if len(Is) == 0: # there is no prediction in the current area
|
| 65 |
+
return(0)
|
| 66 |
+
E_gt_recall = get_all_E_gt_func(Is, E) # here from the point of view of the predictions
|
| 67 |
+
Js = affiliation_partition([J], E_gt_recall) # partition of J depending on its proximity to Is
|
| 68 |
+
return(sum([integral_interval_probaCDF_recall(I, J[0], E) for I, J in zip(Is, Js)]) / interval_length(J))
|
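A minimal usage sketch of the per-event functions above, reusing their own default argument values; the import path is an assumption based on the repository layout.

from evaluation.affiliation._single_ground_truth_event import (
    affiliation_precision_distance,
    affiliation_recall_proba,
)

Is = [(1, 2), (3, 4), (5, 6)]   # predictions affiliated to one ground-truth event
J = (2, 5.5)                    # the ground-truth event
E = (0, 8)                      # its affiliation zone (J is contained in E)

print(affiliation_precision_distance(Is, J))  # average directed distance from Is to J
print(affiliation_recall_proba(Is, J, E))     # recall probability in [0, 1]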
evaluation/affiliation/generics.py
ADDED
|
@@ -0,0 +1,135 @@
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
from itertools import groupby
|
| 4 |
+
from operator import itemgetter
|
| 5 |
+
import math
|
| 6 |
+
import gzip
|
| 7 |
+
import glob
|
| 8 |
+
import os
|
| 9 |
+
|
| 10 |
+
def convert_vector_to_events(vector = [0, 1, 1, 0, 0, 1, 0]):
|
| 11 |
+
"""
|
| 12 |
+
Convert a binary vector (indicating 1 for the anomalous instances)
|
| 13 |
+
to a list of events. The events are considered as durations,
|
| 14 |
+
i.e. setting 1 at index i corresponds to an anomalous interval [i, i+1).
|
| 15 |
+
|
| 16 |
+
:param vector: a list of elements belonging to {0, 1}
|
| 17 |
+
:return: a list of couples, each couple representing the start and stop of
|
| 18 |
+
each event
|
| 19 |
+
"""
|
| 20 |
+
positive_indexes = [idx for idx, val in enumerate(vector) if val > 0]
|
| 21 |
+
events = []
|
| 22 |
+
for k, g in groupby(enumerate(positive_indexes), lambda ix : ix[0] - ix[1]):
|
| 23 |
+
cur_cut = list(map(itemgetter(1), g))
|
| 24 |
+
events.append((cur_cut[0], cur_cut[-1]))
|
| 25 |
+
|
| 26 |
+
# Consistent conversion in case of range anomalies (for indexes):
|
| 27 |
+
# A positive index i is considered as the interval [i, i+1),
|
| 28 |
+
# so the last index should be moved by 1
|
| 29 |
+
events = [(x, y+1) for (x,y) in events]
|
| 30 |
+
|
| 31 |
+
return(events)
|
| 32 |
+
|
| 33 |
+
def infer_Trange(events_pred, events_gt):
|
| 34 |
+
"""
|
| 35 |
+
Given the list of events events_pred and events_gt, get the
|
| 36 |
+
smallest possible Trange corresponding to the start and stop indexes
|
| 37 |
+
of the whole series.
|
| 38 |
+
Trange will not influence the measure of distances, but will impact the
|
| 39 |
+
measures of probabilities.
|
| 40 |
+
|
| 41 |
+
:param events_pred: a list of couples corresponding to predicted events
|
| 42 |
+
:param events_gt: a list of couples corresponding to ground truth events
|
| 43 |
+
:return: a couple corresponding to the smallest range containing the events
|
| 44 |
+
"""
|
| 45 |
+
if len(events_gt) == 0:
|
| 46 |
+
raise ValueError('The gt events should contain at least one event')
|
| 47 |
+
if len(events_pred) == 0:
|
| 48 |
+
# empty prediction, base Trange only on events_gt (which is non empty)
|
| 49 |
+
return(infer_Trange(events_gt, events_gt))
|
| 50 |
+
|
| 51 |
+
min_pred = min([x[0] for x in events_pred])
|
| 52 |
+
min_gt = min([x[0] for x in events_gt])
|
| 53 |
+
max_pred = max([x[1] for x in events_pred])
|
| 54 |
+
max_gt = max([x[1] for x in events_gt])
|
| 55 |
+
Trange = (min(min_pred, min_gt), max(max_pred, max_gt))
|
| 56 |
+
return(Trange)
|
| 57 |
+
|
| 58 |
+
def has_point_anomalies(events):
|
| 59 |
+
"""
|
| 60 |
+
Checking whether events contain point anomalies, i.e.
|
| 61 |
+
events starting and stopping at the same time.
|
| 62 |
+
|
| 63 |
+
:param events: a list of couples corresponding to predicted events
|
| 64 |
+
:return: True if the events have any point anomalies, False otherwise
|
| 65 |
+
"""
|
| 66 |
+
if len(events) == 0:
|
| 67 |
+
return(False)
|
| 68 |
+
return(min([x[1] - x[0] for x in events]) == 0)
|
| 69 |
+
|
| 70 |
+
def _sum_wo_nan(vec):
|
| 71 |
+
"""
|
| 72 |
+
Sum of elements, ignoring math.isnan ones
|
| 73 |
+
|
| 74 |
+
:param vec: vector of floating numbers
|
| 75 |
+
:return: sum of the elements, ignoring math.isnan ones
|
| 76 |
+
"""
|
| 77 |
+
vec_wo_nan = [e for e in vec if not math.isnan(e)]
|
| 78 |
+
return(sum(vec_wo_nan))
|
| 79 |
+
|
| 80 |
+
def _len_wo_nan(vec):
|
| 81 |
+
"""
|
| 82 |
+
Count of elements, ignoring math.isnan ones
|
| 83 |
+
|
| 84 |
+
:param vec: vector of floating numbers
|
| 85 |
+
:return: count of the elements, ignoring math.isnan ones
|
| 86 |
+
"""
|
| 87 |
+
vec_wo_nan = [e for e in vec if not math.isnan(e)]
|
| 88 |
+
return(len(vec_wo_nan))
|
| 89 |
+
|
| 90 |
+
def read_gz_data(filename = 'data/machinetemp_groundtruth.gz'):
|
| 91 |
+
"""
|
| 92 |
+
Load a file compressed with gz, such that each line of the
|
| 93 |
+
file is either 0 (representing a normal instance) or 1 (representing
|
| 94 |
+
an anomalous instance).
|
| 95 |
+
:param filename: file path to the gz compressed file
|
| 96 |
+
:return: list of integers with either 0 or 1
|
| 97 |
+
"""
|
| 98 |
+
with gzip.open(filename, 'rb') as f:
|
| 99 |
+
content = f.read().splitlines()
|
| 100 |
+
content = [int(x) for x in content]
|
| 101 |
+
return(content)
|
| 102 |
+
|
| 103 |
+
def read_all_as_events():
|
| 104 |
+
"""
|
| 105 |
+
Load the files contained in the folder `data/` and convert
|
| 106 |
+
to events. The length of the series is kept.
|
| 107 |
+
The convention for the file name is: `dataset_algorithm.gz`
|
| 108 |
+
:return: two dictionaries:
|
| 109 |
+
- the first containing the list of events for each dataset and algorithm,
|
| 110 |
+
- the second containing the range of the series for each dataset
|
| 111 |
+
"""
|
| 112 |
+
filepaths = glob.glob('data/*.gz')
|
| 113 |
+
datasets = dict()
|
| 114 |
+
Tranges = dict()
|
| 115 |
+
for filepath in filepaths:
|
| 116 |
+
vector = read_gz_data(filepath)
|
| 117 |
+
events = convert_vector_to_events(vector)
|
| 118 |
+
# ad hoc cut for those files
|
| 119 |
+
cut_filepath = (os.path.split(filepath)[1]).split('_')
|
| 120 |
+
data_name = cut_filepath[0]
|
| 121 |
+
algo_name = (cut_filepath[1]).split('.')[0]
|
| 122 |
+
if data_name not in datasets:
|
| 123 |
+
datasets[data_name] = dict()
|
| 124 |
+
Tranges[data_name] = (0, len(vector))
|
| 125 |
+
datasets[data_name][algo_name] = events
|
| 126 |
+
return(datasets, Tranges)
|
| 127 |
+
|
| 128 |
+
def f1_func(p, r):
|
| 129 |
+
"""
|
| 130 |
+
Compute the f1 function
|
| 131 |
+
:param p: precision numeric value
|
| 132 |
+
:param r: recall numeric value
|
| 133 |
+
:return: f1 numeric value
|
| 134 |
+
"""
|
| 135 |
+
return(2*p*r/(p+r))
|
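A short, self-contained sketch of the conversion and range helpers above (import path assumed from the folder layout):

from evaluation.affiliation.generics import convert_vector_to_events, infer_Trange, f1_func

pred = convert_vector_to_events([0, 1, 1, 0, 0, 1, 0])   # [(1, 3), (5, 6)]
gt = convert_vector_to_events([0, 0, 1, 1, 0, 0, 0])     # [(2, 4)]
print(infer_Trange(pred, gt))                            # smallest covering range: (1, 6)
print(f1_func(0.8, 0.5))                                 # harmonic mean, about 0.615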
evaluation/affiliation/metrics.py
ADDED
|
@@ -0,0 +1,116 @@
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
from .generics import (
|
| 4 |
+
infer_Trange,
|
| 5 |
+
has_point_anomalies,
|
| 6 |
+
_len_wo_nan,
|
| 7 |
+
_sum_wo_nan,
|
| 8 |
+
read_all_as_events)
|
| 9 |
+
from ._affiliation_zone import (
|
| 10 |
+
get_all_E_gt_func,
|
| 11 |
+
affiliation_partition)
|
| 12 |
+
from ._single_ground_truth_event import (
|
| 13 |
+
affiliation_precision_distance,
|
| 14 |
+
affiliation_recall_distance,
|
| 15 |
+
affiliation_precision_proba,
|
| 16 |
+
affiliation_recall_proba)
|
| 17 |
+
|
| 18 |
+
def test_events(events):
|
| 19 |
+
"""
|
| 20 |
+
Verify the validity of the input events
|
| 21 |
+
:param events: list of events, each represented by a couple (start, stop)
|
| 22 |
+
:return: None. Raises an error for incorrectly formed or non-ordered events
|
| 23 |
+
"""
|
| 24 |
+
if type(events) is not list:
|
| 25 |
+
raise TypeError('Input `events` should be a list of couples')
|
| 26 |
+
if not all([type(x) is tuple for x in events]):
|
| 27 |
+
raise TypeError('Input `events` should be a list of tuples')
|
| 28 |
+
if not all([len(x) == 2 for x in events]):
|
| 29 |
+
raise ValueError('Input `events` should be a list of couples (start, stop)')
|
| 30 |
+
if not all([x[0] <= x[1] for x in events]):
|
| 31 |
+
raise ValueError('Input `events` should be a list of couples (start, stop) with start <= stop')
|
| 32 |
+
if not all([events[i][1] < events[i+1][0] for i in range(len(events) - 1)]):
|
| 33 |
+
raise ValueError('Couples of input `events` should be disjoint and ordered')
|
| 34 |
+
|
| 35 |
+
def pr_from_events(events_pred, events_gt, Trange):
|
| 36 |
+
"""
|
| 37 |
+
Compute the affiliation metrics including the precision/recall in [0,1],
|
| 38 |
+
along with the individual precision/recall distances and probabilities
|
| 39 |
+
|
| 40 |
+
:param events_pred: list of predicted events, each represented by a couple
|
| 41 |
+
indicating the start and the stop of the event
|
| 42 |
+
:param events_gt: list of ground truth events, each represented by a couple
|
| 43 |
+
indicating the start and the stop of the event
|
| 44 |
+
:param Trange: range of the series where events_pred and events_gt are included,
|
| 45 |
+
represented as a couple (start, stop)
|
| 46 |
+
:return: dictionary with precision, recall, and the individual metrics
|
| 47 |
+
"""
|
| 48 |
+
# testing the inputs
|
| 49 |
+
test_events(events_pred)
|
| 50 |
+
test_events(events_gt)
|
| 51 |
+
|
| 52 |
+
# other tests
|
| 53 |
+
minimal_Trange = infer_Trange(events_pred, events_gt)
|
| 54 |
+
if not Trange[0] <= minimal_Trange[0]:
|
| 55 |
+
raise ValueError('`Trange` should include all the events')
|
| 56 |
+
if not minimal_Trange[1] <= Trange[1]:
|
| 57 |
+
raise ValueError('`Trange` should include all the events')
|
| 58 |
+
|
| 59 |
+
if len(events_gt) == 0:
|
| 60 |
+
raise ValueError('Input `events_gt` should have at least one event')
|
| 61 |
+
|
| 62 |
+
if has_point_anomalies(events_pred) or has_point_anomalies(events_gt):
|
| 63 |
+
raise ValueError('Cannot manage point anomalies currently')
|
| 64 |
+
|
| 65 |
+
if Trange is None:
|
| 66 |
+
# Set as default, but Trange should be indicated if probabilities are used
|
| 67 |
+
raise ValueError('Trange should be indicated (or inferred with the `infer_Trange` function)')
|
| 68 |
+
|
| 69 |
+
E_gt = get_all_E_gt_func(events_gt, Trange)
|
| 70 |
+
aff_partition = affiliation_partition(events_pred, E_gt)
|
| 71 |
+
|
| 72 |
+
# Computing precision distance
|
| 73 |
+
d_precision = [affiliation_precision_distance(Is, J) for Is, J in zip(aff_partition, events_gt)]
|
| 74 |
+
|
| 75 |
+
# Computing recall distance
|
| 76 |
+
d_recall = [affiliation_recall_distance(Is, J) for Is, J in zip(aff_partition, events_gt)]
|
| 77 |
+
|
| 78 |
+
# Computing precision
|
| 79 |
+
p_precision = [affiliation_precision_proba(Is, J, E) for Is, J, E in zip(aff_partition, events_gt, E_gt)]
|
| 80 |
+
|
| 81 |
+
# Computing recall
|
| 82 |
+
p_recall = [affiliation_recall_proba(Is, J, E) for Is, J, E in zip(aff_partition, events_gt, E_gt)]
|
| 83 |
+
|
| 84 |
+
if _len_wo_nan(p_precision) > 0:
|
| 85 |
+
p_precision_average = _sum_wo_nan(p_precision) / _len_wo_nan(p_precision)
|
| 86 |
+
else:
|
| 87 |
+
p_precision_average = p_precision[0] # math.nan
|
| 88 |
+
p_recall_average = sum(p_recall) / len(p_recall)
|
| 89 |
+
|
| 90 |
+
dict_out = dict({'Affiliation_Precision': p_precision_average,
|
| 91 |
+
'Affiliation_Recall': p_recall_average,
|
| 92 |
+
'individual_precision_probabilities': p_precision,
|
| 93 |
+
'individual_recall_probabilities': p_recall,
|
| 94 |
+
'individual_precision_distances': d_precision,
|
| 95 |
+
'individual_recall_distances': d_recall})
|
| 96 |
+
return(dict_out)
|
| 97 |
+
|
| 98 |
+
def produce_all_results():
|
| 99 |
+
"""
|
| 100 |
+
Produce the affiliation precision/recall for all files
|
| 101 |
+
contained in the `data` repository
|
| 102 |
+
:return: a dictionary indexed by data names, each containing a dictionary
|
| 103 |
+
indexed by algorithm names, each containing the results of the affiliation
|
| 104 |
+
metrics (precision, recall, individual probabilities and distances)
|
| 105 |
+
"""
|
| 106 |
+
datasets, Tranges = read_all_as_events() # read all the events in folder `data`
|
| 107 |
+
results = dict()
|
| 108 |
+
for data_name in datasets.keys():
|
| 109 |
+
results_data = dict()
|
| 110 |
+
for algo_name in datasets[data_name].keys():
|
| 111 |
+
if algo_name != 'groundtruth':
|
| 112 |
+
results_data[algo_name] = pr_from_events(datasets[data_name][algo_name],
|
| 113 |
+
datasets[data_name]['groundtruth'],
|
| 114 |
+
Tranges[data_name])
|
| 115 |
+
results[data_name] = results_data
|
| 116 |
+
return(results)
|
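A minimal end-to-end sketch of `pr_from_events` on toy events (import path assumed; the inputs respect the validation rules above: tuples, ordered, disjoint, no point anomalies, and a `Trange` covering every event):

from evaluation.affiliation.metrics import pr_from_events

events_pred = [(1, 3), (6, 18), (25, 26)]
events_gt = [(5, 20), (23, 27)]
Trange = (0, 30)

out = pr_from_events(events_pred, events_gt, Trange)
print(out['Affiliation_Precision'], out['Affiliation_Recall'])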
evaluation/basic_metrics.py
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
evaluation/metrics.py
ADDED
|
@@ -0,0 +1,379 @@
|
| 1 |
+
import sys
|
| 2 |
+
import time
|
| 3 |
+
from .basic_metrics import basic_metricor, generate_curve
|
| 4 |
+
from statsmodels.tsa.stattools import acf
|
| 5 |
+
from scipy.signal import argrelextrema
|
| 6 |
+
import numpy as np
|
| 7 |
+
import multiprocessing
|
| 8 |
+
|
| 9 |
+
import multiprocessing
|
| 10 |
+
import numpy as np
|
| 11 |
+
import torch
|
| 12 |
+
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
|
| 13 |
+
from functools import partial
|
| 14 |
+
from tqdm import tqdm
|
| 15 |
+
import time
|
| 16 |
+
|
| 17 |
+
# ============== Parallelized Affiliation ==============
|
| 18 |
+
|
| 19 |
+
def _compute_auc_roc(labels, score):
|
| 20 |
+
grader = basic_metricor()
|
| 21 |
+
try:
|
| 22 |
+
return grader.metric_ROC(labels, score)
|
| 23 |
+
except Exception:
|
| 24 |
+
return 0.0
|
| 25 |
+
|
| 26 |
+
def _compute_auc_pr(labels, score):
|
| 27 |
+
grader = basic_metricor()
|
| 28 |
+
try:
|
| 29 |
+
return grader.metric_PR(labels, score)
|
| 30 |
+
except Exception:
|
| 31 |
+
return 0.0
|
| 32 |
+
|
| 33 |
+
def _compute_vus(labels, score, slidingWindow, version):
|
| 34 |
+
try:
|
| 35 |
+
_, _, _, _, _, _, VUS_ROC, VUS_PR = generate_curve(labels.astype(int), score, slidingWindow, version)
|
| 36 |
+
return VUS_ROC, VUS_PR
|
| 37 |
+
except Exception:
|
| 38 |
+
return 0.0, 0.0
|
| 39 |
+
|
| 40 |
+
def _compute_pointf1(labels, score):
|
| 41 |
+
# print("Evaluating F1 standard...")
|
| 42 |
+
grader = basic_metricor()
|
| 43 |
+
try:
|
| 44 |
+
# print("Using chunked parallel F1 computation...")
|
| 45 |
+
return grader.metric_standard_F1_chunked(
|
| 46 |
+
true_labels=labels,
|
| 47 |
+
anomaly_scores=score,
|
| 48 |
+
chunk_size=25, # Process 25 thresholds per chunk
|
| 49 |
+
num_workers=4 # Use 4 parallel workers
|
| 50 |
+
)
|
| 51 |
+
except Exception:
|
| 52 |
+
# print("F1 standard computation failed, returning zeros.")
|
| 53 |
+
return {'F1': 0.0, 'Precision': 0.0, 'Recall': 0.0}
|
| 54 |
+
|
| 55 |
+
def _compute_pointf1pa(labels, score):
|
| 56 |
+
grader = basic_metricor()
|
| 57 |
+
try:
|
| 58 |
+
return grader.metric_PointF1PA_chunked(
|
| 59 |
+
label=labels,
|
| 60 |
+
score=score,
|
| 61 |
+
chunk_size=30, # Process 30 quantiles per chunk
|
| 62 |
+
num_workers=6 # Use 6 parallel workers
|
| 63 |
+
)
|
| 64 |
+
except Exception:
|
| 65 |
+
return {'F1_PA': 0.0, 'P_PA': 0.0, 'R_PA': 0.0}
|
| 66 |
+
|
| 67 |
+
def _compute_affiliation(labels, score):
|
| 68 |
+
grader = basic_metricor()
|
| 69 |
+
try:
|
| 70 |
+
return grader.metric_Affiliation(labels, score)
|
| 71 |
+
except Exception:
|
| 72 |
+
return 0.0, 0.0, 0.0
|
| 73 |
+
|
| 74 |
+
def _compute_t_score(labels, score):
|
| 75 |
+
grader = basic_metricor()
|
| 76 |
+
try:
|
| 77 |
+
return grader.metric_F1_T(labels, score)
|
| 78 |
+
except Exception:
|
| 79 |
+
return {'F1_T': 0.0, 'P_T': 0.0, 'R_T': 0.0}
|
| 80 |
+
|
| 81 |
+
def _compute_f1_t(labels, score):
|
| 82 |
+
grader = basic_metricor()
|
| 83 |
+
try:
|
| 84 |
+
# Use non-parallel path here to avoid pickling issues inside thread workers
|
| 85 |
+
# metric_F1_T(use_parallel=False) runs in-process and returns a dict
|
| 86 |
+
return grader.metric_F1_T(labels, score, use_parallel=False)
|
| 87 |
+
except Exception:
|
| 88 |
+
# Always return a dict to keep downstream code consistent
|
| 89 |
+
return {'F1_T': 0.0, 'P_T': 0.0, 'R_T': 0.0}
|
| 90 |
+
|
| 91 |
+
def _run_task(func, args):
|
| 92 |
+
return func(*args)
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
def get_metrics_optimized(score, labels, slidingWindow=100, pred=None, version='opt', thre=250):
|
| 96 |
+
"""
|
| 97 |
+
Fully optimized metrics computation with proper parallelization
|
| 98 |
+
"""
|
| 99 |
+
metrics = {}
|
| 100 |
+
start_total = time.time()
|
| 101 |
+
|
| 102 |
+
# Ensure proper data types to avoid float/integer issues
|
| 103 |
+
labels = np.asarray(labels, dtype=int)
|
| 104 |
+
score = np.asarray(score, dtype=float)
|
| 105 |
+
|
| 106 |
+
# Determine optimal number of workers based on CPU count and workload
|
| 107 |
+
n_cores = multiprocessing.cpu_count()
|
| 108 |
+
|
| 109 |
+
# For threshold-iterating functions (affiliation and F1_T)
|
| 110 |
+
# Use more workers since they have heavy loops
|
| 111 |
+
heavy_workers = max(1, min(n_cores - 2, 8)) # Leave some cores for the system, but keep at least one worker
|
| 112 |
+
|
| 113 |
+
# For simple metrics
|
| 114 |
+
light_workers = min(n_cores // 2, 8)
|
| 115 |
+
|
| 116 |
+
print(f"Using {heavy_workers} workers for heavy metrics, {light_workers} for light metrics")
|
| 117 |
+
|
| 118 |
+
# Start the heavy computations first (they take longest)
|
| 119 |
+
print("Starting heavy computations (Affiliation and F1_T)...")
|
| 120 |
+
heavy_start = time.time()
|
| 121 |
+
grader = basic_metricor()
|
| 122 |
+
with ProcessPoolExecutor(max_workers=2) as main_executor:
|
| 123 |
+
# Launch the two heaviest computations with their own internal parallelization
|
| 124 |
+
affiliation_future = main_executor.submit(
|
| 125 |
+
grader._compute_affiliation_parallel,
|
| 126 |
+
labels,
|
| 127 |
+
score,
|
| 128 |
+
num_workers=heavy_workers
|
| 129 |
+
)
|
| 130 |
+
|
| 131 |
+
# t_score_future = main_executor.submit(
|
| 132 |
+
# grader.metric_F1_T_fast,
|
| 133 |
+
# labels,
|
| 134 |
+
# score,
|
| 135 |
+
# num_workers=heavy_workers*2
|
| 136 |
+
# )
|
| 137 |
+
#
|
| 138 |
+
# While heavy computations are running, compute light metrics
|
| 139 |
+
print("Computing light metrics in parallel...")
|
| 140 |
+
light_start = time.time()
|
| 141 |
+
|
| 142 |
+
with ThreadPoolExecutor(max_workers=light_workers) as light_executor:
|
| 143 |
+
light_futures = {
|
| 144 |
+
'auc_roc': light_executor.submit(_compute_auc_roc, labels, score),
|
| 145 |
+
'auc_pr': light_executor.submit(_compute_auc_pr, labels, score),
|
| 146 |
+
'vus': light_executor.submit(_compute_vus, labels, score, slidingWindow, version),
|
| 147 |
+
'pointf1': light_executor.submit(_compute_pointf1, labels, score),
|
| 148 |
+
'pointf1pa': light_executor.submit(_compute_pointf1pa, labels, score),
|
| 149 |
+
'f1_t': light_executor.submit(_compute_f1_t, labels, score)
|
| 150 |
+
}
|
| 151 |
+
|
| 152 |
+
# Collect light metric results as they complete
|
| 153 |
+
light_results = {}
|
| 154 |
+
for name, future in light_futures.items():
|
| 155 |
+
try:
|
| 156 |
+
light_results[name] = future.result()
|
| 157 |
+
print(f" ✓ {name} completed")
|
| 158 |
+
except Exception as e:
|
| 159 |
+
print(f" ✗ {name} failed: {e}")
|
| 160 |
+
light_results[name] = None
|
| 161 |
+
|
| 162 |
+
print(f"Light metrics completed in {time.time() - light_start:.2f}s")
|
| 163 |
+
|
| 164 |
+
# Wait for heavy computations to complete
|
| 165 |
+
print("Waiting for heavy computations...")
|
| 166 |
+
|
| 167 |
+
try:
|
| 168 |
+
Affiliation_F, Affiliation_P, Affiliation_R = affiliation_future.result()
|
| 169 |
+
print(f" ✓ Affiliation completed")
|
| 170 |
+
except Exception as e:
|
| 171 |
+
print(f" ✗ Affiliation failed: {e}")
|
| 172 |
+
Affiliation_F, Affiliation_P, Affiliation_R = 0.0, 0.0, 0.0
|
| 173 |
+
|
| 174 |
+
# try:
|
| 175 |
+
# T_score = t_score_future.result()
|
| 176 |
+
# print(f" ✓ F1_T completed")
|
| 177 |
+
# except Exception as e:
|
| 178 |
+
# print(f" ✗ F1_T failed: {e}")
|
| 179 |
+
# T_score = {'F1_T': 0.0, 'P_T': 0.0, 'R_T': 0.0}
|
| 180 |
+
|
| 181 |
+
print(f"Heavy metrics completed in {time.time() - heavy_start:.2f}s")
|
| 182 |
+
|
| 183 |
+
# Unpack light results
|
| 184 |
+
AUC_ROC = light_results.get('auc_roc', 0.0)
|
| 185 |
+
AUC_PR = light_results.get('auc_pr', 0.0)
|
| 186 |
+
VUS_result = light_results.get('vus', (0.0, 0.0))
|
| 187 |
+
if isinstance(VUS_result, tuple):
|
| 188 |
+
VUS_ROC, VUS_PR = VUS_result
|
| 189 |
+
else:
|
| 190 |
+
VUS_ROC, VUS_PR = 0.0, 0.0
|
| 191 |
+
# print("HERE IS POINTF1: ")
|
| 192 |
+
# print(light_results.get('pointf1',))
|
| 193 |
+
# sys.exit()
|
| 194 |
+
PointF1 = light_results.get('pointf1', {'F1': 0.0, 'Precision': 0.0, 'Recall': 0.0})
|
| 195 |
+
PointF1PA = light_results.get('pointf1pa', {'F1_PA': 0.0, 'P_PA': 0.0, 'R_PA': 0.0})
|
| 196 |
+
T_score = light_results.get('f1_t', {'F1_T': 0.0, 'P_T': 0.0, 'R_T': 0.0})
|
| 197 |
+
# Safeguard: if upstream returned a tuple (e.g., from an older fallback), coerce to dict
|
| 198 |
+
if isinstance(T_score, tuple):
|
| 199 |
+
try:
|
| 200 |
+
T_score = {'F1_T': T_score[0], 'P_T': T_score[1], 'R_T': T_score[2]}
|
| 201 |
+
except Exception:
|
| 202 |
+
T_score = {'F1_T': 0.0, 'P_T': 0.0, 'R_T': 0.0}
|
| 203 |
+
|
| 204 |
+
# Build final metrics dictionary
|
| 205 |
+
metrics['AUC-PR'] = AUC_PR
|
| 206 |
+
metrics['AUC-ROC'] = AUC_ROC
|
| 207 |
+
metrics['VUS-PR'] = VUS_PR
|
| 208 |
+
metrics['VUS-ROC'] = VUS_ROC
|
| 209 |
+
|
| 210 |
+
metrics['Standard-F1'] = PointF1.get('F1', 0.0)
|
| 211 |
+
metrics['Standard-Precision'] = PointF1.get('Precision', 0.0)
|
| 212 |
+
metrics['Standard-Recall'] = PointF1.get('Recall', 0.0)
|
| 213 |
+
|
| 214 |
+
metrics['PA-F1'] = PointF1PA.get('F1_PA', 0.0)
|
| 215 |
+
metrics['PA-Precision'] = PointF1PA.get('P_PA', 0.0)
|
| 216 |
+
metrics['PA-Recall'] = PointF1PA.get('R_PA', 0.0)
|
| 217 |
+
|
| 218 |
+
metrics['Affiliation-F'] = Affiliation_F
|
| 219 |
+
metrics['Affiliation-P'] = Affiliation_P
|
| 220 |
+
metrics['Affiliation-R'] = Affiliation_R
|
| 221 |
+
|
| 222 |
+
metrics['F1_T'] = T_score.get('F1_T', 0.0)
|
| 223 |
+
metrics['Precision_T'] = T_score.get('P_T', 0.0)
|
| 224 |
+
metrics['Recall_T'] = T_score.get('R_T', 0.0)
|
| 225 |
+
|
| 226 |
+
print(f"\nTotal computation time: {time.time() - start_total:.2f}s")
|
| 227 |
+
|
| 228 |
+
return metrics
|
| 229 |
+
|
| 230 |
+
|
| 231 |
+
def get_metrics(score, labels, slidingWindow=100, pred=None, version='opt', thre=250):
|
| 232 |
+
metrics = {}
|
| 233 |
+
|
| 234 |
+
# Ensure proper data types to avoid float/integer issues
|
| 235 |
+
labels = np.asarray(labels, dtype=int)
|
| 236 |
+
score = np.asarray(score, dtype=float)
|
| 237 |
+
|
| 238 |
+
'''
|
| 239 |
+
Threshold Independent
|
| 240 |
+
'''
|
| 241 |
+
grader = basic_metricor()
|
| 242 |
+
# AUC_ROC, Precision, Recall, PointF1, PointF1PA, Rrecall, ExistenceReward, OverlapReward, Rprecision, RF, Precision_at_k = grader.metric_new(labels, score, pred, plot_ROC=False)
|
| 243 |
+
try:
|
| 244 |
+
AUC_ROC = grader.metric_ROC(labels, score)
|
| 245 |
+
except Exception:
|
| 246 |
+
AUC_ROC = 0.0
|
| 247 |
+
try:
|
| 248 |
+
AUC_PR = grader.metric_PR(labels, score)
|
| 249 |
+
except Exception:
|
| 250 |
+
AUC_PR = 0.0
|
| 251 |
+
|
| 252 |
+
# R_AUC_ROC, R_AUC_PR, _, _, _ = grader.RangeAUC(labels=labels, score=score, window=slidingWindow, plot_ROC=True)
|
| 253 |
+
try:
|
| 254 |
+
_, _, _, _, _, _, VUS_ROC, VUS_PR = generate_curve(labels.astype(int), score, slidingWindow, version)
|
| 255 |
+
except Exception:
|
| 256 |
+
VUS_ROC, VUS_PR = 0.0, 0.0
|
| 257 |
+
|
| 258 |
+
'''
|
| 259 |
+
Threshold Dependent
|
| 260 |
+
if pred is None --> use the oracle threshold
|
| 261 |
+
'''
|
| 262 |
+
|
| 263 |
+
PointF1 = grader.metric_standard_F1(labels, score,)
|
| 264 |
+
PointF1PA = grader.metric_PointF1PA(labels, score,)
|
| 265 |
+
# EventF1PA = grader.metric_EventF1PA(labels, score,)
|
| 266 |
+
# RF1 = grader.metric_RF1(labels, score,)
|
| 267 |
+
try:
|
| 268 |
+
Affiliation_F, Affiliation_P, Affiliation_R = grader.metric_Affiliation(labels, score)
|
| 269 |
+
except Exception:
|
| 270 |
+
Affiliation_F, Affiliation_P, Affiliation_R = 0.0, 0.0, 0.0
|
| 271 |
+
T_score = grader.metric_F1_T(labels, score)
|
| 272 |
+
|
| 273 |
+
metrics['AUC-PR'] = AUC_PR
|
| 274 |
+
metrics['AUC-ROC'] = AUC_ROC
|
| 275 |
+
metrics['VUS-PR'] = VUS_PR
|
| 276 |
+
metrics['VUS-ROC'] = VUS_ROC
|
| 277 |
+
|
| 278 |
+
metrics['Standard-F1'] = PointF1['F1']
|
| 279 |
+
metrics['Standard-Precision'] = PointF1['Precision']
|
| 280 |
+
metrics['Standard-Recall'] = PointF1['Recall']
|
| 281 |
+
metrics['PA-F1'] = PointF1PA['F1_PA']
|
| 282 |
+
metrics['PA-Precision'] = PointF1PA['P_PA']
|
| 283 |
+
metrics['PA-Recall'] = PointF1PA['R_PA']
|
| 284 |
+
# metrics['Event-based-F1'] = EventF1PA
|
| 285 |
+
# metrics['R-based-F1'] = RF1
|
| 286 |
+
metrics['Affiliation-F'] = Affiliation_F
|
| 287 |
+
metrics['Affiliation-P'] = Affiliation_P
|
| 288 |
+
metrics['Affiliation-R'] = Affiliation_R
|
| 289 |
+
|
| 290 |
+
metrics['F1_T'] = T_score['F1_T']
|
| 291 |
+
metrics['Precision_T'] = T_score['P_T']
|
| 292 |
+
metrics['Recall_T'] = T_score['R_T']
|
| 293 |
+
|
| 294 |
+
return metrics
|
| 295 |
+
|
| 296 |
+
|
| 297 |
+
def get_metrics_pred(score, labels, pred, slidingWindow=100):
|
| 298 |
+
metrics = {}
|
| 299 |
+
|
| 300 |
+
# Ensure proper data types to avoid float/integer issues
|
| 301 |
+
labels = np.asarray(labels, dtype=int)
|
| 302 |
+
score = np.asarray(score, dtype=float)
|
| 303 |
+
pred = np.asarray(pred, dtype=int)
|
| 304 |
+
|
| 305 |
+
grader = basic_metricor()
|
| 306 |
+
|
| 307 |
+
PointF1 = grader.standard_F1(labels, score, preds=pred)
|
| 308 |
+
PointF1PA = grader.metric_PointF1PA(labels, score, preds=pred)
|
| 309 |
+
EventF1PA = grader.metric_EventF1PA(labels, score, preds=pred)
|
| 310 |
+
RF1 = grader.metric_RF1(labels, score, preds=pred)
|
| 311 |
+
Affiliation_F, Affiliation_P, Affiliation_R = grader.metric_Affiliation(labels, score, preds=pred)
|
| 312 |
+
VUS_R, VUS_P, VUS_F = grader.metric_VUS_pred(labels, preds=pred, windowSize=slidingWindow)
|
| 313 |
+
|
| 314 |
+
metrics['Standard-F1'] = PointF1['F1']
|
| 315 |
+
metrics['Standard-Precision'] = PointF1['Precision']
|
| 316 |
+
metrics['Standard-Recall'] = PointF1['Recall']
|
| 317 |
+
metrics['PA-F1'] = PointF1PA
|
| 318 |
+
metrics['Event-based-F1'] = EventF1PA
|
| 319 |
+
metrics['R-based-F1'] = RF1
|
| 320 |
+
metrics['Affiliation-F'] = Affiliation_F
|
| 321 |
+
metrics['Affiliation-P'] = Affiliation_P
|
| 322 |
+
metrics['Affiliation-R'] = Affiliation_R
|
| 323 |
+
|
| 324 |
+
metrics['VUS-Recall'] = VUS_R
|
| 325 |
+
metrics['VUS-Precision'] = VUS_P
|
| 326 |
+
metrics['VUS-F'] = VUS_F
|
| 327 |
+
|
| 328 |
+
return metrics
|
| 329 |
+
|
| 330 |
+
def find_length_rank(data, rank=1):
|
| 331 |
+
data = data.squeeze()
|
| 332 |
+
if len(data.shape) > 1:
|
| 333 |
+
return 0
|
| 334 |
+
if rank == 0:
|
| 335 |
+
return 1
|
| 336 |
+
data = data[: min(20000, len(data))]
|
| 337 |
+
|
| 338 |
+
base = 3
|
| 339 |
+
auto_corr = acf(data, nlags=400, fft=True)[base:]
|
| 340 |
+
|
| 341 |
+
# plot_acf(data, lags=400, fft=True)
|
| 342 |
+
# plt.xlabel('Lags')
|
| 343 |
+
# plt.ylabel('Autocorrelation')
|
| 344 |
+
# plt.title('Autocorrelation Function (ACF)')
|
| 345 |
+
# plt.savefig('/data/liuqinghua/code/ts/TSAD-AutoML/AutoAD_Solution/candidate_pool/cd_diagram/ts_acf.png')
|
| 346 |
+
|
| 347 |
+
local_max = argrelextrema(auto_corr, np.greater)[0]
|
| 348 |
+
|
| 349 |
+
# print('auto_corr: ', auto_corr)
|
| 350 |
+
# print('local_max: ', local_max)
|
| 351 |
+
|
| 352 |
+
try:
|
| 353 |
+
# max_local_max = np.argmax([auto_corr[lcm] for lcm in local_max])
|
| 354 |
+
sorted_local_max = np.argsort([auto_corr[lcm] for lcm in local_max])[::-1] # Descending order
|
| 355 |
+
max_local_max = sorted_local_max[0] # Default
|
| 356 |
+
if rank == 1:
|
| 357 |
+
max_local_max = sorted_local_max[0]
|
| 358 |
+
if rank == 2:
|
| 359 |
+
for i in sorted_local_max[1:]:
|
| 360 |
+
if i > sorted_local_max[0]:
|
| 361 |
+
max_local_max = i
|
| 362 |
+
break
|
| 363 |
+
if rank == 3:
|
| 364 |
+
id_tmp = 1
|
| 365 |
+
for i in sorted_local_max[1:]:
|
| 366 |
+
if i > sorted_local_max[0]:
|
| 367 |
+
id_tmp = i
|
| 368 |
+
break
|
| 369 |
+
for i in sorted_local_max[id_tmp:]:
|
| 370 |
+
if i > sorted_local_max[id_tmp]:
|
| 371 |
+
max_local_max = i
|
| 372 |
+
break
|
| 373 |
+
# print('sorted_local_max: ', sorted_local_max)
|
| 374 |
+
# print('max_local_max: ', max_local_max)
|
| 375 |
+
if local_max[max_local_max] < 3 or local_max[max_local_max] > 300:
|
| 376 |
+
return 125
|
| 377 |
+
return local_max[max_local_max] + base
|
| 378 |
+
except Exception:
|
| 379 |
+
return 125
|
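A small sketch of `find_length_rank` on a synthetic periodic series; with a clean sine of period 50 the estimated window should come out close to 50 (import path assumed, data fabricated for illustration):

import numpy as np
from evaluation.metrics import find_length_rank

t = np.arange(2000)
series = np.sin(2 * np.pi * t / 50) + 0.05 * np.random.randn(2000)

print(find_length_rank(series, rank=1))   # expected to be close to 50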
evaluation/visualize.py
ADDED
|
@@ -0,0 +1,99 @@
|
| 1 |
+
from basic_metrics import metricor
|
| 2 |
+
import matplotlib.pyplot as plt
|
| 3 |
+
import numpy as np
|
| 4 |
+
import matplotlib.patches as mpatches
|
| 5 |
+
|
| 6 |
+
def plotFig(data, label, score, slidingWindow, fileName, modelName, plotRange=None):
|
| 7 |
+
grader = metricor()
|
| 8 |
+
|
| 9 |
+
R_AUC, R_AP, R_fpr, R_tpr, R_prec = grader.RangeAUC(labels=label, score=score, window=slidingWindow, plot_ROC=True) #
|
| 10 |
+
|
| 11 |
+
L, fpr, tpr= grader.metric_new(label, score, plot_ROC=True)
|
| 12 |
+
precision, recall, AP = grader.metric_PR(label, score)
|
| 13 |
+
|
| 14 |
+
range_anomaly = grader.range_convers_new(label)
|
| 15 |
+
# print(range_anomaly)
|
| 16 |
+
|
| 17 |
+
# max_length = min(len(score),len(data), 20000)
|
| 18 |
+
max_length = len(score)
|
| 19 |
+
|
| 20 |
+
if plotRange is None:
|
| 21 |
+
plotRange = [0,max_length]
|
| 22 |
+
|
| 23 |
+
fig3 = plt.figure(figsize=(12, 10), constrained_layout=True)
|
| 24 |
+
gs = fig3.add_gridspec(3, 4)
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
f3_ax1 = fig3.add_subplot(gs[0, :-1])
|
| 28 |
+
plt.tick_params(labelbottom=False)
|
| 29 |
+
|
| 30 |
+
plt.plot(data[:max_length],'k')
|
| 31 |
+
for r in range_anomaly:
|
| 32 |
+
if r[0]==r[1]:
|
| 33 |
+
plt.plot(r[0],data[r[0]],'r.')
|
| 34 |
+
else:
|
| 35 |
+
plt.plot(range(r[0],r[1]+1),data[range(r[0],r[1]+1)],'r')
|
| 36 |
+
# plt.xlim([0,max_length])
|
| 37 |
+
plt.xlim(plotRange)
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
# L = [auc, precision, recall, f, Rrecall, ExistenceReward,
|
| 41 |
+
# OverlapReward, Rprecision, Rf, precision_at_k]
|
| 42 |
+
f3_ax2 = fig3.add_subplot(gs[1, :-1])
|
| 43 |
+
# plt.tick_params(labelbottom=False)
|
| 44 |
+
L1 = [ '%.2f' % elem for elem in L]
|
| 45 |
+
plt.plot(score[:max_length])
|
| 46 |
+
plt.hlines(np.mean(score)+3*np.std(score),0,max_length,linestyles='--',color='red')
|
| 47 |
+
plt.ylabel('score')
|
| 48 |
+
# plt.xlim([0,max_length])
|
| 49 |
+
plt.xlim(plotRange)
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
#plot the data
|
| 53 |
+
f3_ax3 = fig3.add_subplot(gs[2, :-1])
|
| 54 |
+
index = ( label + 2*(score > (np.mean(score)+3*np.std(score))))
|
| 55 |
+
cf = lambda x: 'k' if x==0 else ('r' if x == 1 else ('g' if x == 2 else 'b') )
|
| 56 |
+
cf = np.vectorize(cf)
|
| 57 |
+
|
| 58 |
+
color = cf(index[:max_length])
|
| 59 |
+
black_patch = mpatches.Patch(color = 'black', label = 'TN')
|
| 60 |
+
red_patch = mpatches.Patch(color = 'red', label = 'FN')
|
| 61 |
+
green_patch = mpatches.Patch(color = 'green', label = 'FP')
|
| 62 |
+
blue_patch = mpatches.Patch(color = 'blue', label = 'TP')
|
| 63 |
+
plt.scatter(np.arange(max_length), data[:max_length], c=color, marker='.')
|
| 64 |
+
plt.legend(handles = [black_patch, red_patch, green_patch, blue_patch], loc= 'best')
|
| 65 |
+
# plt.xlim([0,max_length])
|
| 66 |
+
plt.xlim(plotRange)
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
f3_ax4 = fig3.add_subplot(gs[0, -1])
|
| 70 |
+
plt.plot(fpr, tpr)
|
| 71 |
+
# plt.plot(R_fpr,R_tpr)
|
| 72 |
+
# plt.title('R_AUC='+str(round(R_AUC,3)))
|
| 73 |
+
plt.xlabel('FPR')
|
| 74 |
+
plt.ylabel('TPR')
|
| 75 |
+
# plt.legend(['ROC','Range-ROC'])
|
| 76 |
+
|
| 77 |
+
# f3_ax5 = fig3.add_subplot(gs[1, -1])
|
| 78 |
+
# plt.plot(recall, precision)
|
| 79 |
+
# plt.plot(R_tpr[:-1],R_prec) # I add (1,1) to (TPR, FPR) at the end !!!
|
| 80 |
+
# plt.xlabel('Recall')
|
| 81 |
+
# plt.ylabel('Precision')
|
| 82 |
+
# plt.legend(['PR','Range-PR'])
|
| 83 |
+
|
| 84 |
+
# print('AUC=', L1[0])
|
| 85 |
+
# print('F=', L1[3])
|
| 86 |
+
|
| 87 |
+
plt.suptitle(fileName + ' window='+str(slidingWindow) +' '+ modelName
|
| 88 |
+
+'\nAUC='+L1[0]+' R_AUC='+str(round(R_AUC,2))+' Precision='+L1[1]+ ' Recall='+L1[2]+' F='+L1[3]
|
| 89 |
+
+ ' ExistenceReward='+L1[5]+' OverlapReward='+L1[6]
|
| 90 |
+
+'\nAP='+str(round(AP,2))+' R_AP='+str(round(R_AP,2))+' Precision@k='+L1[9]+' Rprecision='+L1[7] + ' Rrecall='+L1[4] +' Rf='+L1[8]
|
| 91 |
+
)
|
| 92 |
+
|
| 93 |
+
def printResult(data, label, score, slidingWindow, fileName, modelName):
|
| 94 |
+
grader = metricor()
|
| 95 |
+
R_AUC = grader.RangeAUC(labels=label, score=score, window=slidingWindow, plot_ROC=False) #
|
| 96 |
+
L= grader.metric_new(label, score, plot_ROC=False)
|
| 97 |
+
L.append(R_AUC)
|
| 98 |
+
return L
|
| 99 |
+
|
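A hedged usage sketch for `plotFig` on synthetic data. It assumes the script is run so that `from basic_metrics import metricor` resolves (for example from inside the `evaluation/` folder) and that the `metricor` methods called above are available; the data, labels and scores below are fabricated purely for illustration.

import numpy as np
import matplotlib.pyplot as plt
from visualize import plotFig

n = 1000
data = np.sin(np.linspace(0, 40 * np.pi, n)) + 0.05 * np.random.randn(n)
label = np.zeros(n, dtype=int)
label[480:520] = 1                    # one injected anomalous window
score = np.abs(np.random.randn(n))    # stand-in anomaly score
score[480:520] += 3                   # make the anomaly stand out

plotFig(data, label, score, slidingWindow=50, fileName='demo', modelName='toy')
plt.show()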
model_wrapper.py
ADDED
|
@@ -0,0 +1,532 @@
|
| 1 |
+
import numpy as np
|
| 2 |
+
import math
|
| 3 |
+
from utils.slidingWindows import find_length_rank
|
| 4 |
+
|
| 5 |
+
Unsupervise_AD_Pool = ['FFT', 'SR', 'NORMA', 'Series2Graph', 'Sub_IForest', 'IForest', 'LOF', 'Sub_LOF', 'POLY', 'MatrixProfile', 'Sub_PCA', 'PCA', 'HBOS',
|
| 6 |
+
'Sub_HBOS', 'KNN', 'Sub_KNN','KMeansAD', 'KMeansAD_U', 'KShapeAD', 'COPOD', 'CBLOF', 'COF', 'EIF', 'RobustPCA', 'Lag_Llama',
|
| 7 |
+
'TimesFM', 'Chronos', 'MOMENT_ZS', 'DADA', 'Time_MOE', 'Time_RCD', 'TSPulse']
|
| 8 |
+
Semisupervise_AD_Pool = ['Left_STAMPi', 'SAND', 'MCD', 'Sub_MCD', 'OCSVM', 'Sub_OCSVM', 'AutoEncoder', 'CNN', 'LSTMAD', 'TranAD', 'USAD', 'OmniAnomaly',
|
| 9 |
+
'AnomalyTransformer', 'TimesNet', 'FITS', 'Donut', 'OFA', 'MOMENT_FT', 'M2N2', ]
|
| 10 |
+
|
| 11 |
+
def run_Unsupervise_AD(model_name, training_data, testing_data, **kwargs):
|
| 12 |
+
# Extract data_index if present, but don't pass it to all functions
|
| 13 |
+
data_index = kwargs.pop('data_index', None)
|
| 14 |
+
|
| 15 |
+
function_name = f'run_{model_name}'
|
| 16 |
+
function_to_call = globals()[function_name]
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
# Only pass data_index to functions that need it
|
| 20 |
+
if 'Reconstruction' in model_name:
|
| 21 |
+
results = function_to_call(testing_data, data_index, **kwargs)
|
| 22 |
+
else:
|
| 23 |
+
results = function_to_call(testing_data, **kwargs)
|
| 24 |
+
|
| 25 |
+
return results
|
| 26 |
+
|
| 27 |
+
def run_Semisupervise_AD(model_name, data_train, data_test, **kwargs):
|
| 28 |
+
try:
|
| 29 |
+
function_name = f'run_{model_name}'
|
| 30 |
+
function_to_call = globals()[function_name]
|
| 31 |
+
results = function_to_call(data_train, data_test, **kwargs)
|
| 32 |
+
return results
|
| 33 |
+
except KeyError:
|
| 34 |
+
error_message = f"Model function '{function_name}' is not defined."
|
| 35 |
+
print(error_message)
|
| 36 |
+
return error_message
|
| 37 |
+
except Exception as e:
|
| 38 |
+
error_message = f"An error occurred while running the model '{function_name}': {str(e)}"
|
| 39 |
+
print(error_message)
|
| 40 |
+
return error_message
|
| 41 |
+
|
| 42 |
+
def run_FFT(data, ifft_parameters=5, local_neighbor_window=21, local_outlier_threshold=0.6, max_region_size=50, max_sign_change_distance=10):
|
| 43 |
+
from models.FFT import FFT
|
| 44 |
+
clf = FFT(ifft_parameters=ifft_parameters, local_neighbor_window=local_neighbor_window, local_outlier_threshold=local_outlier_threshold, max_region_size=max_region_size, max_sign_change_distance=max_sign_change_distance)
|
| 45 |
+
clf.fit(data)
|
| 46 |
+
score = clf.decision_scores_
|
| 47 |
+
return score.ravel()
|
| 48 |
+
|
| 49 |
+
def run_Sub_IForest(data, periodicity=1, n_estimators=100, max_features=1, n_jobs=1):
|
| 50 |
+
from models.IForest import IForest
|
| 51 |
+
slidingWindow = find_length_rank(data, rank=periodicity)
|
| 52 |
+
clf = IForest(slidingWindow=slidingWindow, n_estimators=n_estimators, max_features=max_features, n_jobs=n_jobs)
|
| 53 |
+
clf.fit(data)
|
| 54 |
+
score = clf.decision_scores_
|
| 55 |
+
return score.ravel()
|
| 56 |
+
|
| 57 |
+
def run_IForest(train_data, test_data, slidingWindow=100, n_estimators=100, max_features=1, n_jobs=1):
|
| 58 |
+
from models.IForest import IForest
|
| 59 |
+
clf = IForest(slidingWindow=slidingWindow, n_estimators=n_estimators, max_features=max_features, n_jobs=n_jobs)
|
| 60 |
+
clf.fit(train_data)
|
| 61 |
+
score = clf.decision_function(test_data)
|
| 62 |
+
# score = clf.decision_scores_
|
| 63 |
+
return score.ravel()
|
| 64 |
+
|
| 65 |
+
def run_Sub_LOF(data, periodicity=1, n_neighbors=30, metric='minkowski', n_jobs=1):
|
| 66 |
+
from models.LOF import LOF
|
| 67 |
+
slidingWindow = find_length_rank(data, rank=periodicity)
|
| 68 |
+
clf = LOF(slidingWindow=slidingWindow, n_neighbors=n_neighbors, metric=metric, n_jobs=n_jobs)
|
| 69 |
+
clf.fit(data)
|
| 70 |
+
score = clf.decision_scores_
|
| 71 |
+
return score.ravel()
|
| 72 |
+
|
| 73 |
+
def run_LOF(train_data, test_data, slidingWindow=1, n_neighbors=30, metric='minkowski', n_jobs=1):
|
| 74 |
+
from models.LOF import LOF
|
| 75 |
+
clf = LOF(slidingWindow=slidingWindow, n_neighbors=n_neighbors, metric=metric, n_jobs=n_jobs)
|
| 76 |
+
clf.fit(train_data)
|
| 77 |
+
score = clf.decision_function(test_data)
|
| 78 |
+
return score.ravel()
|
| 79 |
+
|
| 80 |
+
def run_POLY(data, periodicity=1, power=3, n_jobs=1):
|
| 81 |
+
from models.POLY import POLY
|
| 82 |
+
slidingWindow = find_length_rank(data, rank=periodicity)
|
| 83 |
+
clf = POLY(power=power, window = slidingWindow)
|
| 84 |
+
clf.fit(data)
|
| 85 |
+
score = clf.decision_scores_
|
| 86 |
+
return score.ravel()
|
| 87 |
+
|
| 88 |
+
def run_MatrixProfile(data, periodicity=1, n_jobs=1):
|
| 89 |
+
from models.MatrixProfile import MatrixProfile
|
| 90 |
+
slidingWindow = find_length_rank(data, rank=periodicity)
|
| 91 |
+
clf = MatrixProfile(window=slidingWindow)
|
| 92 |
+
clf.fit(data)
|
| 93 |
+
score = clf.decision_scores_
|
| 94 |
+
return score.ravel()
|
| 95 |
+
|
| 96 |
+
def run_Left_STAMPi(data_train, data):
|
| 97 |
+
from models.Left_STAMPi import Left_STAMPi
|
| 98 |
+
clf = Left_STAMPi(n_init_train=len(data_train), window_size=100)
|
| 99 |
+
clf.fit(data)
|
| 100 |
+
score = clf.decision_function(data)
|
| 101 |
+
return score.ravel()
|
| 102 |
+
|
| 103 |
+
def run_SAND(data_train, data_test, periodicity=1):
|
| 104 |
+
from models.SAND import SAND
|
| 105 |
+
slidingWindow = find_length_rank(data_test, rank=periodicity)
|
| 106 |
+
clf = SAND(pattern_length=slidingWindow, subsequence_length=4*(slidingWindow))
|
| 107 |
+
clf.fit(data_test.squeeze(), online=True, overlaping_rate=int(1.5*slidingWindow), init_length=len(data_train), alpha=0.5, batch_size=max(5*(slidingWindow), int(0.1*len(data_test))))
|
| 108 |
+
score = clf.decision_scores_
|
| 109 |
+
return score.ravel()
|
| 110 |
+
|
| 111 |
+
def run_KShapeAD(data, periodicity=1):
|
| 112 |
+
from models.SAND import SAND
|
| 113 |
+
slidingWindow = find_length_rank(data, rank=periodicity)
|
| 114 |
+
clf = SAND(pattern_length=slidingWindow, subsequence_length=4*(slidingWindow))
|
| 115 |
+
clf.fit(data.squeeze(), overlaping_rate=int(1.5*slidingWindow))
|
| 116 |
+
score = clf.decision_scores_
|
| 117 |
+
return score.ravel()
|
| 118 |
+
|
| 119 |
+
def run_Series2Graph(data, periodicity=1):
|
| 120 |
+
from models.Series2Graph import Series2Graph
|
| 121 |
+
slidingWindow = find_length_rank(data, rank=periodicity)
|
| 122 |
+
|
| 123 |
+
data = data.squeeze()
|
| 124 |
+
s2g = Series2Graph(pattern_length=slidingWindow)
|
| 125 |
+
s2g.fit(data)
|
| 126 |
+
query_length = 2*slidingWindow
|
| 127 |
+
s2g.score(query_length=query_length,dataset=data)
|
| 128 |
+
|
| 129 |
+
score = s2g.decision_scores_
|
| 130 |
+
score = np.array([score[0]]*math.ceil(query_length//2) + list(score) + [score[-1]]*(query_length//2))
|
| 131 |
+
return score.ravel()
|
| 132 |
+
|
| 133 |
+
def run_Sub_PCA(train_data, test_data, periodicity=1, n_components=None, n_jobs=1):
|
| 134 |
+
from models.PCA import PCA
|
| 135 |
+
slidingWindow = find_length_rank(train_data, rank=periodicity)
|
| 136 |
+
clf = PCA(slidingWindow = slidingWindow, n_components=n_components)
|
| 137 |
+
clf.fit(train_data)
|
| 138 |
+
score = clf.decision_function(test_data)
|
| 139 |
+
return score.ravel()
|
| 140 |
+
|
| 141 |
+
def run_PCA(train_data, test_data, slidingWindow=100, n_components=None, n_jobs=1):
|
| 142 |
+
from models.PCA import PCA
|
| 143 |
+
clf = PCA(slidingWindow=slidingWindow, n_components=n_components)
|
| 144 |
+
clf.fit(train_data)
|
| 145 |
+
score = clf.decision_function(test_data)
|
| 146 |
+
return score.ravel()
|
| 147 |
+
|
| 148 |
+
def run_NORMA(data, periodicity=1, clustering='hierarchical', n_jobs=1):
|
| 149 |
+
from models.NormA import NORMA
|
| 150 |
+
slidingWindow = find_length_rank(data, rank=periodicity)
|
| 151 |
+
clf = NORMA(pattern_length=slidingWindow, nm_size=3*slidingWindow, clustering=clustering)
|
| 152 |
+
clf.fit(data)
|
| 153 |
+
score = clf.decision_scores_
|
| 154 |
+
score = np.array([score[0]]*math.ceil((slidingWindow-1)/2) + list(score) + [score[-1]]*((slidingWindow-1)//2))
|
| 155 |
+
if len(score) > len(data):
|
| 156 |
+
start = len(score) - len(data)
|
| 157 |
+
score = score[start:]
|
| 158 |
+
return score.ravel()
|
| 159 |
+
|
| 160 |
+
def run_Sub_HBOS(data, periodicity=1, n_bins=10, tol=0.5, n_jobs=1):
|
| 161 |
+
from models.HBOS import HBOS
|
| 162 |
+
slidingWindow = find_length_rank(data, rank=periodicity)
|
| 163 |
+
clf = HBOS(slidingWindow=slidingWindow, n_bins=n_bins, tol=tol)
|
| 164 |
+
clf.fit(data)
|
| 165 |
+
score = clf.decision_scores_
|
| 166 |
+
return score.ravel()
|
| 167 |
+
|
| 168 |
+
def run_HBOS(data, slidingWindow=1, n_bins=10, tol=0.5, n_jobs=1):
|
| 169 |
+
from models.HBOS import HBOS
|
| 170 |
+
clf = HBOS(slidingWindow=slidingWindow, n_bins=n_bins, tol=tol)
|
| 171 |
+
clf.fit(data)
|
| 172 |
+
score = clf.decision_scores_
|
| 173 |
+
return score.ravel()
|
| 174 |
+
|
| 175 |
+
def run_Sub_OCSVM(data_train, data_test, kernel='rbf', nu=0.5, periodicity=1, n_jobs=1):
|
| 176 |
+
from models.OCSVM import OCSVM
|
| 177 |
+
slidingWindow = find_length_rank(data_test, rank=periodicity)
|
| 178 |
+
clf = OCSVM(slidingWindow=slidingWindow, kernel=kernel, nu=nu)
|
| 179 |
+
clf.fit(data_train)
|
| 180 |
+
score = clf.decision_function(data_test)
|
| 181 |
+
return score.ravel()
|
| 182 |
+
|
| 183 |
+
def run_OCSVM(data_train, data_test, kernel='rbf', nu=0.5, slidingWindow=1, n_jobs=1):
|
| 184 |
+
from models.OCSVM import OCSVM
|
| 185 |
+
clf = OCSVM(slidingWindow=slidingWindow, kernel=kernel, nu=nu)
|
| 186 |
+
clf.fit(data_train)
|
| 187 |
+
score = clf.decision_function(data_test)
|
| 188 |
+
return score.ravel()
|
| 189 |
+
|
| 190 |
+
def run_Sub_MCD(data_train, data_test, support_fraction=None, periodicity=1, n_jobs=1):
|
| 191 |
+
from models.MCD import MCD
|
| 192 |
+
slidingWindow = find_length_rank(data_test, rank=periodicity)
|
| 193 |
+
clf = MCD(slidingWindow=slidingWindow, support_fraction=support_fraction)
|
| 194 |
+
clf.fit(data_train)
|
| 195 |
+
score = clf.decision_function(data_test)
|
| 196 |
+
return score.ravel()
|
| 197 |
+
|
| 198 |
+
def run_MCD(data_train, data_test, support_fraction=None, slidingWindow=1, n_jobs=1):
|
| 199 |
+
from models.MCD import MCD
|
| 200 |
+
clf = MCD(slidingWindow=slidingWindow, support_fraction=support_fraction)
|
| 201 |
+
clf.fit(data_train)
|
| 202 |
+
score = clf.decision_function(data_test)
|
| 203 |
+
return score.ravel()
|
| 204 |
+
|
| 205 |
+
def run_Sub_KNN(data, n_neighbors=10, method='largest', periodicity=1, n_jobs=1):
|
| 206 |
+
from models.KNN import KNN
|
| 207 |
+
slidingWindow = find_length_rank(data, rank=periodicity)
|
| 208 |
+
clf = KNN(slidingWindow=slidingWindow, n_neighbors=n_neighbors, method=method, n_jobs=n_jobs)
|
| 209 |
+
clf.fit(data)
|
| 210 |
+
score = clf.decision_scores_
|
| 211 |
+
return score.ravel()
|
| 212 |
+
|
| 213 |
+
def run_KNN(data, slidingWindow=1, n_neighbors=10, method='largest', n_jobs=1):
|
| 214 |
+
from models.KNN import KNN
|
| 215 |
+
clf = KNN(slidingWindow=slidingWindow, n_neighbors=n_neighbors, method=method, n_jobs=n_jobs)
|
| 216 |
+
clf.fit(data)
|
| 217 |
+
score = clf.decision_scores_
|
| 218 |
+
return score.ravel()
|
| 219 |
+
|
| 220 |
+
def run_KMeansAD(data, n_clusters=20, window_size=20, n_jobs=1):
|
| 221 |
+
from models.KMeansAD import KMeansAD
|
| 222 |
+
clf = KMeansAD(k=n_clusters, window_size=window_size, stride=1, n_jobs=n_jobs)
|
| 223 |
+
score = clf.fit_predict(data)
|
| 224 |
+
return score.ravel()
|
| 225 |
+
|
| 226 |
+
def run_KMeansAD_U(data, n_clusters=20, periodicity=1, n_jobs=1):
|
| 227 |
+
from models.KMeansAD import KMeansAD
|
| 228 |
+
slidingWindow = find_length_rank(data, rank=periodicity)
|
| 229 |
+
clf = KMeansAD(k=n_clusters, window_size=slidingWindow, stride=1, n_jobs=n_jobs)
|
| 230 |
+
score = clf.fit_predict(data)
|
| 231 |
+
return score.ravel()
|
| 232 |
+
|
| 233 |
+
def run_COPOD(data, n_jobs=1):
|
| 234 |
+
from models.COPOD import COPOD
|
| 235 |
+
clf = COPOD(n_jobs=n_jobs)
|
| 236 |
+
clf.fit(data)
|
| 237 |
+
score = clf.decision_scores_
|
| 238 |
+
return score.ravel()
|
| 239 |
+
|
| 240 |
+
def run_CBLOF(data, n_clusters=8, alpha=0.9, n_jobs=1):
|
| 241 |
+
from models.CBLOF import CBLOF
|
| 242 |
+
clf = CBLOF(n_clusters=n_clusters, alpha=alpha, n_jobs=n_jobs)
|
| 243 |
+
clf.fit(data)
|
| 244 |
+
score = clf.decision_scores_
|
| 245 |
+
return score.ravel()
|
| 246 |
+
|
| 247 |
+
def run_COF(data, n_neighbors=30):
|
| 248 |
+
from models.COF import COF
|
| 249 |
+
clf = COF(n_neighbors=n_neighbors)
|
| 250 |
+
clf.fit(data)
|
| 251 |
+
score = clf.decision_scores_
|
| 252 |
+
return score.ravel()
|
| 253 |
+
|
| 254 |
+
def run_EIF(data, n_trees=100):
|
| 255 |
+
from models.EIF import EIF
|
| 256 |
+
clf = EIF(n_trees=n_trees)
|
| 257 |
+
clf.fit(data)
|
| 258 |
+
score = clf.decision_scores_
|
| 259 |
+
return score.ravel()
|
| 260 |
+
|
| 261 |
+
def run_RobustPCA(data, max_iter=1000):
|
| 262 |
+
from models.RobustPCA import RobustPCA
|
| 263 |
+
clf = RobustPCA(max_iter=max_iter)
|
| 264 |
+
clf.fit(data)
|
| 265 |
+
score = clf.decision_scores_
|
| 266 |
+
return score.ravel()
|
| 267 |
+
|
| 268 |
+
def run_SR(data, periodicity=1):
|
| 269 |
+
from models.SR import SR
|
| 270 |
+
slidingWindow = find_length_rank(data, rank=periodicity)
|
| 271 |
+
return SR(data, window_size=slidingWindow)
|
| 272 |
+
|
| 273 |
+
def run_AutoEncoder(data_train, data_test, window_size=100, hidden_neurons=[64, 32], n_jobs=1):
|
| 274 |
+
from models.AE import AutoEncoder
|
| 275 |
+
clf = AutoEncoder(slidingWindow=window_size, hidden_neurons=hidden_neurons, batch_size=128, epochs=50)
|
| 276 |
+
clf.fit(data_train)
|
| 277 |
+
score = clf.decision_function(data_test)
|
| 278 |
+
return score.ravel()
|
| 279 |
+
|
| 280 |
+
def run_CNN(data_train, data_test, window_size=100, num_channel=[32, 32, 40], lr=0.0008, n_jobs=1):
|
| 281 |
+
from models.CNN import CNN
|
| 282 |
+
clf = CNN(window_size=window_size, num_channel=num_channel, feats=data_test.shape[1], lr=lr, batch_size=128)
|
| 283 |
+
clf.fit(data_train)
|
| 284 |
+
score = clf.decision_function(data_test)
|
| 285 |
+
return score.ravel()
|
| 286 |
+
|
| 287 |
+
def run_LSTMAD(data_train, data_test, window_size=100, lr=0.0008):
|
| 288 |
+
from models.LSTMAD import LSTMAD
|
| 289 |
+
clf = LSTMAD(window_size=window_size, pred_len=1, lr=lr, feats=data_test.shape[1], batch_size=128)
|
| 290 |
+
clf.fit(data_train)
|
| 291 |
+
score = clf.decision_function(data_test)
|
| 292 |
+
return score.ravel()
|
| 293 |
+
|
| 294 |
+
def run_TranAD(data_train, data_test, win_size=10, lr=1e-3):
|
| 295 |
+
from models.TranAD import TranAD
|
| 296 |
+
clf = TranAD(win_size=win_size, feats=data_test.shape[1], lr=lr)
|
| 297 |
+
clf.fit(data_train)
|
| 298 |
+
score = clf.decision_function(data_test)
|
| 299 |
+
return score.ravel()
|
| 300 |
+
|
| 301 |
+
def run_AnomalyTransformer(data_train, data_test, win_size=100, lr=1e-4, batch_size=128):
|
| 302 |
+
from models.AnomalyTransformer import AnomalyTransformer
|
| 303 |
+
clf = AnomalyTransformer(win_size=win_size, input_c=data_test.shape[1], lr=lr, batch_size=batch_size)
|
| 304 |
+
clf.fit(data_train)
|
| 305 |
+
score = clf.decision_function(data_test)
|
| 306 |
+
return score.ravel()
|
| 307 |
+
|
| 308 |
+
def run_OmniAnomaly(data_train, data_test, win_size=100, lr=0.002):
|
| 309 |
+
from models.OmniAnomaly import OmniAnomaly
|
| 310 |
+
clf = OmniAnomaly(win_size=win_size, feats=data_test.shape[1], lr=lr)
|
| 311 |
+
clf.fit(data_train)
|
| 312 |
+
score = clf.decision_function(data_test)
|
| 313 |
+
return score.ravel()
|
| 314 |
+
|
| 315 |
+
def run_USAD(data_train, data_test, win_size=5, lr=1e-4):
|
| 316 |
+
from models.USAD import USAD
|
| 317 |
+
clf = USAD(win_size=win_size, feats=data_test.shape[1], lr=lr)
|
| 318 |
+
clf.fit(data_train)
|
| 319 |
+
score = clf.decision_function(data_test)
|
| 320 |
+
return score.ravel()
|
| 321 |
+
|
| 322 |
+
def run_Donut(data_train, data_test, win_size=120, lr=1e-4, batch_size=128):
|
| 323 |
+
from models.Donut import Donut
|
| 324 |
+
clf = Donut(win_size=win_size, input_c=data_test.shape[1], lr=lr, batch_size=batch_size)
|
| 325 |
+
clf.fit(data_train)
|
| 326 |
+
score = clf.decision_function(data_test)
|
| 327 |
+
return score.ravel()
|
| 328 |
+
|
| 329 |
+
def run_TimesNet(data_train, data_test, win_size=96, lr=1e-4):
|
| 330 |
+
from models.TimesNet import TimesNet
|
| 331 |
+
clf = TimesNet(win_size=win_size, enc_in=data_test.shape[1], lr=lr, epochs=50)
|
| 332 |
+
clf.fit(data_train)
|
| 333 |
+
score = clf.decision_function(data_test)
|
| 334 |
+
return score.ravel()
|
| 335 |
+
|
| 336 |
+
def run_FITS(data_train, data_test, win_size=100, lr=1e-3):
|
| 337 |
+
from models.FITS import FITS
|
| 338 |
+
clf = FITS(win_size=win_size, input_c=data_test.shape[1], lr=lr, batch_size=128)
|
| 339 |
+
clf.fit(data_train)
|
| 340 |
+
score = clf.decision_function(data_test)
|
| 341 |
+
return score.ravel()
|
| 342 |
+
|
| 343 |
+
def run_OFA(data_train, data_test, win_size=100, batch_size=64):
|
| 344 |
+
from models.OFA import OFA
|
| 345 |
+
clf = OFA(win_size=win_size, enc_in=data_test.shape[1], epochs=10, batch_size=batch_size)
|
| 346 |
+
clf.fit(data_train)
|
| 347 |
+
score = clf.decision_function(data_test)
|
| 348 |
+
return score.ravel()
|
| 349 |
+
|
| 350 |
+
def run_Lag_Llama(data, win_size=96, batch_size=64):
|
| 351 |
+
from models.Lag_Llama import Lag_Llama
|
| 352 |
+
clf = Lag_Llama(win_size=win_size, input_c=data.shape[1], batch_size=batch_size)
|
| 353 |
+
clf.fit(data)
|
| 354 |
+
score = clf.decision_scores_
|
| 355 |
+
return score.ravel()
|
| 356 |
+
|
| 357 |
+
def run_Chronos(data, win_size=50, batch_size=64):
|
| 358 |
+
from models.Chronos import Chronos
|
| 359 |
+
clf = Chronos(win_size=win_size, prediction_length=1, input_c=1, model_size='base', batch_size=batch_size)
|
| 360 |
+
data = data.reshape(-1, 1)
|
| 361 |
+
clf.fit(data)
|
| 362 |
+
score = clf.decision_scores_
|
| 363 |
+
return score.ravel()
|
| 364 |
+
|
| 365 |
+
def run_TimesFM(data, win_size=96):
|
| 366 |
+
from models.TimesFM import TimesFM
|
| 367 |
+
clf = TimesFM(win_size=win_size)
|
| 368 |
+
data_normalized = (data - np.mean(data, axis=0)) / np.std(data, axis=0)
|
| 369 |
+
data_normalized = data_normalized.reshape(-1,1)
|
| 370 |
+
clf.fit(data_normalized)
|
| 371 |
+
# scores are computed on the normalized series
|
| 372 |
+
score = clf.decision_scores_
|
| 373 |
+
return score.ravel()
|
| 374 |
+
|
| 375 |
+
def run_MOMENT_ZS(data, win_size=256):
|
| 376 |
+
from models.MOMENT import MOMENT
|
| 377 |
+
clf = MOMENT(win_size=win_size, input_c=1)
|
| 378 |
+
data = data.reshape(-1,1)
|
| 379 |
+
# Zero shot
|
| 380 |
+
clf.zero_shot(data)
|
| 381 |
+
score = clf.decision_scores_
|
| 382 |
+
return score.ravel()
|
| 383 |
+
|
| 384 |
+
def run_MOMENT_FT(data_train, data_test, win_size=256):
|
| 385 |
+
from models.MOMENT import MOMENT
|
| 386 |
+
clf = MOMENT(win_size=win_size, input_c=data_test.shape[1])
|
| 387 |
+
|
| 388 |
+
# Finetune
|
| 389 |
+
clf.fit(data_train)
|
| 390 |
+
score = clf.decision_function(data_test)
|
| 391 |
+
return score.ravel()
|
| 392 |
+
|
| 393 |
+
def run_M2N2(
|
| 394 |
+
data_train, data_test, win_size=12, stride=12,
|
| 395 |
+
batch_size=64, epochs=100, latent_dim=16,
|
| 396 |
+
lr=1e-3, ttlr=1e-3, normalization='Detrend',
|
| 397 |
+
gamma=0.99, th=0.9, valid_size=0.2, infer_mode='online'
|
| 398 |
+
):
|
| 399 |
+
from models.M2N2 import M2N2
|
| 400 |
+
clf = M2N2(
|
| 401 |
+
win_size=win_size, stride=stride,
|
| 402 |
+
num_channels=data_test.shape[1],
|
| 403 |
+
batch_size=batch_size, epochs=epochs,
|
| 404 |
+
latent_dim=latent_dim,
|
| 405 |
+
lr=lr, ttlr=ttlr,
|
| 406 |
+
normalization=normalization,
|
| 407 |
+
gamma=gamma, th=th, valid_size=valid_size,
|
| 408 |
+
infer_mode=infer_mode
|
| 409 |
+
)
|
| 410 |
+
clf.fit(data_train)
|
| 411 |
+
score = clf.decision_function(data_test)
|
| 412 |
+
return score.ravel()
|
| 413 |
+
|
| 414 |
+
def run_DADA(data_test, device=0, win_size=100, batch_size=32):
|
| 415 |
+
from models.DADA import DADA
|
| 416 |
+
clf = DADA(device=device, win_size=win_size, batch_size=batch_size)
|
| 417 |
+
score = clf.zero_shot(data_test)
|
| 418 |
+
return score.ravel()
|
| 419 |
+
|
| 420 |
+
def run_Time_MOE(data, device=0, win_size=64, batch_size=32):
|
| 421 |
+
from models.time_moe import Time_MOE
|
| 422 |
+
clf = Time_MOE(device=device, win_size=win_size, batch_size=batch_size)
|
| 423 |
+
score = clf.zero_shot(data)
|
| 424 |
+
return score.ravel()
|
| 425 |
+
|
| 426 |
+
def run_Time_RCD(data, **kwargs):
|
| 427 |
+
Multi = kwargs.get('Multi', False)
|
| 428 |
+
win_size = kwargs.get('win_size', 5000)
|
| 429 |
+
batch_size = kwargs.get('batch_size', 64)
|
| 430 |
+
random_mask = kwargs.get('random_mask', 'random_mask')
|
| 431 |
+
size = kwargs.get('size', 'full')
|
| 432 |
+
device = kwargs.get('device', '2') # Extract device parameter
|
| 433 |
+
"""
|
| 434 |
+
Wrapper function for Time_RCD model
|
| 435 |
+
"""
|
| 436 |
+
from models.TimeRCD import TimeRCDPretrainTester
|
| 437 |
+
from models.time_rcd.time_rcd_config import TimeRCDConfig, default_config
|
| 438 |
+
|
| 439 |
+
config = default_config
|
| 440 |
+
if Multi:
|
| 441 |
+
if size == 'small':
|
| 442 |
+
if random_mask == 'random_mask':
|
| 443 |
+
checkpoint_path = 'checkpoints/dataset_10_20.pth'
|
| 444 |
+
else:
|
| 445 |
+
checkpoint_path = 'checkpoints/full_mask_10_20.pth'
|
| 446 |
+
config.ts_config.patch_size = 16
|
| 447 |
+
else:
|
| 448 |
+
if random_mask == 'random_mask':
|
| 449 |
+
checkpoint_path = 'checkpoints/dataset_15_56.pth'
|
| 450 |
+
else:
|
| 451 |
+
checkpoint_path = 'checkpoints/full_mask_15_56.pth'
|
| 452 |
+
config.ts_config.patch_size = 32
|
| 453 |
+
else:
|
| 454 |
+
checkpoint_path = 'checkpoints/full_mask_anomaly_head_pretrain_checkpoint_best.pth'
|
| 455 |
+
config.ts_config.patch_size = 16
|
| 456 |
+
|
| 457 |
+
config.cuda_devices = device # Use the device parameter properly
|
| 458 |
+
print("Using CUDA device:", config.cuda_devices)
|
| 459 |
+
config.win_size = win_size
|
| 460 |
+
config.batch_size = batch_size
|
| 461 |
+
config.ts_config.num_features = data.shape[1]
|
| 462 |
+
print(f"Checkpoint path: {checkpoint_path}")
|
| 463 |
+
cls = TimeRCDPretrainTester(checkpoint_path, config)
|
| 464 |
+
score_list, logit_list = cls.zero_shot(data)
|
| 465 |
+
|
| 466 |
+
# Concatenate across batches robustly to avoid inhomogeneous shape errors
|
| 467 |
+
score = np.concatenate([np.asarray(s).reshape(-1) for s in score_list], axis=0)
|
| 468 |
+
logit = np.concatenate([np.asarray(l).reshape(-1) for l in logit_list], axis=0)
|
| 469 |
+
|
| 470 |
+
return score, logit
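# Note (illustrative, not part of the original file): unlike the other wrappers,
# run_Time_RCD returns a (scores, logits) pair, so callers unpack two arrays:
#   scores, logits = run_Time_RCD(series, win_size=5000, batch_size=64)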
|
| 471 |
+
|
| 472 |
+
|
| 473 |
+
def run_TSPulse(data, win_size=256, batch_size=64, prediction_mode=None, aggregation_length=64,
|
| 474 |
+
aggr_function="max", smoothing_length=8, least_significant_scale=0.01,
|
| 475 |
+
least_significant_score=0.1, device=None):
|
| 476 |
+
"""
|
| 477 |
+
Wrapper function for TSPulse anomaly detection model
|
| 478 |
+
|
| 479 |
+
Parameters
|
| 480 |
+
----------
|
| 481 |
+
data : numpy.ndarray
|
| 482 |
+
Time series data of shape (n_samples, n_features)
|
| 483 |
+
win_size : int, default=256
|
| 484 |
+
Window size (for compatibility, not directly used by TSPulse)
|
| 485 |
+
batch_size : int, default=64
|
| 486 |
+
Batch size for processing
|
| 487 |
+
prediction_mode : list, optional
|
| 488 |
+
List of prediction modes. If None, uses default time and frequency reconstruction
|
| 489 |
+
aggregation_length : int, default=64
|
| 490 |
+
Length for aggregation of scores
|
| 491 |
+
aggr_function : str, default="max"
|
| 492 |
+
Aggregation function ("max", "mean", "median")
|
| 493 |
+
smoothing_length : int, default=8
|
| 494 |
+
Length for smoothing the anomaly scores
|
| 495 |
+
least_significant_scale : float, default=0.01
|
| 496 |
+
Minimum scale for significance
|
| 497 |
+
least_significant_score : float, default=0.1
|
| 498 |
+
Minimum score for significance
|
| 499 |
+
device : str, optional
|
| 500 |
+
Device to use ("cuda" or "cpu"). Auto-detected if None.
|
| 501 |
+
|
| 502 |
+
Returns
|
| 503 |
+
-------
|
| 504 |
+
numpy.ndarray
|
| 505 |
+
Anomaly scores of shape (n_samples,)
|
| 506 |
+
"""
|
| 507 |
+
from models.TSPulse import run_TSPulse as tspulse_runner
|
| 508 |
+
|
| 509 |
+
# Prepare kwargs for TSPulse
|
| 510 |
+
kwargs = {
|
| 511 |
+
'batch_size': batch_size,
|
| 512 |
+
'aggregation_length': aggregation_length,
|
| 513 |
+
'aggr_function': aggr_function,
|
| 514 |
+
'smoothing_length': smoothing_length,
|
| 515 |
+
'least_significant_scale': least_significant_scale,
|
| 516 |
+
'least_significant_score': least_significant_score,
|
| 517 |
+
}
|
| 518 |
+
|
| 519 |
+
if prediction_mode is not None:
|
| 520 |
+
kwargs['prediction_mode'] = prediction_mode
|
| 521 |
+
if device is not None:
|
| 522 |
+
kwargs['device'] = device
|
| 523 |
+
|
| 524 |
+
try:
|
| 525 |
+
# Run TSPulse anomaly detection
|
| 526 |
+
score = tspulse_runner(data, **kwargs)
|
| 527 |
+
return score.ravel()
|
| 528 |
+
except Exception as e:
|
| 529 |
+
print(f"Warning: TSPulse failed with error: {str(e)}")
|
| 530 |
+
print("Falling back to random scores")
|
| 531 |
+
# Return random scores as fallback
|
| 532 |
+
return np.random.random(len(data)) * 0.1
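if __name__ == "__main__":
    # Minimal smoke test (an illustrative sketch, not part of the original file):
    # run one statistical wrapper end to end on a synthetic univariate series.
    # It assumes the `models` package in this repository is importable.
    rng = np.random.default_rng(0)
    series = np.sin(np.linspace(0, 40 * np.pi, 2000)).reshape(-1, 1)
    series[1200:1210] += 5.0                      # inject a short anomaly
    series += 0.05 * rng.standard_normal(series.shape)
    scores = run_KNN(series, slidingWindow=100, n_neighbors=10)
    print(scores.shape, float(scores.max()))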
|
models/.DS_Store
ADDED
|
Binary file (6.15 kB).
|
models/AE.py
ADDED
|
@@ -0,0 +1,407 @@
|
| 1 |
+
"""
|
| 2 |
+
This function is adapted from [pyod] by [yzhao062]
|
| 3 |
+
Original source: [https://github.com/yzhao062/pyod]
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from __future__ import division
|
| 7 |
+
from __future__ import print_function
|
| 8 |
+
|
| 9 |
+
import numpy as np
|
| 10 |
+
import torch, math
|
| 11 |
+
from sklearn.utils import check_array
|
| 12 |
+
from sklearn.utils.validation import check_is_fitted
|
| 13 |
+
from torch import nn
|
| 14 |
+
from sklearn.preprocessing import MinMaxScaler
|
| 15 |
+
|
| 16 |
+
from .feature import Window
|
| 17 |
+
from .base import BaseDetector
|
| 18 |
+
from ..utils.stat_models import pairwise_distances_no_broadcast
|
| 19 |
+
from ..utils.dataset import TSDataset
|
| 20 |
+
from ..utils.utility import get_activation_by_name
|
| 21 |
+
|
| 22 |
+
class InnerAutoencoder(nn.Module):
|
| 23 |
+
def __init__(self,
|
| 24 |
+
n_features,
|
| 25 |
+
hidden_neurons=(128, 64),
|
| 26 |
+
dropout_rate=0.2,
|
| 27 |
+
batch_norm=True,
|
| 28 |
+
hidden_activation='relu'):
|
| 29 |
+
|
| 30 |
+
# initialize the super class
|
| 31 |
+
super(InnerAutoencoder, self).__init__()
|
| 32 |
+
|
| 33 |
+
# save the default values
|
| 34 |
+
self.n_features = n_features
|
| 35 |
+
self.dropout_rate = dropout_rate
|
| 36 |
+
self.batch_norm = batch_norm
|
| 37 |
+
self.hidden_activation = hidden_activation
|
| 38 |
+
|
| 39 |
+
# create the dimensions for the input and hidden layers
|
| 40 |
+
self.layers_neurons_encoder_ = [self.n_features, *hidden_neurons]
|
| 41 |
+
self.layers_neurons_decoder_ = self.layers_neurons_encoder_[::-1]
|
| 42 |
+
|
| 43 |
+
# get the object for the activations functions
|
| 44 |
+
self.activation = get_activation_by_name(hidden_activation)
|
| 45 |
+
|
| 46 |
+
# initialize encoder and decoder as a sequential
|
| 47 |
+
self.encoder = nn.Sequential()
|
| 48 |
+
self.decoder = nn.Sequential()
|
| 49 |
+
|
| 50 |
+
# fill the encoder sequential with hidden layers
|
| 51 |
+
for idx, layer in enumerate(self.layers_neurons_encoder_[:-1]):
|
| 52 |
+
|
| 53 |
+
# create a linear layer of neurons
|
| 54 |
+
self.encoder.add_module(
|
| 55 |
+
"linear" + str(idx),
|
| 56 |
+
torch.nn.Linear(layer,self.layers_neurons_encoder_[idx + 1]))
|
| 57 |
+
|
| 58 |
+
# add a batch norm per layer if wanted (leave out first layer)
|
| 59 |
+
if batch_norm:
|
| 60 |
+
self.encoder.add_module("batch_norm" + str(idx),
|
| 61 |
+
nn.BatchNorm1d(self.layers_neurons_encoder_[idx + 1]))
|
| 62 |
+
|
| 63 |
+
# create the activation
|
| 64 |
+
self.encoder.add_module(self.hidden_activation + str(idx),
|
| 65 |
+
self.activation)
|
| 66 |
+
|
| 67 |
+
# create a dropout layer
|
| 68 |
+
self.encoder.add_module("dropout" + str(idx),
|
| 69 |
+
torch.nn.Dropout(dropout_rate))
|
| 70 |
+
|
| 71 |
+
# fill the decoder layer
|
| 72 |
+
for idx, layer in enumerate(self.layers_neurons_decoder_[:-1]):
|
| 73 |
+
|
| 74 |
+
# create a linear layer of neurons
|
| 75 |
+
self.decoder.add_module(
|
| 76 |
+
"linear" + str(idx),
|
| 77 |
+
torch.nn.Linear(layer,self.layers_neurons_decoder_[idx + 1]))
|
| 78 |
+
|
| 79 |
+
# create a batch norm per layer if wanted (only if it is not the
|
| 80 |
+
# last layer)
|
| 81 |
+
if batch_norm and idx < len(self.layers_neurons_decoder_[:-1]) - 1:
|
| 82 |
+
self.decoder.add_module("batch_norm" + str(idx),
|
| 83 |
+
nn.BatchNorm1d(self.layers_neurons_decoder_[idx + 1]))
|
| 84 |
+
|
| 85 |
+
# create the activation
|
| 86 |
+
self.decoder.add_module(self.hidden_activation + str(idx),
|
| 87 |
+
self.activation)
|
| 88 |
+
|
| 89 |
+
# create a dropout layer (only if it is not the last layer)
|
| 90 |
+
if idx < len(self.layers_neurons_decoder_[:-1]) - 1:
|
| 91 |
+
self.decoder.add_module("dropout" + str(idx),
|
| 92 |
+
torch.nn.Dropout(dropout_rate))
|
| 93 |
+
|
| 94 |
+
def forward(self, x):
|
| 95 |
+
# we could return the latent representation here after the encoder
|
| 96 |
+
# as the latent representation
|
| 97 |
+
x = self.encoder(x)
|
| 98 |
+
x = self.decoder(x)
|
| 99 |
+
return x
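# Shape walk-through (illustrative comment, not in the original source): with
# n_features=100 (the sliding-window width) and hidden_neurons=(128, 64), the
# encoder maps 100 -> 128 -> 64 and the decoder mirrors it back 64 -> 128 -> 100,
# so the reconstruction has the same width as the input window.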
|
| 100 |
+
|
| 101 |
+
class AutoEncoder(BaseDetector):
|
| 102 |
+
"""Auto Encoder (AE) is a type of neural networks for learning useful data
|
| 103 |
+
representations in an unsupervised manner. Similar to PCA, AE could be used
|
| 104 |
+
to detect outlying objects in the data by calculating the reconstruction
|
| 105 |
+
errors. See :cite:`aggarwal2015outlier` Chapter 3 for details.
|
| 106 |
+
|
| 107 |
+
Notes
|
| 108 |
+
-----
|
| 109 |
+
This is the PyTorch version of AutoEncoder.
|
| 110 |
+
The documentation is not finished!
|
| 111 |
+
|
| 112 |
+
Parameters
|
| 113 |
+
----------
|
| 114 |
+
hidden_neurons : list, optional (default=[64, 32])
|
| 115 |
+
The number of neurons per hidden layers. So the network has the
|
| 116 |
+
structure as [n_features, 64, 32, 32, 64, n_features]
|
| 117 |
+
|
| 118 |
+
hidden_activation : str, optional (default='relu')
|
| 119 |
+
Activation function to use for hidden layers.
|
| 120 |
+
All hidden layers are forced to use the same type of activation.
|
| 121 |
+
See https://pytorch.org/docs/stable/nn.html for details.
|
| 122 |
+
|
| 123 |
+
batch_norm : boolean, optional (default=True)
|
| 124 |
+
Whether to apply Batch Normalization,
|
| 125 |
+
See https://pytorch.org/docs/stable/generated/torch.nn.BatchNorm1d.html
|
| 126 |
+
|
| 127 |
+
learning_rate : float, optional (default=1e-3)
|
| 128 |
+
Learning rate for the optimizer. This learning_rate is given to
|
| 129 |
+
an Adam optimizer (torch.optim.Adam).
|
| 130 |
+
See https://pytorch.org/docs/stable/generated/torch.optim.Adam.html
|
| 131 |
+
|
| 132 |
+
epochs : int, optional (default=100)
|
| 133 |
+
Number of epochs to train the model.
|
| 134 |
+
|
| 135 |
+
batch_size : int, optional (default=32)
|
| 136 |
+
Number of samples per gradient update.
|
| 137 |
+
|
| 138 |
+
dropout_rate : float in (0., 1), optional (default=0.2)
|
| 139 |
+
The dropout to be used across all layers.
|
| 140 |
+
|
| 141 |
+
weight_decay : float, optional (default=1e-5)
|
| 142 |
+
The weight decay for Adam optimizer.
|
| 143 |
+
See https://pytorch.org/docs/stable/generated/torch.optim.Adam.html
|
| 144 |
+
|
| 145 |
+
preprocessing : bool, optional (default=True)
|
| 146 |
+
If True, apply standardization on the data.
|
| 147 |
+
|
| 148 |
+
loss_fn : obj, optional (default=torch.nn.MSELoss)
|
| 149 |
+
Loss function instance which implements torch.nn._Loss.
|
| 150 |
+
One of https://pytorch.org/docs/stable/nn.html#loss-functions
|
| 151 |
+
or a custom loss. Custom losses are currently unstable.
|
| 152 |
+
|
| 153 |
+
verbose : int, optional (default=1)
|
| 154 |
+
Verbosity mode.
|
| 155 |
+
|
| 156 |
+
- 0 = silent
|
| 157 |
+
- 1 = progress bar
|
| 158 |
+
- 2 = one line per epoch.
|
| 159 |
+
|
| 160 |
+
For verbose >= 1, model summary may be printed.
|
| 161 |
+
!CURRENTLY NOT SUPPORTED.!
|
| 162 |
+
|
| 163 |
+
random_state : int, RandomState instance or None, optional
|
| 164 |
+
(default=None)
|
| 165 |
+
If int, random_state is the seed used by the random
|
| 166 |
+
number generator; If RandomState instance, random_state is the random
|
| 167 |
+
number generator; If None, the random number generator is the
|
| 168 |
+
RandomState instance used by `np.random`.
|
| 169 |
+
!CURRENTLY NOT SUPPORTED.!
|
| 170 |
+
|
| 171 |
+
contamination : float in (0., 0.5), optional (default=0.1)
|
| 172 |
+
The amount of contamination of the data set, i.e.
|
| 173 |
+
the proportion of outliers in the data set. When fitting this is used
|
| 174 |
+
to define the threshold on the decision function.
|
| 175 |
+
|
| 176 |
+
Attributes
|
| 177 |
+
----------
|
| 178 |
+
encoding_dim_ : int
|
| 179 |
+
The number of neurons in the encoding layer.
|
| 180 |
+
|
| 181 |
+
compression_rate_ : float
|
| 182 |
+
The ratio between the original feature and
|
| 183 |
+
the number of neurons in the encoding layer.
|
| 184 |
+
|
| 185 |
+
model_ : torch.nn.Module
|
| 186 |
+
The underlying AutoEncoder model (PyTorch).
|
| 187 |
+
|
| 188 |
+
history_ : object
|
| 189 |
+
The AutoEncoder training history.
|
| 190 |
+
|
| 191 |
+
decision_scores_ : numpy array of shape (n_samples,)
|
| 192 |
+
The outlier scores of the training data.
|
| 193 |
+
The higher, the more abnormal. Outliers tend to have higher
|
| 194 |
+
scores. This value is available once the detector is
|
| 195 |
+
fitted.
|
| 196 |
+
|
| 197 |
+
threshold_ : float
|
| 198 |
+
The threshold is based on ``contamination``. It is the
|
| 199 |
+
``n_samples * contamination`` most abnormal samples in
|
| 200 |
+
``decision_scores_``. The threshold is calculated for generating
|
| 201 |
+
binary outlier labels.
|
| 202 |
+
|
| 203 |
+
labels_ : int, either 0 or 1
|
| 204 |
+
The binary labels of the training data. 0 stands for inliers
|
| 205 |
+
and 1 for outliers/anomalies. It is generated by applying
|
| 206 |
+
``threshold_`` on ``decision_scores_``.
|
| 207 |
+
"""
|
| 208 |
+
|
| 209 |
+
def __init__(self,
|
| 210 |
+
slidingWindow=100,
|
| 211 |
+
hidden_neurons=None,
|
| 212 |
+
hidden_activation='relu',
|
| 213 |
+
batch_norm=True,
|
| 214 |
+
learning_rate=1e-3,
|
| 215 |
+
epochs=100,
|
| 216 |
+
batch_size=32,
|
| 217 |
+
dropout_rate=0.2,
|
| 218 |
+
weight_decay=1e-5,
|
| 219 |
+
# validation_size=0.1,
|
| 220 |
+
preprocessing=True,
|
| 221 |
+
loss_fn=None,
|
| 222 |
+
verbose=False,
|
| 223 |
+
# random_state=None,
|
| 224 |
+
contamination=0.1,
|
| 225 |
+
device=None):
|
| 226 |
+
super(AutoEncoder, self).__init__(contamination=contamination)
|
| 227 |
+
|
| 228 |
+
# save the initialization values
|
| 229 |
+
self.slidingWindow = slidingWindow
|
| 230 |
+
self.hidden_neurons = hidden_neurons
|
| 231 |
+
self.hidden_activation = hidden_activation
|
| 232 |
+
self.batch_norm = batch_norm
|
| 233 |
+
self.learning_rate = learning_rate
|
| 234 |
+
self.epochs = epochs
|
| 235 |
+
self.batch_size = batch_size
|
| 236 |
+
self.dropout_rate = dropout_rate
|
| 237 |
+
self.weight_decay = weight_decay
|
| 238 |
+
self.preprocessing = preprocessing
|
| 239 |
+
self.loss_fn = loss_fn
|
| 240 |
+
self.verbose = verbose
|
| 241 |
+
self.device = device
|
| 242 |
+
|
| 243 |
+
# create default loss functions
|
| 244 |
+
if self.loss_fn is None:
|
| 245 |
+
self.loss_fn = torch.nn.MSELoss()
|
| 246 |
+
|
| 247 |
+
# create default calculation device (support GPU if available)
|
| 248 |
+
if self.device is None:
|
| 249 |
+
self.device = torch.device(
|
| 250 |
+
"cuda:0" if torch.cuda.is_available() else "cpu")
|
| 251 |
+
|
| 252 |
+
# default values for the amount of hidden neurons
|
| 253 |
+
if self.hidden_neurons is None:
|
| 254 |
+
self.hidden_neurons = [64, 32]
|
| 255 |
+
|
| 256 |
+
# noinspection PyUnresolvedReferences
|
| 257 |
+
def fit(self, X, y=None):
|
| 258 |
+
"""Fit detector. y is ignored in unsupervised methods.
|
| 259 |
+
|
| 260 |
+
Parameters
|
| 261 |
+
----------
|
| 262 |
+
X : numpy array of shape (n_samples, n_features)
|
| 263 |
+
The input samples.
|
| 264 |
+
|
| 265 |
+
y : Ignored
|
| 266 |
+
Not used, present for API consistency by convention.
|
| 267 |
+
|
| 268 |
+
Returns
|
| 269 |
+
-------
|
| 270 |
+
self : object
|
| 271 |
+
Fitted estimator.
|
| 272 |
+
"""
|
| 273 |
+
n_samples, n_features = X.shape
|
| 274 |
+
|
| 275 |
+
if n_features == 1:
|
| 276 |
+
# Converting time series data into matrix format
|
| 277 |
+
X = Window(window = self.slidingWindow).convert(X)
|
| 278 |
+
|
| 279 |
+
# validate inputs X and y (optional)
|
| 280 |
+
X = check_array(X)
|
| 281 |
+
self._set_n_classes(y)
|
| 282 |
+
|
| 283 |
+
n_samples, n_features = X.shape[0], X.shape[1]
|
| 284 |
+
X = MinMaxScaler(feature_range=(0,1)).fit_transform(X.T).T
|
| 285 |
+
|
| 286 |
+
# conduct standardization if needed
|
| 287 |
+
if self.preprocessing:
|
| 288 |
+
self.mean, self.std = np.mean(X, axis=0), np.std(X, axis=0)
|
| 289 |
+
self.std = np.where(self.std == 0, 1e-8, self.std)
|
| 290 |
+
train_set = TSDataset(X=X, mean=self.mean, std=self.std)
|
| 291 |
+
else:
|
| 292 |
+
train_set = TSDataset(X=X)
|
| 293 |
+
|
| 294 |
+
train_loader = torch.utils.data.DataLoader(train_set, batch_size=self.batch_size, shuffle=True, drop_last=True)
|
| 295 |
+
|
| 296 |
+
# initialize the model
|
| 297 |
+
self.model = InnerAutoencoder(
|
| 298 |
+
n_features=n_features,
|
| 299 |
+
hidden_neurons=self.hidden_neurons,
|
| 300 |
+
dropout_rate=self.dropout_rate,
|
| 301 |
+
batch_norm=self.batch_norm,
|
| 302 |
+
hidden_activation=self.hidden_activation)
|
| 303 |
+
|
| 304 |
+
# move to device and print model information
|
| 305 |
+
self.model = self.model.to(self.device)
|
| 306 |
+
if self.verbose:
|
| 307 |
+
print(self.model)
|
| 308 |
+
|
| 309 |
+
# train the autoencoder to find the best one
|
| 310 |
+
self._train_autoencoder(train_loader)
|
| 311 |
+
|
| 312 |
+
self.model.load_state_dict(self.best_model_dict)
|
| 313 |
+
self.decision_scores_ = self.decision_function(X)
|
| 314 |
+
|
| 315 |
+
self._process_decision_scores()
|
| 316 |
+
return self
|
| 317 |
+
|
| 318 |
+
def _train_autoencoder(self, train_loader):
|
| 319 |
+
"""Internal function to train the autoencoder
|
| 320 |
+
|
| 321 |
+
Parameters
|
| 322 |
+
----------
|
| 323 |
+
train_loader : torch dataloader
|
| 324 |
+
Train data.
|
| 325 |
+
"""
|
| 326 |
+
optimizer = torch.optim.Adam(
|
| 327 |
+
self.model.parameters(), lr=self.learning_rate,
|
| 328 |
+
weight_decay=self.weight_decay)
|
| 329 |
+
|
| 330 |
+
self.best_loss = float('inf')
|
| 331 |
+
self.best_model_dict = None
|
| 332 |
+
|
| 333 |
+
for epoch in range(self.epochs):
|
| 334 |
+
overall_loss = []
|
| 335 |
+
for data, data_idx in train_loader:
|
| 336 |
+
data = data.to(self.device).float()
|
| 337 |
+
loss = self.loss_fn(data, self.model(data))
|
| 338 |
+
|
| 339 |
+
self.model.zero_grad()
|
| 340 |
+
loss.backward()
|
| 341 |
+
optimizer.step()
|
| 342 |
+
overall_loss.append(loss.item())
|
| 343 |
+
if self.verbose:
|
| 344 |
+
print('epoch {epoch}: training loss {train_loss} '.format(
|
| 345 |
+
epoch=epoch, train_loss=np.mean(overall_loss)))
|
| 346 |
+
|
| 347 |
+
# track the best model so far
|
| 348 |
+
if np.mean(overall_loss) <= self.best_loss:
|
| 349 |
+
# print("epoch {ep} is the current best; loss={loss}".format(ep=epoch, loss=np.mean(overall_loss)))
|
| 350 |
+
self.best_loss = np.mean(overall_loss)
|
| 351 |
+
self.best_model_dict = self.model.state_dict()
|
| 352 |
+
|
| 353 |
+
def decision_function(self, X):
|
| 354 |
+
"""Predict raw anomaly score of X using the fitted detector.
|
| 355 |
+
|
| 356 |
+
The anomaly score of an input sample is computed based on different
|
| 357 |
+
detector algorithms. For consistency, outliers are assigned with
|
| 358 |
+
larger anomaly scores.
|
| 359 |
+
|
| 360 |
+
Parameters
|
| 361 |
+
----------
|
| 362 |
+
X : numpy array of shape (n_samples, n_features)
|
| 363 |
+
The training input samples. Sparse matrices are accepted only
|
| 364 |
+
if they are supported by the base estimator.
|
| 365 |
+
|
| 366 |
+
Returns
|
| 367 |
+
-------
|
| 368 |
+
anomaly_scores : numpy array of shape (n_samples,)
|
| 369 |
+
The anomaly score of the input samples.
|
| 370 |
+
"""
|
| 371 |
+
check_is_fitted(self, ['model', 'best_model_dict'])
|
| 372 |
+
|
| 373 |
+
n_samples, n_features = X.shape
|
| 374 |
+
|
| 375 |
+
if n_features == 1:
|
| 376 |
+
# Converting time series data into matrix format
|
| 377 |
+
X = Window(window = self.slidingWindow).convert(X)
|
| 378 |
+
|
| 379 |
+
X = check_array(X)
|
| 380 |
+
X = MinMaxScaler(feature_range=(0,1)).fit_transform(X.T).T
|
| 381 |
+
|
| 382 |
+
# note: shuffle must stay False here so the scores map back to the correct indices
|
| 383 |
+
if self.preprocessing:
|
| 384 |
+
dataset = TSDataset(X=X, mean=self.mean, std=self.std)
|
| 385 |
+
else:
|
| 386 |
+
dataset = TSDataset(X=X)
|
| 387 |
+
|
| 388 |
+
dataloader = torch.utils.data.DataLoader(dataset,
|
| 389 |
+
batch_size=self.batch_size,
|
| 390 |
+
shuffle=False)
|
| 391 |
+
# enable the evaluation mode
|
| 392 |
+
self.model.eval()
|
| 393 |
+
|
| 394 |
+
# construct the vector for holding the reconstruction error
|
| 395 |
+
outlier_scores = np.zeros([X.shape[0], ])
|
| 396 |
+
with torch.no_grad():
|
| 397 |
+
for data, data_idx in dataloader:
|
| 398 |
+
data_cuda = data.to(self.device).float()
|
| 399 |
+
# this is the outlier score
|
| 400 |
+
outlier_scores[data_idx] = pairwise_distances_no_broadcast(
|
| 401 |
+
data, self.model(data_cuda).cpu().numpy())
|
| 402 |
+
|
| 403 |
+
if outlier_scores.shape[0] < n_samples:
|
| 404 |
+
outlier_scores = np.array([outlier_scores[0]]*math.ceil((self.slidingWindow-1)/2) +
|
| 405 |
+
list(outlier_scores) + [outlier_scores[-1]]*((self.slidingWindow-1)//2))
|
| 406 |
+
|
| 407 |
+
return outlier_scores
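# Illustrative usage sketch (not part of the original file; the import path is an
# assumption based on this repository's layout):
#   from models.AE import AutoEncoder
#   clf = AutoEncoder(slidingWindow=100, hidden_neurons=[64, 32], epochs=20)
#   clf.fit(train_series)                         # train_series: (n, 1) numpy array
#   scores = clf.decision_function(test_series)   # higher = more anomalous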
|
models/CBLOF.py
ADDED
|
@@ -0,0 +1,332 @@
|
| 1 |
+
"""
|
| 2 |
+
This function is adapted from [pyod] by [yzhao062]
|
| 3 |
+
Original source: [https://github.com/yzhao062/pyod]
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from __future__ import division
|
| 7 |
+
from __future__ import print_function
|
| 8 |
+
import warnings
|
| 9 |
+
|
| 10 |
+
import numpy as np
|
| 11 |
+
from scipy.spatial.distance import cdist
|
| 12 |
+
from sklearn.cluster import KMeans
|
| 13 |
+
from sklearn.utils import check_array
|
| 14 |
+
from sklearn.utils.validation import check_is_fitted
|
| 15 |
+
from sklearn.utils.estimator_checks import check_estimator
|
| 16 |
+
|
| 17 |
+
from ..utils.stat_models import pairwise_distances_no_broadcast
|
| 18 |
+
from ..utils.utility import check_parameter
|
| 19 |
+
from .base import BaseDetector
|
| 20 |
+
from ..utils.utility import zscore
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
class CBLOF(BaseDetector):
|
| 24 |
+
r"""The CBLOF operator calculates the outlier score based on cluster-based
|
| 25 |
+
local outlier factor.
|
| 26 |
+
|
| 27 |
+
CBLOF takes as an input the data set and the cluster model that was
|
| 28 |
+
generated by a clustering algorithm. It classifies the clusters into small
|
| 29 |
+
clusters and large clusters using the parameters alpha and beta.
|
| 30 |
+
The anomaly score is then calculated based on the size of the cluster the
|
| 31 |
+
point belongs to as well as the distance to the nearest large cluster.
|
| 32 |
+
|
| 33 |
+
Use weighting for outlier factor based on the sizes of the clusters as
|
| 34 |
+
proposed in the original publication. Since this might lead to unexpected
|
| 35 |
+
behavior (outliers close to small clusters are not found), it is disabled
|
| 36 |
+
by default.Outliers scores are solely computed based on their distance to
|
| 37 |
+
the closest large cluster center.
|
| 38 |
+
|
| 39 |
+
By default, kMeans is used as the clustering algorithm instead of the
|
| 40 |
+
Squeezer algorithm mentioned in the original paper for multiple reasons.
|
| 41 |
+
|
| 42 |
+
See :cite:`he2003discovering` for details.
|
| 43 |
+
|
| 44 |
+
Parameters
|
| 45 |
+
----------
|
| 46 |
+
n_clusters : int, optional (default=8)
|
| 47 |
+
The number of clusters to form as well as the number of
|
| 48 |
+
centroids to generate.
|
| 49 |
+
|
| 50 |
+
contamination : float in (0., 0.5), optional (default=0.1)
|
| 51 |
+
The amount of contamination of the data set,
|
| 52 |
+
i.e. the proportion of outliers in the data set. Used when fitting to
|
| 53 |
+
define the threshold on the decision function.
|
| 54 |
+
|
| 55 |
+
clustering_estimator : Estimator, optional (default=None)
|
| 56 |
+
The base clustering algorithm for performing data clustering.
|
| 57 |
+
A valid clustering algorithm should be passed in. The estimator should
|
| 58 |
+
have standard sklearn APIs, fit() and predict(). The estimator should
|
| 59 |
+
have attributes ``labels_`` and ``cluster_centers_``.
|
| 60 |
+
If ``cluster_centers_`` is not in the attributes once the model is fit,
|
| 61 |
+
it is calculated as the mean of the samples in a cluster.
|
| 62 |
+
|
| 63 |
+
If not set, CBLOF uses KMeans for scalability. See
|
| 64 |
+
https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html
|
| 65 |
+
|
| 66 |
+
alpha : float in (0.5, 1), optional (default=0.9)
|
| 67 |
+
Coefficient for deciding small and large clusters. The ratio
|
| 68 |
+
of the number of samples in large clusters to the number of samples in
|
| 69 |
+
small clusters.
|
| 70 |
+
|
| 71 |
+
beta : int or float in (1,), optional (default=5).
|
| 72 |
+
Coefficient for deciding small and large clusters. For a list of
|
| 73 |
+
clusters sorted by size, |C1|, |C2|, ..., |Cn|, beta = |Ck|/|Ck-1|.
|
| 74 |
+
|
| 75 |
+
use_weights : bool, optional (default=False)
|
| 76 |
+
If set to True, the size of clusters are used as weights in
|
| 77 |
+
outlier score calculation.
|
| 78 |
+
|
| 79 |
+
check_estimator : bool, optional (default=False)
|
| 80 |
+
If set to True, check whether the base estimator is consistent with
|
| 81 |
+
sklearn standard.
|
| 82 |
+
|
| 83 |
+
.. warning::
|
| 84 |
+
check_estimator may throw errors with scikit-learn 0.20 above.
|
| 85 |
+
|
| 86 |
+
random_state : int, RandomState or None, optional (default=None)
|
| 87 |
+
If int, random_state is the seed used by the random
|
| 88 |
+
number generator; If RandomState instance, random_state is the random
|
| 89 |
+
number generator; If None, the random number generator is the
|
| 90 |
+
RandomState instance used by `np.random`.
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
Attributes
|
| 94 |
+
----------
|
| 95 |
+
clustering_estimator_ : Estimator, sklearn instance
|
| 96 |
+
Base estimator for clustering.
|
| 97 |
+
|
| 98 |
+
cluster_labels_ : list of shape (n_samples,)
|
| 99 |
+
Cluster assignment for the training samples.
|
| 100 |
+
|
| 101 |
+
n_clusters_ : int
|
| 102 |
+
Actual number of clusters (possibly different from n_clusters).
|
| 103 |
+
|
| 104 |
+
cluster_sizes_ : list of shape (n_clusters_,)
|
| 105 |
+
The size of each cluster once fitted with the training data.
|
| 106 |
+
|
| 107 |
+
decision_scores_ : numpy array of shape (n_samples,)
|
| 108 |
+
The outlier scores of the training data.
|
| 109 |
+
The higher, the more abnormal. Outliers tend to have higher scores.
|
| 110 |
+
This value is available once the detector is fitted.
|
| 111 |
+
|
| 112 |
+
cluster_centers_ : numpy array of shape (n_clusters_, n_features)
|
| 113 |
+
The center of each cluster.
|
| 114 |
+
|
| 115 |
+
small_cluster_labels_ : list of clusters numbers
|
| 116 |
+
The cluster assignments belonging to small clusters.
|
| 117 |
+
|
| 118 |
+
large_cluster_labels_ : list of clusters numbers
|
| 119 |
+
The cluster assignments belonging to large clusters.
|
| 120 |
+
|
| 121 |
+
threshold_ : float
|
| 122 |
+
The threshold is based on ``contamination``. It is the
|
| 123 |
+
``n_samples * contamination`` most abnormal samples in
|
| 124 |
+
``decision_scores_``. The threshold is calculated for generating
|
| 125 |
+
binary outlier labels.
|
| 126 |
+
|
| 127 |
+
labels_ : int, either 0 or 1
|
| 128 |
+
The binary labels of the training data. 0 stands for inliers
|
| 129 |
+
and 1 for outliers/anomalies. It is generated by applying
|
| 130 |
+
``threshold_`` on ``decision_scores_``.
|
| 131 |
+
"""
|
| 132 |
+
|
| 133 |
+
def __init__(self, n_clusters=8, contamination=0.1,
|
| 134 |
+
clustering_estimator=None, alpha=0.9, beta=5,
|
| 135 |
+
use_weights=False, check_estimator=False, random_state=0,
|
| 136 |
+
n_jobs=1, normalize=True):
|
| 137 |
+
super(CBLOF, self).__init__(contamination=contamination)
|
| 138 |
+
self.n_clusters = n_clusters
|
| 139 |
+
self.clustering_estimator = clustering_estimator
|
| 140 |
+
self.alpha = alpha
|
| 141 |
+
self.beta = beta
|
| 142 |
+
self.use_weights = use_weights
|
| 143 |
+
self.check_estimator = check_estimator
|
| 144 |
+
self.random_state = random_state
|
| 145 |
+
self.normalize = normalize
|
| 146 |
+
|
| 147 |
+
# noinspection PyIncorrectDocstring
|
| 148 |
+
def fit(self, X, y=None):
|
| 149 |
+
"""Fit detector. y is ignored in unsupervised methods.
|
| 150 |
+
|
| 151 |
+
Parameters
|
| 152 |
+
----------
|
| 153 |
+
X : numpy array of shape (n_samples, n_features)
|
| 154 |
+
The input samples.
|
| 155 |
+
|
| 156 |
+
y : Ignored
|
| 157 |
+
Not used, present for API consistency by convention.
|
| 158 |
+
|
| 159 |
+
Returns
|
| 160 |
+
-------
|
| 161 |
+
self : object
|
| 162 |
+
Fitted estimator.
|
| 163 |
+
"""
|
| 164 |
+
|
| 165 |
+
# validate inputs X and y (optional)
|
| 166 |
+
X = check_array(X)
|
| 167 |
+
self._set_n_classes(y)
|
| 168 |
+
n_samples, n_features = X.shape
|
| 169 |
+
if self.normalize: X = zscore(X, axis=1, ddof=1)
|
| 170 |
+
|
| 171 |
+
# check parameters
|
| 172 |
+
# number of clusters are default to 8
|
| 173 |
+
self._validate_estimator(default=KMeans(
|
| 174 |
+
n_clusters=self.n_clusters,
|
| 175 |
+
random_state=self.random_state))
|
| 176 |
+
|
| 177 |
+
self.clustering_estimator_.fit(X=X, y=y)
|
| 178 |
+
# Get the labels of the clustering results
|
| 179 |
+
# labels_ is consistent across sklearn clustering algorithms
|
| 180 |
+
self.cluster_labels_ = self.clustering_estimator_.labels_
|
| 181 |
+
self.cluster_sizes_ = np.bincount(self.cluster_labels_)
|
| 182 |
+
|
| 183 |
+
# Get the actual number of clusters
|
| 184 |
+
self.n_clusters_ = self.cluster_sizes_.shape[0]
|
| 185 |
+
|
| 186 |
+
if self.n_clusters_ != self.n_clusters:
|
| 187 |
+
warnings.warn("The chosen clustering for CBLOF forms {0} clusters"
|
| 188 |
+
"which is inconsistent with n_clusters ({1}).".
|
| 189 |
+
format(self.n_clusters_, self.n_clusters))
|
| 190 |
+
|
| 191 |
+
self._set_cluster_centers(X, n_features)
|
| 192 |
+
self._set_small_large_clusters(n_samples)
|
| 193 |
+
|
| 194 |
+
self.decision_scores_ = self._decision_function(X,
|
| 195 |
+
self.cluster_labels_)
|
| 196 |
+
|
| 197 |
+
self._process_decision_scores()
|
| 198 |
+
return self
|
| 199 |
+
|
| 200 |
+
def decision_function(self, X):
|
| 201 |
+
"""Predict raw anomaly score of X using the fitted detector.
|
| 202 |
+
|
| 203 |
+
The anomaly score of an input sample is computed based on different
|
| 204 |
+
detector algorithms. For consistency, outliers are assigned with
|
| 205 |
+
larger anomaly scores.
|
| 206 |
+
|
| 207 |
+
Parameters
|
| 208 |
+
----------
|
| 209 |
+
X : numpy array of shape (n_samples, n_features)
|
| 210 |
+
The training input samples. Sparse matrices are accepted only
|
| 211 |
+
if they are supported by the base estimator.
|
| 212 |
+
|
| 213 |
+
Returns
|
| 214 |
+
-------
|
| 215 |
+
anomaly_scores : numpy array of shape (n_samples,)
|
| 216 |
+
The anomaly score of the input samples.
|
| 217 |
+
"""
|
| 218 |
+
check_is_fitted(self, ['decision_scores_', 'threshold_', 'labels_'])
|
| 219 |
+
X = check_array(X)
|
| 220 |
+
labels = self.clustering_estimator_.predict(X)
|
| 221 |
+
return self._decision_function(X, labels)
|
| 222 |
+
|
| 223 |
+
def _validate_estimator(self, default=None):
|
| 224 |
+
"""Check the value of alpha and beta and clustering algorithm.
|
| 225 |
+
"""
|
| 226 |
+
check_parameter(self.alpha, low=0, high=1, param_name='alpha',
|
| 227 |
+
include_left=False, include_right=False)
|
| 228 |
+
|
| 229 |
+
check_parameter(self.beta, low=1, param_name='beta',
|
| 230 |
+
include_left=False)
|
| 231 |
+
|
| 232 |
+
if self.clustering_estimator is not None:
|
| 233 |
+
self.clustering_estimator_ = self.clustering_estimator
|
| 234 |
+
else:
|
| 235 |
+
self.clustering_estimator_ = default
|
| 236 |
+
|
| 237 |
+
# make sure the base clustering algorithm is valid
|
| 238 |
+
if self.clustering_estimator_ is None:
|
| 239 |
+
raise ValueError("clustering algorithm cannot be None")
|
| 240 |
+
|
| 241 |
+
if self.check_estimator:
|
| 242 |
+
check_estimator(self.clustering_estimator_)
|
| 243 |
+
|
| 244 |
+
def _set_cluster_centers(self, X, n_features):
|
| 245 |
+
# Noted not all clustering algorithms have cluster_centers_
|
| 246 |
+
if hasattr(self.clustering_estimator_, 'cluster_centers_'):
|
| 247 |
+
self.cluster_centers_ = self.clustering_estimator_.cluster_centers_
|
| 248 |
+
else:
|
| 249 |
+
# Set the cluster center as the mean of all the samples within
|
| 250 |
+
# the cluster
|
| 251 |
+
warnings.warn("The chosen clustering for CBLOF does not have"
|
| 252 |
+
"the center of clusters. Calculate the center"
|
| 253 |
+
"as the mean of the clusters.")
|
| 254 |
+
self.cluster_centers_ = np.zeros([self.n_clusters_, n_features])
|
| 255 |
+
for i in range(self.n_clusters_):
|
| 256 |
+
self.cluster_centers_[i, :] = np.mean(
|
| 257 |
+
X[np.where(self.cluster_labels_ == i)], axis=0)
|
| 258 |
+
|
| 259 |
+
def _set_small_large_clusters(self, n_samples):
|
| 260 |
+
# Sort the index of clusters by the number of samples belonging to it
|
| 261 |
+
size_clusters = np.bincount(self.cluster_labels_)
|
| 262 |
+
|
| 263 |
+
# Sort the order from the largest to the smallest
|
| 264 |
+
sorted_cluster_indices = np.argsort(size_clusters * -1)
|
| 265 |
+
|
| 266 |
+
# Initialize the lists of index that fulfill the requirements by
|
| 267 |
+
# either alpha or beta
|
| 268 |
+
alpha_list = []
|
| 269 |
+
beta_list = []
|
| 270 |
+
|
| 271 |
+
for i in range(1, self.n_clusters_):
|
| 272 |
+
temp_sum = np.sum(size_clusters[sorted_cluster_indices[:i]])
|
| 273 |
+
if temp_sum >= n_samples * self.alpha:
|
| 274 |
+
alpha_list.append(i)
|
| 275 |
+
|
| 276 |
+
if size_clusters[sorted_cluster_indices[i - 1]] / size_clusters[
|
| 277 |
+
sorted_cluster_indices[i]] >= self.beta:
|
| 278 |
+
beta_list.append(i)
|
| 279 |
+
|
| 280 |
+
# Find the separation index fulfills both alpha and beta
|
| 281 |
+
intersection = np.intersect1d(alpha_list, beta_list)
|
| 282 |
+
|
| 283 |
+
if len(intersection) > 0:
|
| 284 |
+
self._clustering_threshold = intersection[0]
|
| 285 |
+
elif len(alpha_list) > 0:
|
| 286 |
+
self._clustering_threshold = alpha_list[0]
|
| 287 |
+
elif len(beta_list) > 0:
|
| 288 |
+
self._clustering_threshold = beta_list[0]
|
| 289 |
+
else:
|
| 290 |
+
raise ValueError("Could not form valid cluster separation. Please "
|
| 291 |
+
"change n_clusters or change clustering method")
|
| 292 |
+
|
| 293 |
+
self.small_cluster_labels_ = sorted_cluster_indices[
|
| 294 |
+
self._clustering_threshold:]
|
| 295 |
+
self.large_cluster_labels_ = sorted_cluster_indices[
|
| 296 |
+
0:self._clustering_threshold]
|
| 297 |
+
|
| 298 |
+
# No need to calculate small cluster center
|
| 299 |
+
# self.small_cluster_centers_ = self.cluster_centers_[
|
| 300 |
+
# self.small_cluster_labels_]
|
| 301 |
+
|
| 302 |
+
self._large_cluster_centers = self.cluster_centers_[
|
| 303 |
+
self.large_cluster_labels_]
|
| 304 |
+
|
| 305 |
+
def _decision_function(self, X, labels):
|
| 306 |
+
# Initialize the score array
|
| 307 |
+
scores = np.zeros([X.shape[0], ])
|
| 308 |
+
|
| 309 |
+
small_indices = np.where(
|
| 310 |
+
np.isin(labels, self.small_cluster_labels_))[0]
|
| 311 |
+
large_indices = np.where(
|
| 312 |
+
np.isin(labels, self.large_cluster_labels_))[0]
|
| 313 |
+
|
| 314 |
+
if small_indices.shape[0] != 0:
|
| 315 |
+
# Calculate the outlier factor for the samples in small clusters
|
| 316 |
+
dist_to_large_center = cdist(X[small_indices, :],
|
| 317 |
+
self._large_cluster_centers)
|
| 318 |
+
|
| 319 |
+
scores[small_indices] = np.min(dist_to_large_center, axis=1)
|
| 320 |
+
|
| 321 |
+
if large_indices.shape[0] != 0:
|
| 322 |
+
# Calculate the outlier factor for the samples in large clusters
|
| 323 |
+
large_centers = self.cluster_centers_[labels[large_indices]]
|
| 324 |
+
|
| 325 |
+
scores[large_indices] = pairwise_distances_no_broadcast(
|
| 326 |
+
X[large_indices, :], large_centers)
|
| 327 |
+
|
| 328 |
+
if self.use_weights:
|
| 329 |
+
# Weights are calculated as the number of elements in the cluster
|
| 330 |
+
scores = scores * self.cluster_sizes_[labels]
|
| 331 |
+
|
| 332 |
+
return scores.ravel()
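# Illustrative usage sketch (not part of the original file; the import path is an
# assumption based on this repository's layout):
#   from models.CBLOF import CBLOF
#   clf = CBLOF(n_clusters=8, alpha=0.9, beta=5)
#   clf.fit(X)                                    # X: (n_samples, n_features) array
#   scores = clf.decision_scores_                 # distance-based outlier scores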
|
models/CNN.py
ADDED
|
@@ -0,0 +1,273 @@
|
| 1 |
+
from typing import Dict
|
| 2 |
+
import torchinfo
|
| 3 |
+
import tqdm, math
|
| 4 |
+
import numpy as np
|
| 5 |
+
import torch
|
| 6 |
+
from torch import nn, optim
|
| 7 |
+
from torch.utils.data import DataLoader
|
| 8 |
+
|
| 9 |
+
from ..utils.utility import get_activation_by_name
|
| 10 |
+
from ..utils.torch_utility import EarlyStoppingTorch, get_gpu
|
| 11 |
+
from ..utils.dataset import ForecastDataset
|
| 12 |
+
|
| 13 |
+
class AdaptiveConcatPool1d(nn.Module):
|
| 14 |
+
def __init__(self):
|
| 15 |
+
super().__init__()
|
| 16 |
+
self.ap = torch.nn.AdaptiveAvgPool1d(1)
|
| 17 |
+
self.mp = torch.nn.AdaptiveMaxPool1d(1)
|
| 18 |
+
|
| 19 |
+
def forward(self, x):
|
| 20 |
+
return torch.cat([self.ap(x), self.mp(x)], 1)
|
| 21 |
+
|
| 22 |
+
class CNNModel(nn.Module):
|
| 23 |
+
def __init__(self,
|
| 24 |
+
n_features,
|
| 25 |
+
num_channel=[32, 32, 40],
|
| 26 |
+
kernel_size=3,
|
| 27 |
+
stride=1,
|
| 28 |
+
predict_time_steps=1,
|
| 29 |
+
dropout_rate=0.25,
|
| 30 |
+
hidden_activation='relu',
|
| 31 |
+
device='cpu'):
|
| 32 |
+
|
| 33 |
+
# initialize the super class
|
| 34 |
+
super(CNNModel, self).__init__()
|
| 35 |
+
|
| 36 |
+
# save the default values
|
| 37 |
+
self.n_features = n_features
|
| 38 |
+
self.dropout_rate = dropout_rate
|
| 39 |
+
self.hidden_activation = hidden_activation
|
| 40 |
+
self.kernel_size = kernel_size
|
| 41 |
+
self.stride = stride
|
| 42 |
+
self.predict_time_steps = predict_time_steps
|
| 43 |
+
self.num_channel = num_channel
|
| 44 |
+
self.device = device
|
| 45 |
+
|
| 46 |
+
# get the object for the activations functions
|
| 47 |
+
self.activation = get_activation_by_name(hidden_activation)
|
| 48 |
+
|
| 49 |
+
# initialize encoder and decoder as a sequential
|
| 50 |
+
self.conv_layers = nn.Sequential()
|
| 51 |
+
prev_channels = self.n_features
|
| 52 |
+
|
| 53 |
+
for idx, out_channels in enumerate(self.num_channel[:-1]):
|
| 54 |
+
self.conv_layers.add_module(
|
| 55 |
+
"conv" + str(idx),
|
| 56 |
+
torch.nn.Conv1d(prev_channels, self.num_channel[idx + 1],
|
| 57 |
+
self.kernel_size, self.stride))
|
| 58 |
+
self.conv_layers.add_module(self.hidden_activation + str(idx),
|
| 59 |
+
self.activation)
|
| 60 |
+
self.conv_layers.add_module("pool" + str(idx), nn.MaxPool1d(kernel_size=2))
|
| 61 |
+
prev_channels = out_channels
|
| 62 |
+
|
| 63 |
+
self.fc = nn.Sequential(
|
| 64 |
+
AdaptiveConcatPool1d(),
|
| 65 |
+
torch.nn.Flatten(),
|
| 66 |
+
torch.nn.Linear(2*self.num_channel[-1], self.num_channel[-1]),
|
| 67 |
+
torch.nn.ReLU(),
|
| 68 |
+
torch.nn.Dropout(dropout_rate),
|
| 69 |
+
torch.nn.Linear(self.num_channel[-1], self.n_features)
|
| 70 |
+
)
|
| 71 |
+
|
| 72 |
+
def forward(self, x):
|
| 73 |
+
b, l, c = x.shape
|
| 74 |
+
x = x.view(b, c, l)
|
| 75 |
+
x = self.conv_layers(x) # [128, feature, 23]
|
| 76 |
+
|
| 77 |
+
outputs = torch.zeros(self.predict_time_steps, b, self.n_features).to(self.device)
|
| 78 |
+
for t in range(self.predict_time_steps):
|
| 79 |
+
decoder_input = self.fc(x)
|
| 80 |
+
outputs[t] = torch.squeeze(decoder_input, dim=-2)
|
| 81 |
+
|
| 82 |
+
return outputs
|
| 83 |
+
|
| 84 |
+
class CNN():
|
| 85 |
+
def __init__(self,
|
| 86 |
+
window_size=100,
|
| 87 |
+
pred_len=1,
|
| 88 |
+
batch_size=128,
|
| 89 |
+
epochs=50,
|
| 90 |
+
lr=0.0008,
|
| 91 |
+
feats=1,
|
| 92 |
+
num_channel=[32, 32, 40],
|
| 93 |
+
validation_size=0.2):
|
| 94 |
+
super().__init__()
|
| 95 |
+
self.__anomaly_score = None
|
| 96 |
+
|
| 97 |
+
cuda = True
|
| 98 |
+
self.y_hats = None
|
| 99 |
+
|
| 100 |
+
self.cuda = cuda
|
| 101 |
+
self.device = get_gpu(self.cuda)
|
| 102 |
+
|
| 103 |
+
self.window_size = window_size
|
| 104 |
+
self.pred_len = pred_len
|
| 105 |
+
self.batch_size = batch_size
|
| 106 |
+
self.epochs = epochs
|
| 107 |
+
|
| 108 |
+
self.feats = feats
|
| 109 |
+
self.num_channel = num_channel
|
| 110 |
+
self.lr = lr
|
| 111 |
+
self.validation_size = validation_size
|
| 112 |
+
|
| 113 |
+
self.model = CNNModel(n_features=feats, num_channel=num_channel, predict_time_steps=self.pred_len, device=self.device).to(self.device)
|
| 114 |
+
|
| 115 |
+
self.optimizer = optim.Adam(self.model.parameters(), lr=lr)
|
| 116 |
+
self.scheduler = optim.lr_scheduler.StepLR(self.optimizer, step_size=5, gamma=0.75)
|
| 117 |
+
self.loss = nn.MSELoss()
|
| 118 |
+
self.save_path = None
|
| 119 |
+
self.early_stopping = EarlyStoppingTorch(save_path=self.save_path, patience=3)
|
| 120 |
+
|
| 121 |
+
self.mu = None
|
| 122 |
+
self.sigma = None
|
| 123 |
+
self.eps = 1e-10
|
| 124 |
+
|
| 125 |
+
def fit(self, data):
|
| 126 |
+
tsTrain = data[:int((1-self.validation_size)*len(data))]
|
| 127 |
+
tsValid = data[int((1-self.validation_size)*len(data)):]
|
| 128 |
+
|
| 129 |
+
train_loader = DataLoader(
|
| 130 |
+
ForecastDataset(tsTrain, window_size=self.window_size, pred_len=self.pred_len),
|
| 131 |
+
batch_size=self.batch_size,
|
| 132 |
+
shuffle=True)
|
| 133 |
+
|
| 134 |
+
valid_loader = DataLoader(
|
| 135 |
+
ForecastDataset(tsValid, window_size=self.window_size, pred_len=self.pred_len),
|
| 136 |
+
batch_size=self.batch_size,
|
| 137 |
+
shuffle=False)
|
| 138 |
+
|
| 139 |
+
for epoch in range(1, self.epochs + 1):
|
| 140 |
+
self.model.train(mode=True)
|
| 141 |
+
avg_loss = 0
|
| 142 |
+
loop = tqdm.tqdm(enumerate(train_loader),total=len(train_loader),leave=True)
|
| 143 |
+
for idx, (x, target) in loop:
|
| 144 |
+
x, target = x.to(self.device), target.to(self.device)
|
| 145 |
+
|
| 146 |
+
# print('x: ', x.shape) # (bs, win, feat)
|
| 147 |
+
# print('target: ', target.shape) # # (bs, pred_len, feat)
|
| 148 |
+
# print('len(tsTrain): ', len(tsTrain))
|
| 149 |
+
# print('len(train_loader): ', len(train_loader))
|
| 150 |
+
|
| 151 |
+
self.optimizer.zero_grad()
|
| 152 |
+
|
| 153 |
+
output = self.model(x)
|
| 154 |
+
output = output.view(-1, self.feats*self.pred_len)
|
| 155 |
+
target = target.view(-1, self.feats*self.pred_len)
|
| 156 |
+
|
| 157 |
+
loss = self.loss(output, target)
|
| 158 |
+
loss.backward()
|
| 159 |
+
|
| 160 |
+
self.optimizer.step()
|
| 161 |
+
|
| 162 |
+
avg_loss += loss.cpu().item()
|
| 163 |
+
loop.set_description(f'Training Epoch [{epoch}/{self.epochs}]')
|
| 164 |
+
loop.set_postfix(loss=loss.item(), avg_loss=avg_loss/(idx+1))
|
| 165 |
+
|
| 166 |
+
|
| 167 |
+
self.model.eval()
|
| 168 |
+
scores = []
|
| 169 |
+
avg_loss = 0
|
| 170 |
+
loop = tqdm.tqdm(enumerate(valid_loader),total=len(valid_loader),leave=True)
|
| 171 |
+
with torch.no_grad():
|
| 172 |
+
for idx, (x, target) in loop:
|
| 173 |
+
x, target = x.to(self.device), target.to(self.device)
|
| 174 |
+
|
| 175 |
+
output = self.model(x)
|
| 176 |
+
|
| 177 |
+
output = output.view(-1, self.feats*self.pred_len)
|
| 178 |
+
target = target.view(-1, self.feats*self.pred_len)
|
| 179 |
+
|
| 180 |
+
loss = self.loss(output, target)
|
| 181 |
+
avg_loss += loss.cpu().item()
|
| 182 |
+
loop.set_description(f'Validation Epoch [{epoch}/{self.epochs}]')
|
| 183 |
+
loop.set_postfix(loss=loss.item(), avg_loss=avg_loss/(idx+1))
|
| 184 |
+
|
| 185 |
+
mse = torch.sub(output, target).pow(2)
|
| 186 |
+
scores.append(mse.cpu())
|
| 187 |
+
|
| 188 |
+
|
| 189 |
+
valid_loss = avg_loss/max(len(valid_loader), 1)
|
| 190 |
+
self.scheduler.step()
|
| 191 |
+
|
| 192 |
+
self.early_stopping(valid_loss, self.model)
|
| 193 |
+
if self.early_stopping.early_stop or epoch == self.epochs - 1:
|
| 194 |
+
# fitting Gaussian Distribution
|
| 195 |
+
if len(scores) > 0:
|
| 196 |
+
scores = torch.cat(scores, dim=0)
|
| 197 |
+
self.mu = torch.mean(scores)
|
| 198 |
+
self.sigma = torch.var(scores)
|
| 199 |
+
print(self.mu.size(), self.sigma.size())
|
| 200 |
+
if self.early_stopping.early_stop:
|
| 201 |
+
print(" Early stopping<<<")
|
| 202 |
+
break
|
| 203 |
+
|
| 204 |
+
def decision_function(self, data):
|
| 205 |
+
test_loader = DataLoader(
|
| 206 |
+
ForecastDataset(data, window_size=self.window_size, pred_len=self.pred_len),
|
| 207 |
+
batch_size=self.batch_size,
|
| 208 |
+
shuffle=False
|
| 209 |
+
)
|
| 210 |
+
|
| 211 |
+
self.model.eval()
|
| 212 |
+
scores = []
|
| 213 |
+
y_hats = []
|
| 214 |
+
loop = tqdm.tqdm(enumerate(test_loader),total=len(test_loader),leave=True)
|
| 215 |
+
with torch.no_grad():
|
| 216 |
+
for idx, (x, target) in loop:
|
| 217 |
+
x, target = x.to(self.device), target.to(self.device)
|
| 218 |
+
output = self.model(x)
|
| 219 |
+
|
| 220 |
+
output = output.view(-1, self.feats*self.pred_len)
|
| 221 |
+
target = target.view(-1, self.feats*self.pred_len)
|
| 222 |
+
|
| 223 |
+
mse = torch.sub(output, target).pow(2)
|
| 224 |
+
|
| 225 |
+
y_hats.append(output.cpu())
|
| 226 |
+
scores.append(mse.cpu())
|
| 227 |
+
loop.set_description(f'Testing: ')
|
| 228 |
+
|
| 229 |
+
scores = torch.cat(scores, dim=0)
|
| 230 |
+
# scores = 0.5 * (torch.log(self.sigma + self.eps) + (scores - self.mu)**2 / (self.sigma+self.eps))
|
| 231 |
+
|
| 232 |
+
scores = scores.numpy()
|
| 233 |
+
scores = np.mean(scores, axis=1)
|
| 234 |
+
|
| 235 |
+
y_hats = torch.cat(y_hats, dim=0)
|
| 236 |
+
y_hats = y_hats.numpy()
|
| 237 |
+
|
| 238 |
+
l, w = y_hats.shape
|
| 239 |
+
|
| 240 |
+
# new_scores = np.zeros((l - self.pred_len, w))
|
| 241 |
+
# for i in range(w):
|
| 242 |
+
# new_scores[:, i] = scores[self.pred_len - i:l-i, i]
|
| 243 |
+
# scores = np.mean(new_scores, axis=1)
|
| 244 |
+
# scores = np.pad(scores, (0, self.pred_len - 1), 'constant', constant_values=(0,0))
|
| 245 |
+
|
| 246 |
+
# new_y_hats = np.zeros((l - self.pred_len, w))
|
| 247 |
+
# for i in range(w):
|
| 248 |
+
# new_y_hats[:, i] = y_hats[self.pred_len - i:l-i, i]
|
| 249 |
+
# y_hats = np.mean(new_y_hats, axis=1)
|
| 250 |
+
# y_hats = np.pad(y_hats, (0, self.pred_len - 1), 'constant',constant_values=(0,0))
|
| 251 |
+
|
| 252 |
+
assert scores.ndim == 1
|
| 253 |
+
# self.y_hats = y_hats
|
| 254 |
+
|
| 255 |
+
print('scores: ', scores.shape)
|
| 256 |
+
if scores.shape[0] < len(data):
|
| 257 |
+
padded_decision_scores_ = np.zeros(len(data))
|
| 258 |
+
padded_decision_scores_[: self.window_size+self.pred_len-1] = scores[0]
|
| 259 |
+
padded_decision_scores_[self.window_size+self.pred_len-1 : ] = scores
|
| 260 |
+
|
| 261 |
+
self.__anomaly_score = padded_decision_scores_
|
| 262 |
+
return padded_decision_scores_
|
| 263 |
+
|
| 264 |
+
def anomaly_score(self) -> np.ndarray:
|
| 265 |
+
return self.__anomaly_score
|
| 266 |
+
|
| 267 |
+
def get_y_hat(self) -> np.ndarray:
|
| 268 |
+
return self.y_hats
|
| 269 |
+
|
| 270 |
+
def param_statistic(self, save_file):
|
| 271 |
+
model_stats = torchinfo.summary(self.model, (self.batch_size, self.window_size, self.feats), verbose=0)
|
| 272 |
+
with open(save_file, 'w') as f:
|
| 273 |
+
f.write(str(model_stats))
|
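A rough usage sketch for the forecasting-based CNN detector above: it is trained to predict the next value(s) of each window and uses the squared forecast error as the anomaly score. The import path is an assumption (the class lives in this repository's models package and uses relative imports), and the data and parameters are illustrative only.

import numpy as np
from models.CNN import CNN  # assumed import path; adjust to how this package is installed

rng = np.random.default_rng(0)
data = np.sin(np.linspace(0, 60, 3000)).reshape(-1, 1) + 0.05 * rng.standard_normal((3000, 1))
data[1500:1510] += 3.0  # inject a short anomaly

detector = CNN(window_size=100, pred_len=1, feats=1, epochs=5)
detector.fit(data)                          # trains on a forecasting objective
scores = detector.decision_function(data)   # per-timestep squared forecast error, padded to len(data)
print(scores.shape, scores[1495:1515].round(3))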
models/COF.py
ADDED
|
@@ -0,0 +1,211 @@
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
"""
|
| 3 |
+
This function is adapted from [pyod] by [yzhao062]
|
| 4 |
+
Original source: [https://github.com/yzhao062/pyod]
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from __future__ import division
|
| 8 |
+
from __future__ import print_function
|
| 9 |
+
|
| 10 |
+
import warnings
|
| 11 |
+
from operator import itemgetter
|
| 12 |
+
|
| 13 |
+
import numpy as np
|
| 14 |
+
from scipy.spatial import distance_matrix
|
| 15 |
+
from scipy.spatial import minkowski_distance
|
| 16 |
+
from sklearn.utils import check_array
|
| 17 |
+
|
| 18 |
+
from .base import BaseDetector
|
| 19 |
+
from ..utils.utility import check_parameter
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
class COF(BaseDetector):
|
| 23 |
+
"""Connectivity-Based Outlier Factor (COF) COF uses the ratio of average
|
| 24 |
+
chaining distance of data point and the average of average chaining
|
| 25 |
+
distance of k nearest neighbor of the data point, as the outlier score
|
| 26 |
+
for observations.
|
| 27 |
+
|
| 28 |
+
See :cite:`tang2002enhancing` for details.
|
| 29 |
+
|
| 30 |
+
Two versions of COF are supported:
|
| 31 |
+
|
| 32 |
+
- Fast COF: computes the entire pairwise distance matrix at the cost of a
|
| 33 |
+
O(n^2) memory requirement.
|
| 34 |
+
- Memory efficient COF: calculates pairwise distances incrementally.
|
| 35 |
+
Use this implementation when it is not feasible to fit the n-by-n
|
| 36 |
+
distance in memory. This leads to a linear overhead because many
|
| 37 |
+
distances will have to be recalculated.
|
| 38 |
+
|
| 39 |
+
Parameters
|
| 40 |
+
----------
|
| 41 |
+
contamination : float in (0., 0.5), optional (default=0.1)
|
| 42 |
+
The amount of contamination of the data set, i.e.
|
| 43 |
+
the proportion of outliers in the data set. Used when fitting to
|
| 44 |
+
define the threshold on the decision function.
|
| 45 |
+
|
| 46 |
+
n_neighbors : int, optional (default=20)
|
| 47 |
+
Number of neighbors to use by default for k neighbors queries.
|
| 48 |
+
Note that n_neighbors should be less than the number of samples.
|
| 49 |
+
If n_neighbors is larger than the number of samples provided,
|
| 50 |
+
all samples will be used.
|
| 51 |
+
|
| 52 |
+
method : string, optional (default='fast')
|
| 53 |
+
Valid values for method are:
|
| 54 |
+
|
| 55 |
+
- 'fast' Fast COF, computes the full pairwise distance matrix up front.
|
| 56 |
+
- 'memory' Memory-efficient COF, computes pairwise distances only when
|
| 57 |
+
needed at the cost of computational speed.
|
| 58 |
+
|
| 59 |
+
Attributes
|
| 60 |
+
----------
|
| 61 |
+
decision_scores_ : numpy array of shape (n_samples,)
|
| 62 |
+
The outlier scores of the training data.
|
| 63 |
+
The higher, the more abnormal. Outliers tend to have higher
|
| 64 |
+
scores. This value is available once the detector is
|
| 65 |
+
fitted.
|
| 66 |
+
|
| 67 |
+
threshold_ : float
|
| 68 |
+
The threshold is based on ``contamination``. It is the
|
| 69 |
+
``n_samples * contamination`` most abnormal samples in
|
| 70 |
+
``decision_scores_``. The threshold is calculated for generating
|
| 71 |
+
binary outlier labels.
|
| 72 |
+
|
| 73 |
+
labels_ : int, either 0 or 1
|
| 74 |
+
The binary labels of the training data. 0 stands for inliers
|
| 75 |
+
and 1 for outliers/anomalies. It is generated by applying
|
| 76 |
+
``threshold_`` on ``decision_scores_``.
|
| 77 |
+
|
| 78 |
+
n_neighbors_: int
|
| 79 |
+
Number of neighbors to use by default for k neighbors queries.
|
| 80 |
+
"""
|
| 81 |
+
|
| 82 |
+
def __init__(self, contamination=0.1, n_neighbors=20, method="fast"):
|
| 83 |
+
super(COF, self).__init__(contamination=contamination)
|
| 84 |
+
if isinstance(n_neighbors, int):
|
| 85 |
+
check_parameter(n_neighbors, low=1, param_name='n_neighbors')
|
| 86 |
+
else:
|
| 87 |
+
raise TypeError(
|
| 88 |
+
"n_neighbors should be int. Got %s" % type(n_neighbors))
|
| 89 |
+
self.n_neighbors = n_neighbors
|
| 90 |
+
self.method = method
|
| 91 |
+
|
| 92 |
+
def fit(self, X, y=None):
|
| 93 |
+
"""Fit detector. y is ignored in unsupervised methods.
|
| 94 |
+
|
| 95 |
+
Parameters
|
| 96 |
+
----------
|
| 97 |
+
X : numpy array of shape (n_samples, n_features)
|
| 98 |
+
The input samples.
|
| 99 |
+
|
| 100 |
+
y : Ignored
|
| 101 |
+
Not used, present for API consistency by convention.
|
| 102 |
+
|
| 103 |
+
Returns
|
| 104 |
+
-------
|
| 105 |
+
self : object
|
| 106 |
+
Fitted estimator.
|
| 107 |
+
"""
|
| 108 |
+
X = check_array(X)
|
| 109 |
+
self.n_train_ = X.shape[0]
|
| 110 |
+
self.n_neighbors_ = self.n_neighbors
|
| 111 |
+
|
| 112 |
+
if self.n_neighbors_ >= self.n_train_:
|
| 113 |
+
self.n_neighbors_ = self.n_train_ - 1
|
| 114 |
+
warnings.warn(
|
| 115 |
+
"n_neighbors is set to the number of training points "
|
| 116 |
+
"minus 1: {0}".format(self.n_neighbors_))
|
| 117 |
+
|
| 118 |
+
check_parameter(self.n_neighbors_, 1, self.n_train_,
|
| 119 |
+
include_left=True, include_right=True)
|
| 120 |
+
|
| 121 |
+
self._set_n_classes(y)
|
| 122 |
+
self.decision_scores_ = self.decision_function(X)
|
| 123 |
+
self._process_decision_scores()
|
| 124 |
+
|
| 125 |
+
return self
|
| 126 |
+
|
| 127 |
+
def decision_function(self, X):
|
| 128 |
+
"""Predict raw anomaly score of X using the fitted detector.
|
| 129 |
+
The anomaly score of an input sample is computed based on different
|
| 130 |
+
detector algorithms. For consistency, outliers are assigned with
|
| 131 |
+
larger anomaly scores.
|
| 132 |
+
|
| 133 |
+
Parameters
|
| 134 |
+
----------
|
| 135 |
+
X : numpy array of shape (n_samples, n_features)
|
| 136 |
+
The training input samples. Sparse matrices are accepted only
|
| 137 |
+
if they are supported by the base estimator.
|
| 138 |
+
|
| 139 |
+
Returns
|
| 140 |
+
-------
|
| 141 |
+
anomaly_scores : numpy array of shape (n_samples,)
|
| 142 |
+
The anomaly score of the input samples.
|
| 143 |
+
"""
|
| 144 |
+
if self.method.lower() == "fast":
|
| 145 |
+
return self._cof_fast(X)
|
| 146 |
+
elif self.method.lower() == "memory":
|
| 147 |
+
return self._cof_memory(X)
|
| 148 |
+
else:
|
| 149 |
+
raise ValueError("method should be set to either \'fast\' or \'memory\'. Got %s" % self.method)
|
| 150 |
+
|
| 151 |
+
def _cof_memory(self, X):
|
| 152 |
+
"""
|
| 153 |
+
Connectivity-Based Outlier Factor (COF) Algorithm
|
| 154 |
+
This function is called internally to calculate the
|
| 155 |
+
Connectivity-Based Outlier Factor (COF) as an outlier
|
| 156 |
+
score for observations.
|
| 157 |
+
This function uses a memory efficient implementation at the cost of
|
| 158 |
+
speed.
|
| 159 |
+
:return: numpy array containing COF scores for observations.
|
| 160 |
+
The greater the COF, the greater the outlierness.
|
| 161 |
+
"""
|
| 162 |
+
#dist_matrix = np.array(distance_matrix(X, X))
|
| 163 |
+
sbn_path_index = np.zeros((X.shape[0],self.n_neighbors_), dtype=np.int64)
|
| 164 |
+
ac_dist, cof_ = np.zeros((X.shape[0])), np.zeros((X.shape[0]))
|
| 165 |
+
for i in range(X.shape[0]):
|
| 166 |
+
#sbn_path = np.argsort(dist_matrix[i])
|
| 167 |
+
sbn_path = np.argsort(minkowski_distance(X[i,:],X,p=2))
|
| 168 |
+
sbn_path_index[i,:] = sbn_path[1: self.n_neighbors_ + 1]
|
| 169 |
+
cost_desc = np.zeros((self.n_neighbors_))
|
| 170 |
+
for j in range(self.n_neighbors_):
|
| 171 |
+
#cost_desc.append(
|
| 172 |
+
# np.min(dist_matrix[sbn_path[j + 1]][sbn_path][:j + 1]))
|
| 173 |
+
cost_desc[j] = np.min(minkowski_distance(X[sbn_path[j + 1]],X,p=2)[sbn_path][:j + 1])
|
| 174 |
+
acd = np.zeros((self.n_neighbors_))
|
| 175 |
+
for _h, cost_ in enumerate(cost_desc):
|
| 176 |
+
neighbor_add1 = self.n_neighbors_ + 1
|
| 177 |
+
acd[_h] = ((2. * (neighbor_add1 - (_h + 1))) / (neighbor_add1 * self.n_neighbors_)) * cost_
|
| 178 |
+
ac_dist[i] = np.sum(acd)
|
| 179 |
+
for _g in range(X.shape[0]):
|
| 180 |
+
cof_[_g] = (ac_dist[_g] * self.n_neighbors_) / np.sum(ac_dist[sbn_path_index[_g]])
|
| 181 |
+
return np.nan_to_num(cof_)
|
| 182 |
+
|
| 183 |
+
def _cof_fast(self, X):
|
| 184 |
+
"""
|
| 185 |
+
Connectivity-Based Outlier Factor (COF) Algorithm
|
| 186 |
+
This function is called internally to calculate the
|
| 187 |
+
Connectivity-Based Outlier Factor (COF) as an outlier
|
| 188 |
+
score for observations.
|
| 189 |
+
This function uses a fast implementation at the cost of memory.
|
| 190 |
+
:return: numpy array containing COF scores for observations.
|
| 191 |
+
The greater the COF, the greater the outlierness.
|
| 192 |
+
"""
|
| 193 |
+
dist_matrix = np.array(distance_matrix(X, X))
|
| 194 |
+
sbn_path_index, ac_dist, cof_ = [], [], []
|
| 195 |
+
for i in range(X.shape[0]):
|
| 196 |
+
sbn_path = np.argsort(dist_matrix[i])
|
| 197 |
+
sbn_path_index.append(sbn_path[1: self.n_neighbors_ + 1])
|
| 198 |
+
cost_desc = []
|
| 199 |
+
for j in range(self.n_neighbors_):
|
| 200 |
+
cost_desc.append(
|
| 201 |
+
np.min(dist_matrix[sbn_path[j + 1]][sbn_path][:j + 1]))
|
| 202 |
+
acd = []
|
| 203 |
+
for _h, cost_ in enumerate(cost_desc):
|
| 204 |
+
neighbor_add1 = self.n_neighbors_ + 1
|
| 205 |
+
acd.append(((2. * (neighbor_add1 - (_h + 1))) / (
|
| 206 |
+
neighbor_add1 * self.n_neighbors_)) * cost_)
|
| 207 |
+
ac_dist.append(np.sum(acd))
|
| 208 |
+
for _g in range(X.shape[0]):
|
| 209 |
+
cof_.append((ac_dist[_g] * self.n_neighbors_) /
|
| 210 |
+
np.sum(itemgetter(*sbn_path_index[_g])(ac_dist)))
|
| 211 |
+
return np.nan_to_num(cof_)
|
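Since COF follows the pyod BaseDetector interface (fit sets decision_scores_, threshold_ and labels_), a small sketch of how it might be called; the import path is assumed and the data is synthetic.

import numpy as np
from models.COF import COF  # assumed import path within this repo

rng = np.random.default_rng(1)
X = rng.normal(size=(200, 2))
X[:5] += 6.0  # a few obvious outliers

clf = COF(contamination=0.05, n_neighbors=20, method="fast")
clf.fit(X)
print(clf.decision_scores_[:5])  # higher COF = more isolated along chaining paths
print(clf.labels_[:5])           # 1 marks the points flagged as outliers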
models/COPOD.py
ADDED
|
@@ -0,0 +1,205 @@
| 1 |
+
"""
|
| 2 |
+
This function is adapted from [pyod] by [yzhao062]
|
| 3 |
+
Original source: [https://github.com/yzhao062/pyod]
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from __future__ import division
|
| 7 |
+
from __future__ import print_function
|
| 8 |
+
import warnings
|
| 9 |
+
|
| 10 |
+
import numpy as np
|
| 11 |
+
|
| 12 |
+
from joblib import Parallel, delayed
|
| 13 |
+
from scipy.stats import skew as skew_sp
|
| 14 |
+
from sklearn.utils.validation import check_is_fitted
|
| 15 |
+
from sklearn.utils import check_array
|
| 16 |
+
|
| 17 |
+
from .base import BaseDetector
|
| 18 |
+
from ..utils.stat_models import column_ecdf
|
| 19 |
+
from ..utils.utility import _partition_estimators
|
| 20 |
+
from ..utils.utility import zscore
|
| 21 |
+
|
| 22 |
+
def skew(X, axis=0):
|
| 23 |
+
return np.nan_to_num(skew_sp(X, axis=axis))
|
| 24 |
+
|
| 25 |
+
def _parallel_ecdf(n_dims, X):
|
| 26 |
+
"""Private method to calculate ecdf in parallel.
|
| 27 |
+
Parameters
|
| 28 |
+
----------
|
| 29 |
+
n_dims : int
|
| 30 |
+
The number of dimensions of the current input matrix
|
| 31 |
+
|
| 32 |
+
X : numpy array
|
| 33 |
+
The subarray for building the ECDF
|
| 34 |
+
|
| 35 |
+
Returns
|
| 36 |
+
-------
|
| 37 |
+
U_l_mat : numpy array
|
| 38 |
+
ECDF subarray.
|
| 39 |
+
|
| 40 |
+
U_r_mat : numpy array
|
| 41 |
+
ECDF subarray.
|
| 42 |
+
"""
|
| 43 |
+
U_l_mat = np.zeros([X.shape[0], n_dims])
|
| 44 |
+
U_r_mat = np.zeros([X.shape[0], n_dims])
|
| 45 |
+
|
| 46 |
+
for i in range(n_dims):
|
| 47 |
+
U_l_mat[:, i: i + 1] = column_ecdf(X[:, i: i + 1])
|
| 48 |
+
U_r_mat[:, i: i + 1] = column_ecdf(X[:, i: i + 1] * -1)
|
| 49 |
+
return U_l_mat, U_r_mat
|
| 50 |
+
|
| 51 |
+
class COPOD(BaseDetector):
|
| 52 |
+
"""COPOD class for Copula Based Outlier Detector.
|
| 53 |
+
COPOD is a parameter-free, highly interpretable outlier detection algorithm
|
| 54 |
+
based on empirical copula models.
|
| 55 |
+
See :cite:`li2020copod` for details.
|
| 56 |
+
|
| 57 |
+
Parameters
|
| 58 |
+
----------
|
| 59 |
+
contamination : float in (0., 0.5), optional (default=0.1)
|
| 60 |
+
The amount of contamination of the data set, i.e.
|
| 61 |
+
the proportion of outliers in the data set. Used when fitting to
|
| 62 |
+
define the threshold on the decision function.
|
| 63 |
+
|
| 64 |
+
n_jobs : optional (default=1)
|
| 65 |
+
The number of jobs to run in parallel for both `fit` and
|
| 66 |
+
`predict`. If -1, then the number of jobs is set to the
|
| 67 |
+
number of cores.
|
| 68 |
+
|
| 69 |
+
Attributes
|
| 70 |
+
----------
|
| 71 |
+
decision_scores_ : numpy array of shape (n_samples,)
|
| 72 |
+
The outlier scores of the training data.
|
| 73 |
+
The higher, the more abnormal. Outliers tend to have higher
|
| 74 |
+
scores. This value is available once the detector is
|
| 75 |
+
fitted.
|
| 76 |
+
threshold_ : float
|
| 77 |
+
The threshold is based on ``contamination``. It is the
|
| 78 |
+
``n_samples * contamination`` most abnormal samples in
|
| 79 |
+
``decision_scores_``. The threshold is calculated for generating
|
| 80 |
+
binary outlier labels.
|
| 81 |
+
labels_ : int, either 0 or 1
|
| 82 |
+
The binary labels of the training data. 0 stands for inliers
|
| 83 |
+
and 1 for outliers/anomalies. It is generated by applying
|
| 84 |
+
``threshold_`` on ``decision_scores_``.
|
| 85 |
+
"""
|
| 86 |
+
|
| 87 |
+
def __init__(self, contamination=0.1, n_jobs=1, normalize=True):
|
| 88 |
+
super(COPOD, self).__init__(contamination=contamination)
|
| 89 |
+
|
| 90 |
+
#TODO: Make it parameterized for n_jobs
|
| 91 |
+
self.n_jobs = n_jobs
|
| 92 |
+
self.normalize = normalize
|
| 93 |
+
|
| 94 |
+
def fit(self, X, y=None):
|
| 95 |
+
"""Fit detector. y is ignored in unsupervised methods.
|
| 96 |
+
Parameters
|
| 97 |
+
----------
|
| 98 |
+
X : numpy array of shape (n_samples, n_features)
|
| 99 |
+
The input samples.
|
| 100 |
+
y : Ignored
|
| 101 |
+
Not used, present for API consistency by convention.
|
| 102 |
+
Returns
|
| 103 |
+
-------
|
| 104 |
+
self : object
|
| 105 |
+
Fitted estimator.
|
| 106 |
+
"""
|
| 107 |
+
X = check_array(X)
|
| 108 |
+
if self.normalize: X = zscore(X, axis=1, ddof=1)
|
| 109 |
+
|
| 110 |
+
self._set_n_classes(y)
|
| 111 |
+
self.decision_scores_ = self.decision_function(X)
|
| 112 |
+
self.X_train = X
|
| 113 |
+
self._process_decision_scores()
|
| 114 |
+
return self
|
| 115 |
+
|
| 116 |
+
def decision_function(self, X):
|
| 117 |
+
"""Predict raw anomaly score of X using the fitted detector.
|
| 118 |
+
For consistency, outliers are assigned with larger anomaly scores.
|
| 119 |
+
Parameters
|
| 120 |
+
----------
|
| 121 |
+
X : numpy array of shape (n_samples, n_features)
|
| 122 |
+
The training input samples. Sparse matrices are accepted only
|
| 123 |
+
if they are supported by the base estimator.
|
| 124 |
+
Returns
|
| 125 |
+
-------
|
| 126 |
+
anomaly_scores : numpy array of shape (n_samples,)
|
| 127 |
+
The anomaly score of the input samples.
|
| 128 |
+
"""
|
| 129 |
+
# use multi-thread execution
|
| 130 |
+
if self.n_jobs != 1:
|
| 131 |
+
return self._decision_function_parallel(X)
|
| 132 |
+
if hasattr(self, 'X_train'):
|
| 133 |
+
original_size = X.shape[0]
|
| 134 |
+
X = np.concatenate((self.X_train, X), axis=0)
|
| 135 |
+
self.U_l = -1 * np.log(column_ecdf(X))
|
| 136 |
+
self.U_r = -1 * np.log(column_ecdf(-X))
|
| 137 |
+
|
| 138 |
+
skewness = np.sign(skew(X, axis=0))
|
| 139 |
+
self.U_skew = self.U_l * -1 * np.sign(
|
| 140 |
+
skewness - 1) + self.U_r * np.sign(skewness + 1)
|
| 141 |
+
self.O = np.maximum(self.U_skew, np.add(self.U_l, self.U_r) / 2)
|
| 142 |
+
if hasattr(self, 'X_train'):
|
| 143 |
+
decision_scores_ = self.O.sum(axis=1)[-original_size:]
|
| 144 |
+
else:
|
| 145 |
+
decision_scores_ = self.O.sum(axis=1)
|
| 146 |
+
return decision_scores_.ravel()
|
| 147 |
+
|
| 148 |
+
def _decision_function_parallel(self, X):
|
| 149 |
+
"""Predict raw anomaly score of X using the fitted detector.
|
| 150 |
+
For consistency, outliers are assigned with larger anomaly scores.
|
| 151 |
+
Parameters
|
| 152 |
+
----------
|
| 153 |
+
X : numpy array of shape (n_samples, n_features)
|
| 154 |
+
The training input samples. Sparse matrices are accepted only
|
| 155 |
+
if they are supported by the base estimator.
|
| 156 |
+
Returns
|
| 157 |
+
-------
|
| 158 |
+
anomaly_scores : numpy array of shape (n_samples,)
|
| 159 |
+
The anomaly score of the input samples.
|
| 160 |
+
"""
|
| 161 |
+
if hasattr(self, 'X_train'):
|
| 162 |
+
original_size = X.shape[0]
|
| 163 |
+
X = np.concatenate((self.X_train, X), axis=0)
|
| 164 |
+
|
| 165 |
+
n_samples, n_features = X.shape[0], X.shape[1]
|
| 166 |
+
|
| 167 |
+
if n_features < 2:
|
| 168 |
+
raise ValueError(
|
| 169 |
+
'n_jobs should not be used on one dimensional dataset')
|
| 170 |
+
|
| 171 |
+
if n_features <= self.n_jobs:
|
| 172 |
+
self.n_jobs = n_features
|
| 173 |
+
warnings.warn("n_features <= n_jobs; setting them equal instead.")
|
| 174 |
+
|
| 175 |
+
n_jobs, n_dims_list, starts = _partition_estimators(n_features,
|
| 176 |
+
self.n_jobs)
|
| 177 |
+
|
| 178 |
+
all_results = Parallel(n_jobs=n_jobs, max_nbytes=None,
|
| 179 |
+
verbose=True)(
|
| 180 |
+
delayed(_parallel_ecdf)(
|
| 181 |
+
n_dims_list[i],
|
| 182 |
+
X[:, starts[i]:starts[i + 1]],
|
| 183 |
+
)
|
| 184 |
+
for i in range(n_jobs))
|
| 185 |
+
|
| 186 |
+
# recover the results
|
| 187 |
+
self.U_l = np.zeros([n_samples, n_features])
|
| 188 |
+
self.U_r = np.zeros([n_samples, n_features])
|
| 189 |
+
|
| 190 |
+
for i in range(n_jobs):
|
| 191 |
+
self.U_l[:, starts[i]:starts[i + 1]] = all_results[i][0]
|
| 192 |
+
self.U_r[:, starts[i]:starts[i + 1]] = all_results[i][1]
|
| 193 |
+
|
| 194 |
+
self.U_l = -1 * np.log(self.U_l)
|
| 195 |
+
self.U_r = -1 * np.log(self.U_r)
|
| 196 |
+
|
| 197 |
+
skewness = np.sign(skew(X, axis=0))
|
| 198 |
+
self.U_skew = self.U_l * -1 * np.sign(
|
| 199 |
+
skewness - 1) + self.U_r * np.sign(skewness + 1)
|
| 200 |
+
self.O = np.maximum(self.U_skew, np.add(self.U_l, self.U_r) / 2)
|
| 201 |
+
if hasattr(self, 'X_train'):
|
| 202 |
+
decision_scores_ = self.O.sum(axis=1)[-original_size:]
|
| 203 |
+
else:
|
| 204 |
+
decision_scores_ = self.O.sum(axis=1)
|
| 205 |
+
return decision_scores_.ravel()
|
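The core of `decision_function` above is a per-column negative log ECDF evaluated in both tails, combined with a skewness correction and summed across dimensions. A stripped-down NumPy sketch of that tail-probability idea (skewness weighting omitted, helper names are illustrative):

import numpy as np

def ecdf_1d(col):
    # empirical CDF evaluated at each sample of a single column
    ranks = np.argsort(np.argsort(col)) + 1
    return ranks / len(col)

rng = np.random.default_rng(2)
X = rng.normal(size=(500, 3))
X[0] = [5.0, -4.5, 6.0]  # an extreme row

U_l = -np.log(np.apply_along_axis(ecdf_1d, 0, X))    # left-tail surprise
U_r = -np.log(np.apply_along_axis(ecdf_1d, 0, -X))   # right-tail surprise
scores = np.maximum(U_l, U_r).sum(axis=1)            # simplified: no skew weighting
print(scores[0], scores[1:6])                        # the extreme row dominates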
models/Chronos.py
ADDED
|
@@ -0,0 +1,94 @@
| 1 |
+
"""
|
| 2 |
+
This function is adapted from [chronos-forecasting] by [lostella et al.]
|
| 3 |
+
Original source: [https://github.com/amazon-science/chronos-forecasting]
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from autogluon.timeseries import TimeSeriesPredictor
|
| 7 |
+
from sklearn.preprocessing import MinMaxScaler
|
| 8 |
+
import numpy as np
|
| 9 |
+
import pandas as pd
|
| 10 |
+
import tempfile
|
| 11 |
+
|
| 12 |
+
from .base import BaseDetector
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class Chronos(BaseDetector):
|
| 16 |
+
def __init__(self,
|
| 17 |
+
win_size=100,
|
| 18 |
+
model_size = 'base', # [tiny, small, base]
|
| 19 |
+
prediction_length=1,
|
| 20 |
+
input_c=1,
|
| 21 |
+
batch_size=128):
|
| 22 |
+
|
| 23 |
+
self.model_name = 'Chronos'
|
| 24 |
+
self.model_size = model_size
|
| 25 |
+
self.win_size = win_size
|
| 26 |
+
self.prediction_length = prediction_length
|
| 27 |
+
self.input_c = input_c
|
| 28 |
+
self.batch_size = batch_size
|
| 29 |
+
self.score_list = []
|
| 30 |
+
|
| 31 |
+
def fit(self, data):
|
| 32 |
+
|
| 33 |
+
for channel in range(self.input_c):
|
| 34 |
+
|
| 35 |
+
data_channel = data[:, channel].reshape(-1, 1)
|
| 36 |
+
data_win, data_target = self.create_dataset(data_channel, slidingWindow=self.win_size, predict_time_steps=self.prediction_length)
|
| 37 |
+
# print('data_win: ', data_win.shape) # (2330, 100)
|
| 38 |
+
# print('data_target: ', data_target.shape) # (2330, 1)
|
| 39 |
+
|
| 40 |
+
train_data = []
|
| 41 |
+
count = 0
|
| 42 |
+
for id in range(data_win.shape[0]):
|
| 43 |
+
for tt in range(data_win.shape[1]):
|
| 44 |
+
train_data.append([id, count, data_win[id, tt]])
|
| 45 |
+
count += 1
|
| 46 |
+
train_data = pd.DataFrame(train_data, columns=['item_id', 'timestamp', 'target'])
|
| 47 |
+
|
| 48 |
+
with tempfile.TemporaryDirectory() as temp_dir:
|
| 49 |
+
|
| 50 |
+
predictor = TimeSeriesPredictor(prediction_length=self.prediction_length, path=temp_dir).fit(
|
| 51 |
+
train_data,
|
| 52 |
+
hyperparameters={
|
| 53 |
+
"Chronos": {
|
| 54 |
+
"model_path": self.model_size, # base
|
| 55 |
+
"device": "cuda",
|
| 56 |
+
"batch_size": self.batch_size}},
|
| 57 |
+
skip_model_selection=True,
|
| 58 |
+
verbosity=0)
|
| 59 |
+
|
| 60 |
+
predictions = predictor.predict(train_data)['mean'].to_numpy().reshape(-1, self.prediction_length)
|
| 61 |
+
print('predictions: ', predictions.shape)
|
| 62 |
+
|
| 63 |
+
### using mse as the anomaly score
|
| 64 |
+
scores = (data_target.squeeze() - predictions.squeeze()) ** 2
|
| 65 |
+
self.score_list.append(scores)
|
| 66 |
+
|
| 67 |
+
scores_merge = np.mean(np.array(self.score_list), axis=0)
|
| 68 |
+
# print('scores_merge: ', scores_merge.shape)
|
| 69 |
+
|
| 70 |
+
padded_decision_scores = np.zeros(len(data))
|
| 71 |
+
padded_decision_scores[: self.win_size+self.prediction_length-1] = scores_merge[0]
|
| 72 |
+
padded_decision_scores[self.win_size+self.prediction_length-1 : ]=scores_merge
|
| 73 |
+
|
| 74 |
+
self.decision_scores_ = padded_decision_scores
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
def decision_function(self, X):
|
| 78 |
+
"""
|
| 79 |
+
Not used, present for API consistency by convention.
|
| 80 |
+
"""
|
| 81 |
+
pass
|
| 82 |
+
|
| 83 |
+
def create_dataset(self, X, slidingWindow, predict_time_steps=1):
|
| 84 |
+
Xs, ys = [], []
|
| 85 |
+
for i in range(len(X) - slidingWindow - predict_time_steps+1):
|
| 86 |
+
|
| 87 |
+
tmp = X[i : i + slidingWindow + predict_time_steps].ravel()
|
| 88 |
+
# tmp= MinMaxScaler(feature_range=(0,1)).fit_transform(tmp.reshape(-1,1)).ravel()
|
| 89 |
+
|
| 90 |
+
x = tmp[:slidingWindow]
|
| 91 |
+
y = tmp[slidingWindow:]
|
| 92 |
+
Xs.append(x)
|
| 93 |
+
ys.append(y)
|
| 94 |
+
return np.array(Xs), np.array(ys)
|
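For clarity, the `create_dataset` helper above simply slides a window of length `slidingWindow` across the series and pairs it with the next `predict_time_steps` values. A quick shape check with toy data:

import numpy as np

X = np.arange(10).reshape(-1, 1)          # a toy series of length 10
slidingWindow, predict_time_steps = 4, 1

Xs, ys = [], []
for i in range(len(X) - slidingWindow - predict_time_steps + 1):
    tmp = X[i: i + slidingWindow + predict_time_steps].ravel()
    Xs.append(tmp[:slidingWindow])        # the window fed to the forecaster
    ys.append(tmp[slidingWindow:])        # the value(s) it has to predict
Xs, ys = np.array(Xs), np.array(ys)
print(Xs.shape, ys.shape)   # (6, 4) (6, 1)
print(Xs[0], ys[0])         # [0 1 2 3] [4]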
models/DADA.py
ADDED
|
@@ -0,0 +1,141 @@
| 1 |
+
import numpy as np
|
| 2 |
+
import torch
|
| 3 |
+
import torch.nn as nn
|
| 4 |
+
import torch.nn.functional as F
|
| 5 |
+
import torch.optim as optim
|
| 6 |
+
from torch.utils.data import DataLoader, TensorDataset
|
| 7 |
+
import math
|
| 8 |
+
import tqdm
|
| 9 |
+
import os
|
| 10 |
+
from transformers import AutoTokenizer
|
| 11 |
+
from typing import Optional, Tuple
|
| 12 |
+
|
| 13 |
+
# Add debugging prints to understand the import issue
|
| 14 |
+
import sys
|
| 15 |
+
# print(f"Python path: {sys.path}")
|
| 16 |
+
# print(f"Current working directory: {os.getcwd()}")
|
| 17 |
+
# print(f"Current file location: {__file__}")
|
| 18 |
+
# print(f"Current file directory: {os.path.dirname(__file__)}")
|
| 19 |
+
#
|
| 20 |
+
# # Check if the utils directory exists
|
| 21 |
+
# utils_path = os.path.join(os.path.basename(os.path.dirname(__file__)), "utils")
|
| 22 |
+
# print(f"Utils path: {utils_path}")
|
| 23 |
+
# print(f"Utils directory exists: {os.path.exists(utils_path)}")
|
| 24 |
+
# print(f"Utils directory contents: {os.listdir(utils_path) if os.path.exists(utils_path) else 'Directory not found'}")
|
| 25 |
+
#
|
| 26 |
+
# # Check if dataset.py exists
|
| 27 |
+
# dataset_path = os.path.join(utils_path, "dataset.py")
|
| 28 |
+
# print(f"Dataset file path: {dataset_path}")
|
| 29 |
+
# print(f"Dataset file exists: {os.path.exists(dataset_path)}")
|
| 30 |
+
|
| 31 |
+
# Try different import approaches
|
| 32 |
+
|
| 33 |
+
os.chdir("/home/lihaoyang/Huawei/TSB-AD/TSB_AD")
|
| 34 |
+
|
| 35 |
+
try:
|
| 36 |
+
from utils.dataset import ReconstructDataset
|
| 37 |
+
print("Relative import successful")
|
| 38 |
+
except ImportError as e:
|
| 39 |
+
print(f"Relative import failed: {e}")
|
| 40 |
+
|
| 41 |
+
# Try absolute import
|
| 42 |
+
try:
|
| 43 |
+
from TSB_AD.utils.dataset import ReconstructDataset
|
| 44 |
+
print("Absolute import successful")
|
| 45 |
+
except ImportError as e2:
|
| 46 |
+
print(f"Absolute import failed: {e2}")
|
| 47 |
+
|
| 48 |
+
# Try adding parent directory to path
|
| 49 |
+
try:
|
| 50 |
+
parent_dir = os.path.dirname(os.path.dirname(__file__))
|
| 51 |
+
if parent_dir not in sys.path:
|
| 52 |
+
sys.path.insert(0, parent_dir)
|
| 53 |
+
from utils.dataset import ReconstructDataset
|
| 54 |
+
print("Import with modified path successful")
|
| 55 |
+
except ImportError as e3:
|
| 56 |
+
print(f"Import with modified path failed: {e3}")
|
| 57 |
+
|
| 58 |
+
from .base import BaseDetector
|
| 59 |
+
|
| 60 |
+
# ...existing code...
|
| 61 |
+
|
| 62 |
+
class DADA(BaseDetector):
|
| 63 |
+
def __init__(self, device, args=None, win_size=64, batch_size=32):
|
| 64 |
+
self.win_size = win_size
|
| 65 |
+
self.batch_size = batch_size
|
| 66 |
+
self.device = torch.device(f'cuda:{device}' if torch.cuda.is_available() else 'cpu')
|
| 67 |
+
self.model = self._build_model().to(self.device)
|
| 68 |
+
|
| 69 |
+
def _build_model(self):
|
| 70 |
+
from transformers import AutoModel, AutoConfig
|
| 71 |
+
import os
|
| 72 |
+
|
| 73 |
+
# Try multiple possible paths
|
| 74 |
+
possible_paths = [
|
| 75 |
+
os.environ.get("DADA_MODEL_PATH"), # Environment variable
|
| 76 |
+
"/home/lihaoyang/Huawei/DADA/DADA/", # Original Linux path
|
| 77 |
+
"./DADA", # Relative path
|
| 78 |
+
"DADA" # Hugging Face model name
|
| 79 |
+
]
|
| 80 |
+
|
| 81 |
+
for path in possible_paths:
|
| 82 |
+
if path is None:
|
| 83 |
+
continue
|
| 84 |
+
try:
|
| 85 |
+
# Try loading config first
|
| 86 |
+
config = AutoConfig.from_pretrained(path, trust_remote_code=True)
|
| 87 |
+
model = AutoModel.from_pretrained(path, config=config, trust_remote_code=True)
|
| 88 |
+
print(f"Successfully loaded DADA model from: {path}")
|
| 89 |
+
return model
|
| 90 |
+
except Exception as e:
|
| 91 |
+
print(f"Failed to load from {path}: {e}")
|
| 92 |
+
continue
|
| 93 |
+
|
| 94 |
+
raise ValueError("DADA model not found. Please set DADA_MODEL_PATH environment variable or ensure the model is available at one of the expected locations.")
|
| 95 |
+
|
| 96 |
+
# def _acquire_device(self):
|
| 97 |
+
# if True:
|
| 98 |
+
# os.environ["CUDA_VISIBLE_DEVICES"] = str(
|
| 99 |
+
# self.args.gpu) if not self.args.use_multi_gpu else self.args.devices
|
| 100 |
+
# device = torch.device('cuda:{}'.format(self.args.gpu))
|
| 101 |
+
# print('Use GPU: cuda:{}'.format(self.args.gpu))
|
| 102 |
+
# else:
|
| 103 |
+
# device = torch.device('cpu')
|
| 104 |
+
# print('Use CPU')
|
| 105 |
+
# return device
|
| 106 |
+
|
| 107 |
+
def decision_function(self, x: torch.Tensor) -> torch.Tensor:
|
| 108 |
+
pass
|
| 109 |
+
|
| 110 |
+
def fit(self, data: torch.Tensor, labels: Optional[torch.Tensor] = None) -> None:
|
| 111 |
+
pass
|
| 112 |
+
|
| 113 |
+
def zero_shot(self, data):
|
| 114 |
+
|
| 115 |
+
test_loader = DataLoader(
|
| 116 |
+
dataset= ReconstructDataset(data, window_size=self.win_size, stride=self.win_size, normalize=True),
|
| 117 |
+
batch_size=self.batch_size,
|
| 118 |
+
shuffle=False)
|
| 119 |
+
|
| 120 |
+
loop = tqdm.tqdm(enumerate(test_loader),total=len(test_loader),leave=True)
|
| 121 |
+
|
| 122 |
+
test_scores = []
|
| 123 |
+
test_labels = []
|
| 124 |
+
self.model.eval()
|
| 125 |
+
self.model.to(self.device)
|
| 126 |
+
|
| 127 |
+
with torch.no_grad():
|
| 128 |
+
for i, (batch_x, batch_y) in loop:
|
| 129 |
+
batch_x = batch_x.float().to(self.device)
|
| 130 |
+
score = self.model.infer(batch_x, norm=0)
|
| 131 |
+
score = score.detach().cpu().numpy()
|
| 132 |
+
test_scores.append(score)
|
| 133 |
+
test_labels.append(batch_y)
|
| 134 |
+
|
| 135 |
+
test_scores = np.concatenate(test_scores, axis=0).reshape(-1, 1)
|
| 136 |
+
test_labels = np.concatenate(test_labels, axis=0).reshape(-1, 1)
|
| 137 |
+
|
| 138 |
+
print("Test scores shape:", test_scores.shape)
|
| 139 |
+
print("Test labels shape:", test_labels.shape)
|
| 140 |
+
|
| 141 |
+
return test_scores.reshape(-1)
|
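The DADA wrapper above loads a pretrained model from DADA_MODEL_PATH (or one of the fallback paths) and scores a series zero-shot, window by window, through model.infer. A hedged usage sketch; the environment variable value and import path are placeholders, not part of this commit, and note that the module as committed also hard-codes an os.chdir path that would need adjusting outside the author's machine.

import os
import numpy as np
# assumed import path; requires a pretrained DADA checkpoint to be available
from models.DADA import DADA

os.environ["DADA_MODEL_PATH"] = "/path/to/dada/checkpoint"  # placeholder path

data = np.random.default_rng(3).normal(size=(2048, 1))   # (time, features)
detector = DADA(device=0, win_size=64, batch_size=32)     # uses cuda:0 if available, else CPU
scores = detector.zero_shot(data)                         # flattened per-window anomaly scores
print(scores.shape)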
models/Donut.py
ADDED
|
@@ -0,0 +1,419 @@
| 1 |
+
"""
|
| 2 |
+
This function is adapted from [donut] by [haowen-xu]
|
| 3 |
+
Original source: [https://github.com/NetManAIOps/donut]
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from typing import Dict
|
| 7 |
+
import numpy as np
|
| 8 |
+
import torchinfo
|
| 9 |
+
import torch
|
| 10 |
+
from torch import nn, optim
|
| 11 |
+
import tqdm
|
| 12 |
+
import os, math
|
| 13 |
+
import torch.nn.functional as F
|
| 14 |
+
from torch.utils.data import DataLoader
|
| 15 |
+
from typing import Tuple, Sequence, Union, Callable
|
| 16 |
+
|
| 17 |
+
from ..utils.torch_utility import EarlyStoppingTorch, get_gpu
|
| 18 |
+
from ..utils.dataset import ReconstructDataset
|
| 19 |
+
|
| 20 |
+
class DonutModel(nn.Module):
|
| 21 |
+
def __init__(self, input_dim, hidden_dim, latent_dim, mask_prob) -> None:
|
| 22 |
+
super().__init__()
|
| 23 |
+
|
| 24 |
+
"""
|
| 25 |
+
Xu2018
|
| 26 |
+
|
| 27 |
+
:param input_dim: Should be window_size * features
|
| 28 |
+
:param hidden_dims:
|
| 29 |
+
:param latent_dim:
|
| 30 |
+
"""
|
| 31 |
+
|
| 32 |
+
self.latent_dim = latent_dim
|
| 33 |
+
self.mask_prob = mask_prob
|
| 34 |
+
|
| 35 |
+
encoder = VaeEncoder(input_dim, hidden_dim, latent_dim)
|
| 36 |
+
decoder = VaeEncoder(latent_dim, hidden_dim, input_dim)
|
| 37 |
+
|
| 38 |
+
self.vae = VAE(encoder=encoder, decoder=decoder, logvar_out=False)
|
| 39 |
+
|
| 40 |
+
def forward(self, inputs: torch.Tensor) -> Tuple[torch.Tensor, ...]:
|
| 41 |
+
# x: (B, T, D)
|
| 42 |
+
x = inputs
|
| 43 |
+
B, T, D = x.shape
|
| 44 |
+
|
| 45 |
+
if self.training:
|
| 46 |
+
# Randomly mask some inputs
|
| 47 |
+
mask = torch.empty_like(x)
|
| 48 |
+
mask.bernoulli_(1 - self.mask_prob)
|
| 49 |
+
x = x * mask
|
| 50 |
+
else:
|
| 51 |
+
mask = None
|
| 52 |
+
|
| 53 |
+
# Run the VAE
|
| 54 |
+
x = x.view(x.shape[0], -1)
|
| 55 |
+
mean_z, std_z, mean_x, std_x, sample_z = self.vae(x, return_latent_sample=True)
|
| 56 |
+
|
| 57 |
+
# Reshape the outputs
|
| 58 |
+
mean_x = mean_x.view(B, T, D)
|
| 59 |
+
std_x = std_x.view(B, T, D)
|
| 60 |
+
return mean_z, std_z, mean_x, std_x, sample_z, mask
|
| 61 |
+
|
| 62 |
+
def sample_normal(mu: torch.Tensor, std_or_log_var: torch.Tensor, log_var: bool = False, num_samples: int = 1):
|
| 63 |
+
# ln(σ) = 0.5 * ln(σ^2) -> σ = e^(0.5 * ln(σ^2))
|
| 64 |
+
if log_var:
|
| 65 |
+
sigma = std_or_log_var.mul(0.5).exp_()
|
| 66 |
+
else:
|
| 67 |
+
sigma = std_or_log_var
|
| 68 |
+
|
| 69 |
+
if num_samples == 1:
|
| 70 |
+
eps = torch.randn_like(mu) # also copies device from mu
|
| 71 |
+
else:
|
| 72 |
+
eps = torch.randn((num_samples,) + mu.shape, dtype=mu.dtype, device=mu.device)
|
| 73 |
+
mu = mu.unsqueeze(0)
|
| 74 |
+
sigma = sigma.unsqueeze(0)
|
| 75 |
+
# z = μ + σ * ϵ, with ϵ ~ N(0,I)
|
| 76 |
+
return eps.mul(sigma).add_(mu)
|
| 77 |
+
|
| 78 |
+
def normal_standard_normal_kl(mean: torch.Tensor, std_or_log_var: torch.Tensor, log_var: bool = False) -> torch.Tensor:
|
| 79 |
+
if log_var:
|
| 80 |
+
kl_loss = torch.sum(1 + std_or_log_var - mean.pow(2) - std_or_log_var.exp(), dim=-1)
|
| 81 |
+
else:
|
| 82 |
+
kl_loss = torch.sum(1 + torch.log(std_or_log_var.pow(2)) - mean.pow(2) - std_or_log_var.pow(2), dim=-1)
|
| 83 |
+
return -0.5 * kl_loss
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
def normal_normal_kl(mean_1: torch.Tensor, std_or_log_var_1: torch.Tensor, mean_2: torch.Tensor,
|
| 87 |
+
std_or_log_var_2: torch.Tensor, log_var: bool = False) -> torch.Tensor:
|
| 88 |
+
if log_var:
|
| 89 |
+
return 0.5 * torch.sum(std_or_log_var_2 - std_or_log_var_1 + (torch.exp(std_or_log_var_1)
|
| 90 |
+
+ (mean_1 - mean_2)**2) / torch.exp(std_or_log_var_2) - 1, dim=-1)
|
| 91 |
+
|
| 92 |
+
return torch.sum(torch.log(std_or_log_var_2) - torch.log(std_or_log_var_1) \
|
| 93 |
+
+ 0.5 * (std_or_log_var_1**2 + (mean_1 - mean_2)**2) / std_or_log_var_2**2 - 0.5, dim=-1)
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
class VAELoss(torch.nn.modules.loss._Loss):
|
| 97 |
+
def __init__(self, size_average=None, reduce=None, reduction: str = 'mean', logvar_out: bool = True):
|
| 98 |
+
super(VAELoss, self).__init__(size_average, reduce, reduction)
|
| 99 |
+
self.logvar_out = logvar_out
|
| 100 |
+
|
| 101 |
+
def forward(self, predictions: Tuple[torch.Tensor, ...], targets: Tuple[torch.Tensor, ...], *args, **kwargs) \
|
| 102 |
+
-> torch.Tensor:
|
| 103 |
+
z_mean, z_std_or_log_var, x_dec_mean, x_dec_std = predictions[:4]
|
| 104 |
+
if len(predictions) > 4:
|
| 105 |
+
z_prior_mean, z_prior_std_or_logvar = predictions[4:]
|
| 106 |
+
else:
|
| 107 |
+
z_prior_mean, z_prior_std_or_logvar = None, None
|
| 108 |
+
|
| 109 |
+
y, = targets
|
| 110 |
+
|
| 111 |
+
# Gaussian NLL loss assumes a multivariate normal with diagonal sigma
|
| 112 |
+
# Alternatively we can use torch.distribution.Normal(x_dec_mean, x_dec_std).log_prob(y).sum(-1)
|
| 113 |
+
# or torch.distribution.MultivariateNormal(mean, cov).log_prob(y).sum(-1)
|
| 114 |
+
# with cov = torch.eye(feat_dim).repeat([1,bz,1,1])*std.pow(2).unsqueeze(-1).
|
| 115 |
+
# However setting up a distribution seems to be an unnecessary computational overhead.
|
| 116 |
+
# However, this requires pytorch version > 1.9!!!
|
| 117 |
+
nll_gauss = F.gaussian_nll_loss(x_dec_mean, y, x_dec_std.pow(2), reduction='none').sum(-1)
|
| 118 |
+
# For pytorch version < 1.9 use:
|
| 119 |
+
# nll_gauss = -torch.distribution.Normal(x_dec_mean, x_dec_std).log_prob(y).sum(-1)
|
| 120 |
+
|
| 121 |
+
# get KL loss
|
| 122 |
+
if z_prior_mean is None and z_prior_std_or_logvar is None:
|
| 123 |
+
# If a prior is not given, we assume standard normal
|
| 124 |
+
kl_loss = normal_standard_normal_kl(z_mean, z_std_or_log_var, log_var=self.logvar_out)
|
| 125 |
+
else:
|
| 126 |
+
if z_prior_mean is None:
|
| 127 |
+
z_prior_mean = torch.tensor(0, dtype=z_mean.dtype, device=z_mean.device)
|
| 128 |
+
if z_prior_std_or_logvar is None:
|
| 129 |
+
value = 0 if self.logvar_out else 1
|
| 130 |
+
z_prior_std_or_logvar = torch.tensor(value, dtype=z_std_or_log_var.dtype, device=z_std_or_log_var.device)
|
| 131 |
+
|
| 132 |
+
kl_loss = normal_normal_kl(z_mean, z_std_or_log_var, z_prior_mean, z_prior_std_or_logvar,
|
| 133 |
+
log_var=self.logvar_out)
|
| 134 |
+
|
| 135 |
+
# Combine
|
| 136 |
+
final_loss = nll_gauss + kl_loss
|
| 137 |
+
|
| 138 |
+
if self.reduction == 'none':
|
| 139 |
+
return final_loss
|
| 140 |
+
elif self.reduction == 'mean':
|
| 141 |
+
return torch.mean(final_loss)
|
| 142 |
+
elif self.reduction == 'sum':
|
| 143 |
+
return torch.sum(final_loss)
|
| 144 |
+
|
| 145 |
+
|
| 146 |
+
class MaskedVAELoss(VAELoss):
|
| 147 |
+
def __init__(self, size_average=None, reduce=None, reduction: str = 'mean'):
|
| 148 |
+
super(MaskedVAELoss, self).__init__(size_average, reduce, reduction, logvar_out=False)
|
| 149 |
+
|
| 150 |
+
def forward(self, predictions: Tuple[torch.Tensor, ...], targets: Tuple[torch.Tensor, ...], *args, **kwargs) \
|
| 151 |
+
-> torch.Tensor:
|
| 152 |
+
mean_z, std_z, mean_x, std_x, sample_z, mask = predictions
|
| 153 |
+
actual_x, = targets
|
| 154 |
+
|
| 155 |
+
if mask is None:
|
| 156 |
+
mean_z = mean_z.unsqueeze(1)
|
| 157 |
+
std_z = std_z.unsqueeze(1)
|
| 158 |
+
return super(MaskedVAELoss, self).forward((mean_z, std_z, mean_x, std_x), (actual_x,), *args, **kwargs)
|
| 159 |
+
|
| 160 |
+
# If the loss is masked, one of the terms in the kl loss is weighted, so we can't compute it exactly
|
| 161 |
+
# anymore and have to use a MC approximation like for the output likelihood
|
| 162 |
+
nll_output = torch.sum(mask * F.gaussian_nll_loss(mean_x, actual_x, std_x**2, reduction='none'), dim=-1)
|
| 163 |
+
|
| 164 |
+
# This is p(z), i.e., the prior likelihood of Z. The paper assumes p(z) = N(z| 0, I), we drop constants
|
| 165 |
+
beta = torch.mean(mask, dim=(1, 2)).unsqueeze(-1)
|
| 166 |
+
nll_prior = beta * 0.5 * torch.sum(sample_z * sample_z, dim=-1, keepdim=True)
|
| 167 |
+
|
| 168 |
+
nll_approx = torch.sum(F.gaussian_nll_loss(mean_z, sample_z, std_z**2, reduction='none'), dim=-1, keepdim=True)
|
| 169 |
+
|
| 170 |
+
final_loss = nll_output + nll_prior - nll_approx
|
| 171 |
+
|
| 172 |
+
if self.reduction == 'none':
|
| 173 |
+
return final_loss
|
| 174 |
+
elif self.reduction == 'mean':
|
| 175 |
+
return torch.mean(final_loss)
|
| 176 |
+
elif self.reduction == 'sum':
|
| 177 |
+
return torch.sum(final_loss)
|
| 178 |
+
|
| 179 |
+
class MLP(torch.nn.Module):
|
| 180 |
+
def __init__(self, input_features: int, hidden_layers: Union[int, Sequence[int]], output_features: int,
|
| 181 |
+
activation: Callable = torch.nn.Identity(), activation_after_last_layer: bool = False):
|
| 182 |
+
super(MLP, self).__init__()
|
| 183 |
+
|
| 184 |
+
self.activation = activation
|
| 185 |
+
self.activation_after_last_layer = activation_after_last_layer
|
| 186 |
+
|
| 187 |
+
if isinstance(hidden_layers, int):
|
| 188 |
+
hidden_layers = [hidden_layers]
|
| 189 |
+
|
| 190 |
+
layers = [input_features] + list(hidden_layers) + [output_features]
|
| 191 |
+
self.layers = torch.nn.ModuleList([torch.nn.Linear(inp, out) for inp, out in zip(layers[:-1], layers[1:])])
|
| 192 |
+
|
| 193 |
+
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
| 194 |
+
out = x
|
| 195 |
+
for layer in self.layers[:-1]:
|
| 196 |
+
out = layer(out)
|
| 197 |
+
out = self.activation(out)
|
| 198 |
+
|
| 199 |
+
out = self.layers[-1](out)
|
| 200 |
+
if self.activation_after_last_layer:
|
| 201 |
+
out = self.activation(out)
|
| 202 |
+
|
| 203 |
+
return out
|
| 204 |
+
|
| 205 |
+
class VaeEncoder(nn.Module):
|
| 206 |
+
def __init__(self, input_dim: int, hidden_dim: int, latent_dim: int):
|
| 207 |
+
super(VaeEncoder, self).__init__()
|
| 208 |
+
|
| 209 |
+
self.latent_dim = latent_dim
|
| 210 |
+
|
| 211 |
+
self.mlp = MLP(input_dim, hidden_dim, 2*latent_dim, activation=torch.nn.ReLU(), activation_after_last_layer=False)
|
| 212 |
+
self.softplus = torch.nn.Softplus()
|
| 213 |
+
|
| 214 |
+
def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
|
| 215 |
+
# x: (B, T, D)
|
| 216 |
+
mlp_out = self.mlp(x)
|
| 217 |
+
|
| 218 |
+
mean, std = mlp_out.tensor_split(2, dim=-1)
|
| 219 |
+
std = self.softplus(std)
|
| 220 |
+
|
| 221 |
+
return mean, std
|
| 222 |
+
|
| 223 |
+
class VAE(torch.nn.Module):
|
| 224 |
+
"""
|
| 225 |
+
VAE Implementation that supports normal distribution with diagonal cov matrix in the latent space
|
| 226 |
+
and the output
|
| 227 |
+
"""
|
| 228 |
+
|
| 229 |
+
def __init__(self, encoder: torch.nn.Module, decoder: torch.nn.Module, logvar_out: bool = True):
|
| 230 |
+
super(VAE, self).__init__()
|
| 231 |
+
|
| 232 |
+
self.encoder = encoder
|
| 233 |
+
self.decoder = decoder
|
| 234 |
+
self.log_var = logvar_out
|
| 235 |
+
|
| 236 |
+
def forward(self, x: torch.Tensor, return_latent_sample: bool = False, num_samples: int = 1,
|
| 237 |
+
force_sample: bool = False) -> Tuple[torch.Tensor, ...]:
|
| 238 |
+
z_mu, z_std_or_log_var = self.encoder(x)
|
| 239 |
+
|
| 240 |
+
if self.training or num_samples > 1 or force_sample:
|
| 241 |
+
z_sample = sample_normal(z_mu, z_std_or_log_var, log_var=self.log_var, num_samples=num_samples)
|
| 242 |
+
else:
|
| 243 |
+
z_sample = z_mu
|
| 244 |
+
|
| 245 |
+
x_dec_mean, x_dec_std = self.decoder(z_sample)
|
| 246 |
+
|
| 247 |
+
if not return_latent_sample:
|
| 248 |
+
return z_mu, z_std_or_log_var, x_dec_mean, x_dec_std
|
| 249 |
+
|
| 250 |
+
return z_mu, z_std_or_log_var, x_dec_mean, x_dec_std, z_sample
|
| 251 |
+
|
| 252 |
+
|
| 253 |
+
class Donut():
    def __init__(self,
                 win_size=120,
                 input_c=1,
                 batch_size=128,  # 32, 128
                 grad_clip=10.0,
                 num_epochs=50,
                 mc_samples=1024,
                 hidden_dim=100,
                 latent_dim=8,
                 inject_ratio=0.01,
                 lr=1e-4,
                 l2_coff=1e-3,
                 patience=3,
                 validation_size=0):
        super().__init__()
        self.__anomaly_score = None

        self.cuda = True
        self.device = get_gpu(self.cuda)

        self.win_size = win_size
        self.input_c = input_c
        self.batch_size = batch_size
        self.grad_clip = grad_clip
        self.num_epochs = num_epochs
        self.mc_samples = mc_samples
        self.validation_size = validation_size

        input_dim = self.win_size*self.input_c

        self.model = DonutModel(input_dim=input_dim, hidden_dim=hidden_dim, latent_dim=latent_dim, mask_prob=inject_ratio).to(self.device)
        self.optimizer = optim.AdamW(self.model.parameters(), lr=lr, weight_decay=l2_coff)
        self.scheduler = optim.lr_scheduler.StepLR(self.optimizer, step_size=10, gamma=0.75)
        self.vaeloss = MaskedVAELoss()

        self.save_path = None
        self.early_stopping = EarlyStoppingTorch(save_path=self.save_path, patience=patience)

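    # Explanatory note (not in the original file): each sliding window of win_size
    # timestamps with input_c channels is flattened before entering the VAE, so with
    # the defaults the encoder sees win_size * input_c = 120 * 1 = 120 input features
    # (decision_function below reshapes with x.view(x.shape[0], -1) accordingly).
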
    def train(self, train_loader, epoch):
        self.model.train(mode=True)
        avg_loss = 0
        loop = tqdm.tqdm(enumerate(train_loader), total=len(train_loader), leave=True)
        for idx, (x, target) in loop:
            x, target = x.to(self.device), target.to(self.device)
            self.optimizer.zero_grad()

            # print('x: ', x.shape)

            output = self.model(x)
            loss = self.vaeloss(output, (target,))
            loss.backward()

            torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.grad_clip)
            self.optimizer.step()

            avg_loss += loss.cpu().item()
            loop.set_description(f'Training Epoch [{epoch}/{self.num_epochs}]')
            loop.set_postfix(loss=loss.item(), avg_loss=avg_loss/(idx+1))

        return avg_loss/max(len(train_loader), 1)

    def valid(self, valid_loader, epoch):
        self.model.eval()
        avg_loss = 0
        loop = tqdm.tqdm(enumerate(valid_loader), total=len(valid_loader), leave=True)
        with torch.no_grad():
            for idx, (x, target) in loop:
                x, target = x.to(self.device), target.to(self.device)
                output = self.model(x)
                loss = self.vaeloss(output, (target,))
                avg_loss += loss.cpu().item()
                loop.set_description(f'Validation Epoch [{epoch}/{self.num_epochs}]')
                loop.set_postfix(loss=loss.item(), avg_loss=avg_loss/(idx+1))

        return avg_loss/max(len(valid_loader), 1)

    def fit(self, data):
        tsTrain = data[:int((1-self.validation_size)*len(data))]
        tsValid = data[int((1-self.validation_size)*len(data)):]

        train_loader = DataLoader(
            dataset=ReconstructDataset(tsTrain, window_size=self.win_size),
            batch_size=self.batch_size,
            shuffle=True
        )

        valid_loader = DataLoader(
            dataset=ReconstructDataset(tsValid, window_size=self.win_size),
            batch_size=self.batch_size,
            shuffle=False
        )

        for epoch in range(1, self.num_epochs + 1):
            train_loss = self.train(train_loader, epoch)
            if len(valid_loader) > 0:
                valid_loss = self.valid(valid_loader, epoch)
            self.scheduler.step()

            if len(valid_loader) > 0:
                self.early_stopping(valid_loss, self.model)
            else:
                self.early_stopping(train_loss, self.model)
            if self.early_stopping.early_stop:
                print(" Early stopping<<<")
                break

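    # Explanatory note (not in the original file): with the default validation_size=0
    # the validation split above is empty, so, assuming ReconstructDataset yields an
    # empty dataset for empty input, len(valid_loader) is 0 and early stopping falls
    # back to tracking the training loss instead of a held-out validation loss.
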
    def decision_function(self, data):

        test_loader = DataLoader(
            dataset=ReconstructDataset(data, window_size=self.win_size),
            batch_size=self.batch_size,
            shuffle=False
        )

        self.model.eval()
        scores = []
        loop = tqdm.tqdm(enumerate(test_loader), total=len(test_loader), leave=True)
        with torch.no_grad():
            for idx, (x, _) in loop:
                x = x.to(self.device)
                x_vae = x.view(x.shape[0], -1)
                B, T, D = x.shape

                res = self.model.vae(x_vae, return_latent_sample=False, num_samples=self.mc_samples)
                z_mu, z_std, x_dec_mean, x_dec_std = res

                x_dec_mean = x_dec_mean.view(self.mc_samples, B, T, D)
                x_dec_std = x_dec_std.view(self.mc_samples, B, T, D)
                nll_output = torch.sum(F.gaussian_nll_loss(x_dec_mean[:, :, -1, :], x[:, -1, :].unsqueeze(0),
                                                           x_dec_std[:, :, -1, :]**2, reduction='none'), dim=(0, 2))
                nll_output /= self.mc_samples

                scores.append(nll_output.cpu())
                loop.set_description(f'Testing: ')

        scores = torch.cat(scores, dim=0)
        scores = scores.numpy()

        assert scores.ndim == 1

        import shutil
        if self.save_path and os.path.exists(self.save_path):
            shutil.rmtree(self.save_path)

        self.__anomaly_score = scores

        if self.__anomaly_score.shape[0] < len(data):
            self.__anomaly_score = np.array([self.__anomaly_score[0]]*math.ceil((self.win_size-1)/2) +
                                            list(self.__anomaly_score) + [self.__anomaly_score[-1]]*((self.win_size-1)//2))

        return self.__anomaly_score

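    # Sketch of the score computed in decision_function above (explanatory comment,
    # not in the original file): only the last timestamp of each window is scored.
    # With M = mc_samples decoder draws and per-channel mean mu_{m,d} and std
    # sigma_{m,d}, the score of the window's last point x_T is the Monte-Carlo
    # average of the Gaussian negative log-likelihood summed over channels:
    #
    #   score(x_T) = (1 / M) * sum_m sum_d 0.5 * ( log sigma_{m,d}^2
    #                                              + (x_{T,d} - mu_{m,d})^2 / sigma_{m,d}^2 )
    #
    # F.gaussian_nll_loss omits the constant (1/2) * log(2*pi) term unless full=True,
    # so this expression matches the sum over dims (0, 2) and the division by
    # mc_samples performed in the loop.
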
    def anomaly_score(self) -> np.ndarray:
        return self.__anomaly_score

    def get_y_hat(self) -> np.ndarray:
        # No base class provides get_y_hat here, so return the stored anomaly score
        # instead of delegating upward.
        return self.__anomaly_score

    def param_statistic(self, save_file):
        model_stats = torchinfo.summary(self.model, (self.batch_size, self.win_size), verbose=0)
        with open(save_file, 'w') as f:
            f.write(str(model_stats))
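

# Hypothetical usage sketch (not part of the original file). It exercises only the
# public Donut API defined above (fit, decision_function, anomaly_score) on a
# synthetic series, and assumes a CUDA device is available, since Donut hard-codes
# self.cuda = True.
if __name__ == '__main__':
    rng = np.random.default_rng(0)
    series = np.sin(np.linspace(0, 200, 5000)).reshape(-1, 1).astype(np.float32)
    series += 0.05 * rng.standard_normal(series.shape).astype(np.float32)
    series[2500:2520] += 5.0                      # short injected level shift

    detector = Donut(win_size=120, input_c=1, num_epochs=5, validation_size=0.2)
    detector.fit(series)
    scores = detector.decision_function(series)   # one anomaly score per timestamp
    print(scores.shape, scores[2490:2530].max())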