diff --git "a/pycaret_outlier_detection.ipynb" "b/pycaret_outlier_detection.ipynb" new file mode 100644--- /dev/null +++ "b/pycaret_outlier_detection.ipynb" @@ -0,0 +1,3893 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import re\n", + "import tqdm\n", + "import numpy as np\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns;\n", + "\n", + "from sklearn.datasets import fetch_20newsgroups\n", + "from sklearn.manifold import TSNE\n", + "from pycaret.anomaly import *\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "embeding_df=pd.read_csv('/mnt/c/Users/selin_uzturk/Desktop/sinkaf/encoded.csv')\n", + "embeding_df=embeding_df.drop(['Unnamed: 0'], axis=1)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
0123456789...5657585960616263labelstweet
0101101101757865318925285159764084027631623...000000000en güzel uyuyan insan ödülü jeon jungkook'a g...
11011158910706107131079494698306682488311723763...000000000Mekanı cennet olsun, saygılar sayın avukatımı...
21011483047110774137851377933642143994827176686...000000000Kızlar aranızda kas yığını beylere düşenler ol...
3101193191672410118101077832312407389592293410147...000000000Biraz ders çalışayım. Tembellik ve uyku düşman...
41013093258706580544490710224106583102881252413878...000000000Trezeguet yerine El Sharawy daha iyi olmaz mı
..................................................................
43344101200651016111511510378410774213881024592067...000000001Hil**adamlar kesinlikle kelimeleri anlamıyorla...
4334510113980839241091340618985162851016311062276...000000001Böyle piçlerin çok erken ölmemelerini ve çok f...
43346101105549102635101402694311499110516218991186110561...000000001Turgay denilen bu holigonda bir sorun yok, gur...
433471018142426398920171096201094176010101151983026083...000000001Umarım ülkenin düşük zekadan kurtulması ilgile...
433481013977411127459892459611933170171451071039125...000000001CHP sandıkları bırakmaz, üzerine oturur, bir c...
\n", + "

43349 rows × 66 columns

\n", + "
" + ], + "text/plain": [ + " 0 1 2 3 4 5 6 7 8 \\\n", + "0 101 10110 175 78653 189 25285 15976 40840 276 \n", + "1 101 11589 10706 10713 10794 94698 30668 24883 117 \n", + "2 101 148 30471 10774 13785 13779 33642 14399 48271 \n", + "3 101 19319 16724 10118 10107 78323 12407 38959 22934 \n", + "4 101 30932 58706 58054 44907 10224 106583 10288 12524 \n", + "... ... ... ... ... ... ... ... ... ... \n", + "43344 101 20065 10161 115 115 103784 10774 21388 10245 \n", + "43345 101 139 80839 24109 13406 18985 16285 10163 11062 \n", + "43346 101 105549 102635 10140 26943 11499 110516 21899 11861 \n", + "43347 101 81424 26398 92017 109620 10941 76010 10115 19830 \n", + "43348 101 39774 11127 45989 24596 11933 170 17145 10710 \n", + "\n", + " 9 ... 56 57 58 59 60 61 62 63 labels \\\n", + "0 31623 ... 0 0 0 0 0 0 0 0 0 \n", + "1 23763 ... 0 0 0 0 0 0 0 0 0 \n", + "2 76686 ... 0 0 0 0 0 0 0 0 0 \n", + "3 10147 ... 0 0 0 0 0 0 0 0 0 \n", + "4 13878 ... 0 0 0 0 0 0 0 0 0 \n", + "... ... ... .. .. .. .. .. .. .. .. ... \n", + "43344 92067 ... 0 0 0 0 0 0 0 0 1 \n", + "43345 276 ... 0 0 0 0 0 0 0 0 1 \n", + "43346 10561 ... 0 0 0 0 0 0 0 0 1 \n", + "43347 26083 ... 0 0 0 0 0 0 0 0 1 \n", + "43348 39125 ... 0 0 0 0 0 0 0 0 1 \n", + "\n", + " tweet \n", + "0 en güzel uyuyan insan ödülü jeon jungkook'a g... \n", + "1 Mekanı cennet olsun, saygılar sayın avukatımı... \n", + "2 Kızlar aranızda kas yığını beylere düşenler ol... \n", + "3 Biraz ders çalışayım. Tembellik ve uyku düşman... \n", + "4 Trezeguet yerine El Sharawy daha iyi olmaz mı \n", + "... ... \n", + "43344 Hil**adamlar kesinlikle kelimeleri anlamıyorla... \n", + "43345 Böyle piçlerin çok erken ölmemelerini ve çok f... \n", + "43346 Turgay denilen bu holigonda bir sorun yok, gur... \n", + "43347 Umarım ülkenin düşük zekadan kurtulması ilgile... \n", + "43348 CHP sandıkları bırakmaz, üzerine oturur, bir c... 
\n", + "\n", + "[43349 rows x 66 columns]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "embeding_df" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
 DescriptionValue
0Session id5272
1Original data shape(43349, 66)
2Transformed data shape(43349, 65)
3Ignore features1
4Numeric features65
5PreprocessTrue
6Imputation typesimple
7Numeric imputationmean
8Categorical imputationmode
9CPU Jobs-1
10Use GPUFalse
11Log ExperimentFalse
12Experiment Nameanomaly-default-name
13USIca74
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "ano1= setup(embeding_df,ignore_features=['tweet'])" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
NameReference
ID
abodAngle-base Outlier Detectionpyod.models.abod.ABOD
clusterClustering-Based Local Outlierpycaret.internal.patches.pyod.CBLOFForceToDouble
cofConnectivity-Based Local Outlierpyod.models.cof.COF
iforestIsolation Forestpyod.models.iforest.IForest
histogramHistogram-based Outlier Detectionpyod.models.hbos.HBOS
knnK-Nearest Neighbors Detectorpyod.models.knn.KNN
lofLocal Outlier Factorpyod.models.lof.LOF
svmOne-class SVM detectorpyod.models.ocsvm.OCSVM
pcaPrincipal Component Analysispyod.models.pca.PCA
mcdMinimum Covariance Determinantpyod.models.mcd.MCD
sodSubspace Outlier Detectionpyod.models.sod.SOD
sosStochastic Outlier Selectionpyod.models.sos.SOS
\n", + "
" + ], + "text/plain": [ + " Name \\\n", + "ID \n", + "abod Angle-base Outlier Detection \n", + "cluster Clustering-Based Local Outlier \n", + "cof Connectivity-Based Local Outlier \n", + "iforest Isolation Forest \n", + "histogram Histogram-based Outlier Detection \n", + "knn K-Nearest Neighbors Detector \n", + "lof Local Outlier Factor \n", + "svm One-class SVM detector \n", + "pca Principal Component Analysis \n", + "mcd Minimum Covariance Determinant \n", + "sod Subspace Outlier Detection \n", + "sos Stochastic Outlier Selection \n", + "\n", + " Reference \n", + "ID \n", + "abod pyod.models.abod.ABOD \n", + "cluster pycaret.internal.patches.pyod.CBLOFForceToDouble \n", + "cof pyod.models.cof.COF \n", + "iforest pyod.models.iforest.IForest \n", + "histogram pyod.models.hbos.HBOS \n", + "knn pyod.models.knn.KNN \n", + "lof pyod.models.lof.LOF \n", + "svm pyod.models.ocsvm.OCSVM \n", + "pca pyod.models.pca.PCA \n", + "mcd pyod.models.mcd.MCD \n", + "sod pyod.models.sod.SOD \n", + "sos pyod.models.sos.SOS " + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "models()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# iforest" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "# iforest = create_model('iforest')\n", + "# iforest_anomalies = assign_model(iforest)\n", + "# # iso_df=embeding_df.drop(['tweet'], axis=1)\n", + "# iforest_pred = predict_model(iforest, data=iso_df)\n", + "# iforest_pred" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "# iforest_pred['Anomaly'].value_counts()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# knn\n" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [], + "text/plain": [ + "" + 
] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
0123456789...57585960616263labelsAnomalyAnomaly_Score
0101.010110.0175.078653.0189.025285.015976.040840.0276.031623.0...0.00.00.00.00.00.00.00.0070971.171936
1101.011589.010706.010713.010794.094698.030668.024883.0117.023763.0...0.00.00.00.00.00.00.00.0077147.550363
2101.0148.030471.010774.013785.013779.033642.014399.048271.076686.0...0.00.00.00.00.00.00.00.00118676.465801
3101.019319.016724.010118.010107.078323.012407.038959.022934.010147.0...0.00.00.00.00.00.00.00.0094310.765409
4101.030932.058706.058054.044907.010224.0106583.010288.012524.013878.0...0.00.00.00.00.00.00.00.0063569.489655
..................................................................
43344101.020065.010161.0115.0115.0103784.010774.021388.010245.092067.0...0.00.00.00.00.00.00.01.00183310.474995
43345101.0139.080839.024109.013406.018985.016285.010163.011062.0276.0...0.00.00.00.00.00.00.01.00140717.435036
43346101.0105549.0102635.010140.026943.011499.0110516.021899.011861.010561.0...0.00.00.00.00.00.00.01.0098954.428628
43347101.081424.026398.092017.0109620.010941.076010.010115.019830.026083.0...0.00.00.00.00.00.00.01.0065424.117159
43348101.039774.011127.045989.024596.011933.0170.017145.010710.039125.0...0.00.00.00.00.00.00.01.00182332.274049
\n", + "

43349 rows × 67 columns

\n", + "
" + ], + "text/plain": [ + " 0 1 2 3 4 5 6 \\\n", + "0 101.0 10110.0 175.0 78653.0 189.0 25285.0 15976.0 \n", + "1 101.0 11589.0 10706.0 10713.0 10794.0 94698.0 30668.0 \n", + "2 101.0 148.0 30471.0 10774.0 13785.0 13779.0 33642.0 \n", + "3 101.0 19319.0 16724.0 10118.0 10107.0 78323.0 12407.0 \n", + "4 101.0 30932.0 58706.0 58054.0 44907.0 10224.0 106583.0 \n", + "... ... ... ... ... ... ... ... \n", + "43344 101.0 20065.0 10161.0 115.0 115.0 103784.0 10774.0 \n", + "43345 101.0 139.0 80839.0 24109.0 13406.0 18985.0 16285.0 \n", + "43346 101.0 105549.0 102635.0 10140.0 26943.0 11499.0 110516.0 \n", + "43347 101.0 81424.0 26398.0 92017.0 109620.0 10941.0 76010.0 \n", + "43348 101.0 39774.0 11127.0 45989.0 24596.0 11933.0 170.0 \n", + "\n", + " 7 8 9 ... 57 58 59 60 61 62 63 \\\n", + "0 40840.0 276.0 31623.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "1 24883.0 117.0 23763.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "2 14399.0 48271.0 76686.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "3 38959.0 22934.0 10147.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "4 10288.0 12524.0 13878.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "... ... ... ... ... ... ... ... ... ... ... ... \n", + "43344 21388.0 10245.0 92067.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "43345 10163.0 11062.0 276.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "43346 21899.0 11861.0 10561.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "43347 10115.0 19830.0 26083.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "43348 17145.0 10710.0 39125.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "\n", + " labels Anomaly Anomaly_Score \n", + "0 0.0 0 70971.171936 \n", + "1 0.0 0 77147.550363 \n", + "2 0.0 0 118676.465801 \n", + "3 0.0 0 94310.765409 \n", + "4 0.0 0 63569.489655 \n", + "... ... ... ... 
\n", + "43344 1.0 0 183310.474995 \n", + "43345 1.0 0 140717.435036 \n", + "43346 1.0 0 98954.428628 \n", + "43347 1.0 0 65424.117159 \n", + "43348 1.0 0 182332.274049 \n", + "\n", + "[43349 rows x 67 columns]" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "knn = create_model('knn')\n", + "knn_anomalies = assign_model(knn)\n", + "knn_df=embeding_df.drop(['tweet'], axis=1)\n", + "knn_pred = predict_model(knn, data=knn_df)\n", + "knn_pred" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 41376\n", + "1 1973\n", + "Name: Anomaly, dtype: int64" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "knn_pred['Anomaly'].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
0123456789...57585960616263labelsAnomalyAnomaly_Score
0101.010110.0175.078653.0189.025285.015976.040840.0276.031623.0...0.00.00.00.00.00.00.00.0070971.171936
1101.011589.010706.010713.010794.094698.030668.024883.0117.023763.0...0.00.00.00.00.00.00.00.0077147.550363
2101.0148.030471.010774.013785.013779.033642.014399.048271.076686.0...0.00.00.00.00.00.00.00.00118676.465801
3101.019319.016724.010118.010107.078323.012407.038959.022934.010147.0...0.00.00.00.00.00.00.00.0094310.765409
4101.030932.058706.058054.044907.010224.0106583.010288.012524.013878.0...0.00.00.00.00.00.00.00.0063569.489655
..................................................................
43344101.020065.010161.0115.0115.0103784.010774.021388.010245.092067.0...0.00.00.00.00.00.00.01.00183310.474995
43345101.0139.080839.024109.013406.018985.016285.010163.011062.0276.0...0.00.00.00.00.00.00.01.00140717.435036
43346101.0105549.0102635.010140.026943.011499.0110516.021899.011861.010561.0...0.00.00.00.00.00.00.01.0098954.428628
43347101.081424.026398.092017.0109620.010941.076010.010115.019830.026083.0...0.00.00.00.00.00.00.01.0065424.117159
43348101.039774.011127.045989.024596.011933.0170.017145.010710.039125.0...0.00.00.00.00.00.00.01.00182332.274049
\n", + "

43349 rows × 67 columns

\n", + "
" + ], + "text/plain": [ + " 0 1 2 3 4 5 6 \\\n", + "0 101.0 10110.0 175.0 78653.0 189.0 25285.0 15976.0 \n", + "1 101.0 11589.0 10706.0 10713.0 10794.0 94698.0 30668.0 \n", + "2 101.0 148.0 30471.0 10774.0 13785.0 13779.0 33642.0 \n", + "3 101.0 19319.0 16724.0 10118.0 10107.0 78323.0 12407.0 \n", + "4 101.0 30932.0 58706.0 58054.0 44907.0 10224.0 106583.0 \n", + "... ... ... ... ... ... ... ... \n", + "43344 101.0 20065.0 10161.0 115.0 115.0 103784.0 10774.0 \n", + "43345 101.0 139.0 80839.0 24109.0 13406.0 18985.0 16285.0 \n", + "43346 101.0 105549.0 102635.0 10140.0 26943.0 11499.0 110516.0 \n", + "43347 101.0 81424.0 26398.0 92017.0 109620.0 10941.0 76010.0 \n", + "43348 101.0 39774.0 11127.0 45989.0 24596.0 11933.0 170.0 \n", + "\n", + " 7 8 9 ... 57 58 59 60 61 62 63 \\\n", + "0 40840.0 276.0 31623.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "1 24883.0 117.0 23763.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "2 14399.0 48271.0 76686.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "3 38959.0 22934.0 10147.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "4 10288.0 12524.0 13878.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "... ... ... ... ... ... ... ... ... ... ... ... \n", + "43344 21388.0 10245.0 92067.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "43345 10163.0 11062.0 276.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "43346 21899.0 11861.0 10561.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "43347 10115.0 19830.0 26083.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "43348 17145.0 10710.0 39125.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "\n", + " labels Anomaly Anomaly_Score \n", + "0 0.0 0 70971.171936 \n", + "1 0.0 0 77147.550363 \n", + "2 0.0 0 118676.465801 \n", + "3 0.0 0 94310.765409 \n", + "4 0.0 0 63569.489655 \n", + "... ... ... ... 
\n", + "43344 1.0 0 183310.474995 \n", + "43345 1.0 0 140717.435036 \n", + "43346 1.0 0 98954.428628 \n", + "43347 1.0 0 65424.117159 \n", + "43348 1.0 0 182332.274049 \n", + "\n", + "[43349 rows x 67 columns]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "knn_pred" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 41376\n", + "1 1973\n", + "Name: Anomaly, dtype: int64" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "knn_pred['Anomaly'].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
0123456789...5657585960616263labelstweet
0101101101757865318925285159764084027631623...000000000en güzel uyuyan insan ödülü jeon jungkook'a g...
11011158910706107131079494698306682488311723763...000000000Mekanı cennet olsun, saygılar sayın avukatımı...
21011483047110774137851377933642143994827176686...000000000Kızlar aranızda kas yığını beylere düşenler ol...
3101193191672410118101077832312407389592293410147...000000000Biraz ders çalışayım. Tembellik ve uyku düşman...
41013093258706580544490710224106583102881252413878...000000000Trezeguet yerine El Sharawy daha iyi olmaz mı
..................................................................
43344101200651016111511510378410774213881024592067...000000001Hil**adamlar kesinlikle kelimeleri anlamıyorla...
4334510113980839241091340618985162851016311062276...000000001Böyle piçlerin çok erken ölmemelerini ve çok f...
43346101105549102635101402694311499110516218991186110561...000000001Turgay denilen bu holigonda bir sorun yok, gur...
433471018142426398920171096201094176010101151983026083...000000001Umarım ülkenin düşük zekadan kurtulması ilgile...
433481013977411127459892459611933170171451071039125...000000001CHP sandıkları bırakmaz, üzerine oturur, bir c...
\n", + "

43349 rows × 66 columns

\n", + "
" + ], + "text/plain": [ + " 0 1 2 3 4 5 6 7 8 \\\n", + "0 101 10110 175 78653 189 25285 15976 40840 276 \n", + "1 101 11589 10706 10713 10794 94698 30668 24883 117 \n", + "2 101 148 30471 10774 13785 13779 33642 14399 48271 \n", + "3 101 19319 16724 10118 10107 78323 12407 38959 22934 \n", + "4 101 30932 58706 58054 44907 10224 106583 10288 12524 \n", + "... ... ... ... ... ... ... ... ... ... \n", + "43344 101 20065 10161 115 115 103784 10774 21388 10245 \n", + "43345 101 139 80839 24109 13406 18985 16285 10163 11062 \n", + "43346 101 105549 102635 10140 26943 11499 110516 21899 11861 \n", + "43347 101 81424 26398 92017 109620 10941 76010 10115 19830 \n", + "43348 101 39774 11127 45989 24596 11933 170 17145 10710 \n", + "\n", + " 9 ... 56 57 58 59 60 61 62 63 labels \\\n", + "0 31623 ... 0 0 0 0 0 0 0 0 0 \n", + "1 23763 ... 0 0 0 0 0 0 0 0 0 \n", + "2 76686 ... 0 0 0 0 0 0 0 0 0 \n", + "3 10147 ... 0 0 0 0 0 0 0 0 0 \n", + "4 13878 ... 0 0 0 0 0 0 0 0 0 \n", + "... ... ... .. .. .. .. .. .. .. .. ... \n", + "43344 92067 ... 0 0 0 0 0 0 0 0 1 \n", + "43345 276 ... 0 0 0 0 0 0 0 0 1 \n", + "43346 10561 ... 0 0 0 0 0 0 0 0 1 \n", + "43347 26083 ... 0 0 0 0 0 0 0 0 1 \n", + "43348 39125 ... 0 0 0 0 0 0 0 0 1 \n", + "\n", + " tweet \n", + "0 en güzel uyuyan insan ödülü jeon jungkook'a g... \n", + "1 Mekanı cennet olsun, saygılar sayın avukatımı... \n", + "2 Kızlar aranızda kas yığını beylere düşenler ol... \n", + "3 Biraz ders çalışayım. Tembellik ve uyku düşman... \n", + "4 Trezeguet yerine El Sharawy daha iyi olmaz mı \n", + "... ... \n", + "43344 Hil**adamlar kesinlikle kelimeleri anlamıyorla... \n", + "43345 Böyle piçlerin çok erken ölmemelerini ve çok f... \n", + "43346 Turgay denilen bu holigonda bir sorun yok, gur... \n", + "43347 Umarım ülkenin düşük zekadan kurtulması ilgile... \n", + "43348 CHP sandıkları bırakmaz, üzerine oturur, bir c... 
\n", + "\n", + "[43349 rows x 66 columns]" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "embeding_df" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "embeding_df.drop(knn_pred.loc[knn_pred['Anomaly']==1 ].index, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
0123456789...5657585960616263labelstweet
0101101101757865318925285159764084027631623...000000000en güzel uyuyan insan ödülü jeon jungkook'a g...
11011158910706107131079494698306682488311723763...000000000Mekanı cennet olsun, saygılar sayın avukatımı...
21011483047110774137851377933642143994827176686...000000000Kızlar aranızda kas yığını beylere düşenler ol...
3101193191672410118101077832312407389592293410147...000000000Biraz ders çalışayım. Tembellik ve uyku düşman...
41013093258706580544490710224106583102881252413878...000000000Trezeguet yerine El Sharawy daha iyi olmaz mı
..................................................................
43344101200651016111511510378410774213881024592067...000000001Hil**adamlar kesinlikle kelimeleri anlamıyorla...
4334510113980839241091340618985162851016311062276...000000001Böyle piçlerin çok erken ölmemelerini ve çok f...
43346101105549102635101402694311499110516218991186110561...000000001Turgay denilen bu holigonda bir sorun yok, gur...
433471018142426398920171096201094176010101151983026083...000000001Umarım ülkenin düşük zekadan kurtulması ilgile...
433481013977411127459892459611933170171451071039125...000000001CHP sandıkları bırakmaz, üzerine oturur, bir c...
\n", + "

41376 rows × 66 columns

\n", + "
" + ], + "text/plain": [ + " 0 1 2 3 4 5 6 7 8 \\\n", + "0 101 10110 175 78653 189 25285 15976 40840 276 \n", + "1 101 11589 10706 10713 10794 94698 30668 24883 117 \n", + "2 101 148 30471 10774 13785 13779 33642 14399 48271 \n", + "3 101 19319 16724 10118 10107 78323 12407 38959 22934 \n", + "4 101 30932 58706 58054 44907 10224 106583 10288 12524 \n", + "... ... ... ... ... ... ... ... ... ... \n", + "43344 101 20065 10161 115 115 103784 10774 21388 10245 \n", + "43345 101 139 80839 24109 13406 18985 16285 10163 11062 \n", + "43346 101 105549 102635 10140 26943 11499 110516 21899 11861 \n", + "43347 101 81424 26398 92017 109620 10941 76010 10115 19830 \n", + "43348 101 39774 11127 45989 24596 11933 170 17145 10710 \n", + "\n", + " 9 ... 56 57 58 59 60 61 62 63 labels \\\n", + "0 31623 ... 0 0 0 0 0 0 0 0 0 \n", + "1 23763 ... 0 0 0 0 0 0 0 0 0 \n", + "2 76686 ... 0 0 0 0 0 0 0 0 0 \n", + "3 10147 ... 0 0 0 0 0 0 0 0 0 \n", + "4 13878 ... 0 0 0 0 0 0 0 0 0 \n", + "... ... ... .. .. .. .. .. .. .. .. ... \n", + "43344 92067 ... 0 0 0 0 0 0 0 0 1 \n", + "43345 276 ... 0 0 0 0 0 0 0 0 1 \n", + "43346 10561 ... 0 0 0 0 0 0 0 0 1 \n", + "43347 26083 ... 0 0 0 0 0 0 0 0 1 \n", + "43348 39125 ... 0 0 0 0 0 0 0 0 1 \n", + "\n", + " tweet \n", + "0 en güzel uyuyan insan ödülü jeon jungkook'a g... \n", + "1 Mekanı cennet olsun, saygılar sayın avukatımı... \n", + "2 Kızlar aranızda kas yığını beylere düşenler ol... \n", + "3 Biraz ders çalışayım. Tembellik ve uyku düşman... \n", + "4 Trezeguet yerine El Sharawy daha iyi olmaz mı \n", + "... ... \n", + "43344 Hil**adamlar kesinlikle kelimeleri anlamıyorla... \n", + "43345 Böyle piçlerin çok erken ölmemelerini ve çok f... \n", + "43346 Turgay denilen bu holigonda bir sorun yok, gur... \n", + "43347 Umarım ülkenin düşük zekadan kurtulması ilgile... \n", + "43348 CHP sandıkları bırakmaz, üzerine oturur, bir c... 
\n", + "\n", + "[41376 rows x 66 columns]" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "embeding_df" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "# Keep only the tweet text and its label; the label column is exported as 'subtas_a'.\n", + "df = embeding_df[['tweet', 'labels']].rename(columns={'labels': 'subtas_a'})" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "# The row index is written as the first CSV column (to_csv default).\n", + "df.to_csv('knn_outliers.csv') " + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# pca" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
0123456789...57585960616263labelsAnomalyAnomaly_Score
0101.010110.0175.078653.0189.025285.015976.040840.0276.031623.0...0.00.00.00.00.00.00.00.001.354399e+32
1101.011589.010706.010713.010794.094698.030668.024883.0117.023763.0...0.00.00.00.00.00.00.00.001.311723e+32
2101.0148.030471.010774.013785.013779.033642.014399.048271.076686.0...0.00.00.00.00.00.00.00.001.597792e+32
3101.019319.016724.010118.010107.078323.012407.038959.022934.010147.0...0.00.00.00.00.00.00.00.001.551488e+32
4101.030932.058706.058054.044907.010224.0106583.010288.012524.013878.0...0.00.00.00.00.00.00.00.001.348867e+32
..................................................................
43344101.020065.010161.0115.0115.0103784.010774.021388.010245.092067.0...0.00.00.00.00.00.00.01.002.346619e+32
43345101.0139.080839.024109.013406.018985.016285.010163.011062.0276.0...0.00.00.00.00.00.00.01.001.778253e+32
43346101.0105549.0102635.010140.026943.011499.0110516.021899.011861.010561.0...0.00.00.00.00.00.00.01.001.762300e+32
43347101.081424.026398.092017.0109620.010941.076010.010115.019830.026083.0...0.00.00.00.00.00.00.01.001.564075e+32
43348101.039774.011127.045989.024596.011933.0170.017145.010710.039125.0...0.00.00.00.00.00.00.01.002.685411e+32
\n", + "

43349 rows × 67 columns

\n", + "
" + ], + "text/plain": [ + " 0 1 2 3 4 5 6 \\\n", + "0 101.0 10110.0 175.0 78653.0 189.0 25285.0 15976.0 \n", + "1 101.0 11589.0 10706.0 10713.0 10794.0 94698.0 30668.0 \n", + "2 101.0 148.0 30471.0 10774.0 13785.0 13779.0 33642.0 \n", + "3 101.0 19319.0 16724.0 10118.0 10107.0 78323.0 12407.0 \n", + "4 101.0 30932.0 58706.0 58054.0 44907.0 10224.0 106583.0 \n", + "... ... ... ... ... ... ... ... \n", + "43344 101.0 20065.0 10161.0 115.0 115.0 103784.0 10774.0 \n", + "43345 101.0 139.0 80839.0 24109.0 13406.0 18985.0 16285.0 \n", + "43346 101.0 105549.0 102635.0 10140.0 26943.0 11499.0 110516.0 \n", + "43347 101.0 81424.0 26398.0 92017.0 109620.0 10941.0 76010.0 \n", + "43348 101.0 39774.0 11127.0 45989.0 24596.0 11933.0 170.0 \n", + "\n", + " 7 8 9 ... 57 58 59 60 61 62 63 \\\n", + "0 40840.0 276.0 31623.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "1 24883.0 117.0 23763.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "2 14399.0 48271.0 76686.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "3 38959.0 22934.0 10147.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "4 10288.0 12524.0 13878.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "... ... ... ... ... ... ... ... ... ... ... ... \n", + "43344 21388.0 10245.0 92067.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "43345 10163.0 11062.0 276.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "43346 21899.0 11861.0 10561.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "43347 10115.0 19830.0 26083.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "43348 17145.0 10710.0 39125.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "\n", + " labels Anomaly Anomaly_Score \n", + "0 0.0 0 1.354399e+32 \n", + "1 0.0 0 1.311723e+32 \n", + "2 0.0 0 1.597792e+32 \n", + "3 0.0 0 1.551488e+32 \n", + "4 0.0 0 1.348867e+32 \n", + "... ... ... ... 
\n", + "43344 1.0 0 2.346619e+32 \n", + "43345 1.0 0 1.778253e+32 \n", + "43346 1.0 0 1.762300e+32 \n", + "43347 1.0 0 1.564075e+32 \n", + "43348 1.0 0 2.685411e+32 \n", + "\n", + "[43349 rows x 67 columns]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pca = create_model('pca')\n", + "pca_anomalies = assign_model(pca)\n", + "pca_df=embeding_df.drop(['tweet'], axis=1)\n", + "pca_pred = predict_model(pca, data=pca_df)\n", + "pca_pred" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 41181\n", + "1 2168\n", + "Name: Anomaly, dtype: int64" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pca_pred['Anomaly'].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "# Filter into a new frame instead of dropping in place, so the detectors in\n", + "# the following sections still score every row of embeding_df.\n", + "pca_outlier_idx = pca_pred.loc[pca_pred['Anomaly'] == 1].index\n", + "pca_clean = embeding_df.drop(index=pca_outlier_idx)\n", + "# Export only the tweet text and its label (written as 'subtas_a').\n", + "df=pd.DataFrame()\n", + "df['tweet']=pca_clean['tweet']\n", + "df['subtas_a']=pca_clean['labels']\n", + "df.to_csv('pca_outliers.csv') " + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# abod" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
0123456789...57585960616263labelsAnomalyAnomaly_Score
0101.010110.0175.078653.0189.025285.015976.040840.0276.031623.0...0.00.00.00.00.00.00.00.00-7.719921e-22
1101.011589.010706.010713.010794.094698.030668.024883.0117.023763.0...0.00.00.00.00.00.00.00.00-4.030618e-21
2101.0148.030471.010774.013785.013779.033642.014399.048271.076686.0...0.00.00.00.00.00.00.00.00-3.558939e-22
3101.019319.016724.010118.010107.078323.012407.038959.022934.010147.0...0.00.00.00.00.00.00.00.00-2.895136e-22
4101.030932.058706.058054.044907.010224.0106583.010288.012524.013878.0...0.00.00.00.00.00.00.00.00-4.832515e-21
..................................................................
43344101.020065.010161.0115.0115.0103784.010774.021388.010245.092067.0...0.00.00.00.00.00.00.01.00-7.984637e-25
43345101.0139.080839.024109.013406.018985.016285.010163.011062.0276.0...0.00.00.00.00.00.00.01.00-1.059387e-22
43346101.0105549.0102635.010140.026943.011499.0110516.021899.011861.010561.0...0.00.00.00.00.00.00.01.00-3.592603e-22
43347101.081424.026398.092017.0109620.010941.076010.010115.019830.026083.0...0.00.00.00.00.00.00.01.00-2.226002e-21
43348101.039774.011127.045989.024596.011933.0170.017145.010710.039125.0...0.00.00.00.00.00.00.01.00-2.864757e-23
\n", + "

43349 rows × 67 columns

\n", + "
" + ], + "text/plain": [ + " 0 1 2 3 4 5 6 \\\n", + "0 101.0 10110.0 175.0 78653.0 189.0 25285.0 15976.0 \n", + "1 101.0 11589.0 10706.0 10713.0 10794.0 94698.0 30668.0 \n", + "2 101.0 148.0 30471.0 10774.0 13785.0 13779.0 33642.0 \n", + "3 101.0 19319.0 16724.0 10118.0 10107.0 78323.0 12407.0 \n", + "4 101.0 30932.0 58706.0 58054.0 44907.0 10224.0 106583.0 \n", + "... ... ... ... ... ... ... ... \n", + "43344 101.0 20065.0 10161.0 115.0 115.0 103784.0 10774.0 \n", + "43345 101.0 139.0 80839.0 24109.0 13406.0 18985.0 16285.0 \n", + "43346 101.0 105549.0 102635.0 10140.0 26943.0 11499.0 110516.0 \n", + "43347 101.0 81424.0 26398.0 92017.0 109620.0 10941.0 76010.0 \n", + "43348 101.0 39774.0 11127.0 45989.0 24596.0 11933.0 170.0 \n", + "\n", + " 7 8 9 ... 57 58 59 60 61 62 63 \\\n", + "0 40840.0 276.0 31623.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "1 24883.0 117.0 23763.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "2 14399.0 48271.0 76686.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "3 38959.0 22934.0 10147.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "4 10288.0 12524.0 13878.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "... ... ... ... ... ... ... ... ... ... ... ... \n", + "43344 21388.0 10245.0 92067.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "43345 10163.0 11062.0 276.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "43346 21899.0 11861.0 10561.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "43347 10115.0 19830.0 26083.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "43348 17145.0 10710.0 39125.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "\n", + " labels Anomaly Anomaly_Score \n", + "0 0.0 0 -7.719921e-22 \n", + "1 0.0 0 -4.030618e-21 \n", + "2 0.0 0 -3.558939e-22 \n", + "3 0.0 0 -2.895136e-22 \n", + "4 0.0 0 -4.832515e-21 \n", + "... ... ... ... 
\n", + "43344 1.0 0 -7.984637e-25 \n", + "43345 1.0 0 -1.059387e-22 \n", + "43346 1.0 0 -3.592603e-22 \n", + "43347 1.0 0 -2.226002e-21 \n", + "43348 1.0 0 -2.864757e-23 \n", + "\n", + "[43349 rows x 67 columns]" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Train the 'abod' detector, then score the frame with the raw tweet text removed.\n", + "abod = create_model('abod')\n", + "abod_anomalies = assign_model(abod)\n", + "abod_df = embeding_df.drop(columns=['tweet'])\n", + "abod_pred = predict_model(abod, data=abod_df)\n", + "abod_pred" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 43349\n", + "Name: Anomaly, dtype: int64" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# NOTE(review): the recorded run labelled every row 0 (no anomalies flagged) - confirm this is expected.\n", + "abod_pred['Anomaly'].value_counts()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# cluster" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Initiated. . . . . . . . . . . . . . . . . .16:19:48
Status. . . . . . . . . . . . . . . . . .Fitting 0.05 Fraction
Estimator. . . . . . . . . . . . . . . . . .Clustering-Based Local Outlier
\n", + "
" + ], + "text/plain": [ + " \n", + " \n", + "Initiated . . . . . . . . . . . . . . . . . . 16:19:48\n", + "Status . . . . . . . . . . . . . . . . . . Fitting 0.05 Fraction\n", + "Estimator . . . . . . . . . . . . . . . . . . Clustering-Based Local Outlier" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
0123456789...57585960616263labelsAnomalyAnomaly_Score
0101.010110.0175.078653.0189.025285.015976.040840.0276.031623.0...0.00.00.00.00.00.00.00.00123828.159076
1101.011589.010706.010713.010794.094698.030668.024883.0117.023763.0...0.00.00.00.00.00.00.00.00112972.396566
2101.0148.030471.010774.013785.013779.033642.014399.048271.076686.0...0.00.00.00.00.00.00.00.00145701.165368
3101.019319.016724.010118.010107.078323.012407.038959.022934.010147.0...0.00.00.00.00.00.00.00.00141686.216880
4101.030932.058706.058054.044907.010224.0106583.010288.012524.013878.0...0.00.00.00.00.00.00.00.00101399.757887
..................................................................
43344101.020065.010161.0115.0115.0103784.010774.021388.010245.092067.0...0.00.00.00.00.00.00.01.00193403.127721
43345101.0139.080839.024109.013406.018985.016285.010163.011062.0276.0...0.00.00.00.00.00.00.01.00154821.530684
43346101.0105549.0102635.010140.026943.011499.0110516.021899.011861.010561.0...0.00.00.00.00.00.00.01.00166024.182457
43347101.081424.026398.092017.0109620.010941.076010.010115.019830.026083.0...0.00.00.00.00.00.00.01.00130852.856537
43348101.039774.011127.045989.024596.011933.0170.017145.010710.039125.0...0.00.00.00.00.00.00.01.00206109.572124
\n", + "

43349 rows × 67 columns

\n", + "
" + ], + "text/plain": [ + " 0 1 2 3 4 5 6 \\\n", + "0 101.0 10110.0 175.0 78653.0 189.0 25285.0 15976.0 \n", + "1 101.0 11589.0 10706.0 10713.0 10794.0 94698.0 30668.0 \n", + "2 101.0 148.0 30471.0 10774.0 13785.0 13779.0 33642.0 \n", + "3 101.0 19319.0 16724.0 10118.0 10107.0 78323.0 12407.0 \n", + "4 101.0 30932.0 58706.0 58054.0 44907.0 10224.0 106583.0 \n", + "... ... ... ... ... ... ... ... \n", + "43344 101.0 20065.0 10161.0 115.0 115.0 103784.0 10774.0 \n", + "43345 101.0 139.0 80839.0 24109.0 13406.0 18985.0 16285.0 \n", + "43346 101.0 105549.0 102635.0 10140.0 26943.0 11499.0 110516.0 \n", + "43347 101.0 81424.0 26398.0 92017.0 109620.0 10941.0 76010.0 \n", + "43348 101.0 39774.0 11127.0 45989.0 24596.0 11933.0 170.0 \n", + "\n", + " 7 8 9 ... 57 58 59 60 61 62 63 \\\n", + "0 40840.0 276.0 31623.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "1 24883.0 117.0 23763.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "2 14399.0 48271.0 76686.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "3 38959.0 22934.0 10147.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "4 10288.0 12524.0 13878.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "... ... ... ... ... ... ... ... ... ... ... ... \n", + "43344 21388.0 10245.0 92067.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "43345 10163.0 11062.0 276.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "43346 21899.0 11861.0 10561.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "43347 10115.0 19830.0 26083.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "43348 17145.0 10710.0 39125.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "\n", + " labels Anomaly Anomaly_Score \n", + "0 0.0 0 123828.159076 \n", + "1 0.0 0 112972.396566 \n", + "2 0.0 0 145701.165368 \n", + "3 0.0 0 141686.216880 \n", + "4 0.0 0 101399.757887 \n", + "... ... ... ... 
\n", + "43344 1.0 0 193403.127721 \n", + "43345 1.0 0 154821.530684 \n", + "43346 1.0 0 166024.182457 \n", + "43347 1.0 0 130852.856537 \n", + "43348 1.0 0 206109.572124 \n", + "\n", + "[43349 rows x 67 columns]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cluster = create_model('cluster')\n", + "cluster_anomalies = assign_model(cluster)\n", + "cluster_df=embeding_df.drop(['tweet'], axis=1)\n", + "cluster_pred = predict_model(cluster, data=cluster_df)\n", + "cluster_pred" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 41182\n", + "1 2167\n", + "Name: Anomaly, dtype: int64" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cluster_pred['Anomaly'].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "# Filter into a new frame instead of dropping in place, so the detectors in\n", + "# the following sections still score every row of embeding_df.\n", + "cluster_outlier_idx = cluster_pred.loc[cluster_pred['Anomaly'] == 1].index\n", + "cluster_clean = embeding_df.drop(index=cluster_outlier_idx)\n", + "# Export only the tweet text and its label (written as 'subtas_a').\n", + "df=pd.DataFrame()\n", + "df['tweet']=cluster_clean['tweet']\n", + "df['subtas_a']=cluster_clean['labels']\n", + "df.to_csv('cluster_outliers.csv') " + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# cof" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cof = create_model('cof')\n", + "cof_anomalies = assign_model(cof)\n", + "cof_df=embeding_df.drop(['tweet'], axis=1)\n", + "cof_pred = predict_model(cof, data=cof_df)\n", + "cof_pred" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cof_pred['Anomaly'].value_counts()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# histogram" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], +
"source": [ + "histogram = create_model('histogram')\n", + "histogram_anomalies = assign_model(histogram)\n", + "histogram_df=embeding_df.drop(['tweet'], axis=1)\n", + "histogram_pred = predict_model(histogram, data=histogram_df)\n", + "histogram_pred" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "histogram_pred['Anomaly'].value_counts()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# lof" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "lof = create_model('lof')\n", + "lof_anomalies = assign_model(lof)\n", + "lof_df=embeding_df.drop(['tweet'], axis=1)\n", + "lof_pred = predict_model(lof, data=lof_df)\n", + "lof_pred" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "lof_pred['Anomaly'].value_counts()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# svm" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# One-class SVM detector (PyCaret model ID 'svm'); uses its own svm_* names\n", + "# so the lof results from the section above are not overwritten.\n", + "svm = create_model('svm')\n", + "svm_anomalies = assign_model(svm)\n", + "svm_df=embeding_df.drop(['tweet'], axis=1)\n", + "svm_pred = predict_model(svm, data=svm_df)\n", + "svm_pred" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "svm_pred['Anomaly'].value_counts()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# mcd" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "mcd = create_model('mcd')\n", + "mcd_anomalies = assign_model(mcd)\n", + "mcd_df=embeding_df.drop(['tweet'], axis=1)\n", + "mcd_pred = predict_model(mcd, data=mcd_df)\n", + "mcd_pred" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {},
+ "outputs": [], + "source": [ + "mcd_pred['Anomaly'].value_counts()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# sod" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Train the 'sod' detector, then score the frame with the raw tweet text removed.\n", + "sod = create_model('sod')\n", + "sod_anomalies = assign_model(sod)\n", + "sod_df = embeding_df.drop(columns=['tweet'])\n", + "sod_pred = predict_model(sod, data=sod_df)\n", + "sod_pred" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Count of rows per Anomaly label.\n", + "sod_pred['Anomaly'].value_counts()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# sos" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Train the 'sos' detector, then score the frame with the raw tweet text removed.\n", + "sos = create_model('sos')\n", + "sos_anomalies = assign_model(sos)\n", + "sos_df = embeding_df.drop(columns=['tweet'])\n", + "sos_pred = predict_model(sos, data=sos_df)\n", + "sos_pred" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Count of rows per Anomaly label.\n", + "sos_pred['Anomaly'].value_counts()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "dl_env", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.0" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +}