diff --git "a/pycaret_outlier_detection.ipynb" "b/pycaret_outlier_detection.ipynb"
new file mode 100644--- /dev/null
+++ "b/pycaret_outlier_detection.ipynb"
@@ -0,0 +1,3893 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import re\n",
+ "import tqdm\n",
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "import matplotlib.pyplot as plt\n",
+ "import seaborn as sns;\n",
+ "\n",
+ "from sklearn.datasets import fetch_20newsgroups\n",
+ "from sklearn.manifold import TSNE\n",
+ "from pycaret.anomaly import *\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Load the encoded tweets and drop the 'Unnamed: 0' column in one chained expression.\n",
+ "# NOTE(review): this is an absolute, machine-specific path — point it at a configurable data directory before sharing.\n",
+ "embeding_df = pd.read_csv(\n",
+ "    '/mnt/c/Users/selin_uzturk/Desktop/sinkaf/encoded.csv'\n",
+ ").drop(columns=['Unnamed: 0'])\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 4 | \n",
+ " 5 | \n",
+ " 6 | \n",
+ " 7 | \n",
+ " 8 | \n",
+ " 9 | \n",
+ " ... | \n",
+ " 56 | \n",
+ " 57 | \n",
+ " 58 | \n",
+ " 59 | \n",
+ " 60 | \n",
+ " 61 | \n",
+ " 62 | \n",
+ " 63 | \n",
+ " labels | \n",
+ " tweet | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 101 | \n",
+ " 10110 | \n",
+ " 175 | \n",
+ " 78653 | \n",
+ " 189 | \n",
+ " 25285 | \n",
+ " 15976 | \n",
+ " 40840 | \n",
+ " 276 | \n",
+ " 31623 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " en güzel uyuyan insan ödülü jeon jungkook'a g... | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 101 | \n",
+ " 11589 | \n",
+ " 10706 | \n",
+ " 10713 | \n",
+ " 10794 | \n",
+ " 94698 | \n",
+ " 30668 | \n",
+ " 24883 | \n",
+ " 117 | \n",
+ " 23763 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " Mekanı cennet olsun, saygılar sayın avukatımı... | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 101 | \n",
+ " 148 | \n",
+ " 30471 | \n",
+ " 10774 | \n",
+ " 13785 | \n",
+ " 13779 | \n",
+ " 33642 | \n",
+ " 14399 | \n",
+ " 48271 | \n",
+ " 76686 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " Kızlar aranızda kas yığını beylere düşenler ol... | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 101 | \n",
+ " 19319 | \n",
+ " 16724 | \n",
+ " 10118 | \n",
+ " 10107 | \n",
+ " 78323 | \n",
+ " 12407 | \n",
+ " 38959 | \n",
+ " 22934 | \n",
+ " 10147 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " Biraz ders çalışayım. Tembellik ve uyku düşman... | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 101 | \n",
+ " 30932 | \n",
+ " 58706 | \n",
+ " 58054 | \n",
+ " 44907 | \n",
+ " 10224 | \n",
+ " 106583 | \n",
+ " 10288 | \n",
+ " 12524 | \n",
+ " 13878 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " Trezeguet yerine El Sharawy daha iyi olmaz mı | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 43344 | \n",
+ " 101 | \n",
+ " 20065 | \n",
+ " 10161 | \n",
+ " 115 | \n",
+ " 115 | \n",
+ " 103784 | \n",
+ " 10774 | \n",
+ " 21388 | \n",
+ " 10245 | \n",
+ " 92067 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " Hil**adamlar kesinlikle kelimeleri anlamıyorla... | \n",
+ "
\n",
+ " \n",
+ " 43345 | \n",
+ " 101 | \n",
+ " 139 | \n",
+ " 80839 | \n",
+ " 24109 | \n",
+ " 13406 | \n",
+ " 18985 | \n",
+ " 16285 | \n",
+ " 10163 | \n",
+ " 11062 | \n",
+ " 276 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " Böyle piçlerin çok erken ölmemelerini ve çok f... | \n",
+ "
\n",
+ " \n",
+ " 43346 | \n",
+ " 101 | \n",
+ " 105549 | \n",
+ " 102635 | \n",
+ " 10140 | \n",
+ " 26943 | \n",
+ " 11499 | \n",
+ " 110516 | \n",
+ " 21899 | \n",
+ " 11861 | \n",
+ " 10561 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " Turgay denilen bu holigonda bir sorun yok, gur... | \n",
+ "
\n",
+ " \n",
+ " 43347 | \n",
+ " 101 | \n",
+ " 81424 | \n",
+ " 26398 | \n",
+ " 92017 | \n",
+ " 109620 | \n",
+ " 10941 | \n",
+ " 76010 | \n",
+ " 10115 | \n",
+ " 19830 | \n",
+ " 26083 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " Umarım ülkenin düşük zekadan kurtulması ilgile... | \n",
+ "
\n",
+ " \n",
+ " 43348 | \n",
+ " 101 | \n",
+ " 39774 | \n",
+ " 11127 | \n",
+ " 45989 | \n",
+ " 24596 | \n",
+ " 11933 | \n",
+ " 170 | \n",
+ " 17145 | \n",
+ " 10710 | \n",
+ " 39125 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " CHP sandıkları bırakmaz, üzerine oturur, bir c... | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
43349 rows × 66 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " 0 1 2 3 4 5 6 7 8 \\\n",
+ "0 101 10110 175 78653 189 25285 15976 40840 276 \n",
+ "1 101 11589 10706 10713 10794 94698 30668 24883 117 \n",
+ "2 101 148 30471 10774 13785 13779 33642 14399 48271 \n",
+ "3 101 19319 16724 10118 10107 78323 12407 38959 22934 \n",
+ "4 101 30932 58706 58054 44907 10224 106583 10288 12524 \n",
+ "... ... ... ... ... ... ... ... ... ... \n",
+ "43344 101 20065 10161 115 115 103784 10774 21388 10245 \n",
+ "43345 101 139 80839 24109 13406 18985 16285 10163 11062 \n",
+ "43346 101 105549 102635 10140 26943 11499 110516 21899 11861 \n",
+ "43347 101 81424 26398 92017 109620 10941 76010 10115 19830 \n",
+ "43348 101 39774 11127 45989 24596 11933 170 17145 10710 \n",
+ "\n",
+ " 9 ... 56 57 58 59 60 61 62 63 labels \\\n",
+ "0 31623 ... 0 0 0 0 0 0 0 0 0 \n",
+ "1 23763 ... 0 0 0 0 0 0 0 0 0 \n",
+ "2 76686 ... 0 0 0 0 0 0 0 0 0 \n",
+ "3 10147 ... 0 0 0 0 0 0 0 0 0 \n",
+ "4 13878 ... 0 0 0 0 0 0 0 0 0 \n",
+ "... ... ... .. .. .. .. .. .. .. .. ... \n",
+ "43344 92067 ... 0 0 0 0 0 0 0 0 1 \n",
+ "43345 276 ... 0 0 0 0 0 0 0 0 1 \n",
+ "43346 10561 ... 0 0 0 0 0 0 0 0 1 \n",
+ "43347 26083 ... 0 0 0 0 0 0 0 0 1 \n",
+ "43348 39125 ... 0 0 0 0 0 0 0 0 1 \n",
+ "\n",
+ " tweet \n",
+ "0 en güzel uyuyan insan ödülü jeon jungkook'a g... \n",
+ "1 Mekanı cennet olsun, saygılar sayın avukatımı... \n",
+ "2 Kızlar aranızda kas yığını beylere düşenler ol... \n",
+ "3 Biraz ders çalışayım. Tembellik ve uyku düşman... \n",
+ "4 Trezeguet yerine El Sharawy daha iyi olmaz mı \n",
+ "... ... \n",
+ "43344 Hil**adamlar kesinlikle kelimeleri anlamıyorla... \n",
+ "43345 Böyle piçlerin çok erken ölmemelerini ve çok f... \n",
+ "43346 Turgay denilen bu holigonda bir sorun yok, gur... \n",
+ "43347 Umarım ülkenin düşük zekadan kurtulması ilgile... \n",
+ "43348 CHP sandıkları bırakmaz, üzerine oturur, bir c... \n",
+ "\n",
+ "[43349 rows x 66 columns]"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "embeding_df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Description | \n",
+ " Value | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " Session id | \n",
+ " 5272 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " Original data shape | \n",
+ " (43349, 66) | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " Transformed data shape | \n",
+ " (43349, 65) | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " Ignore features | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " Numeric features | \n",
+ " 65 | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " Preprocess | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " Imputation type | \n",
+ " simple | \n",
+ "
\n",
+ " \n",
+ " 7 | \n",
+ " Numeric imputation | \n",
+ " mean | \n",
+ "
\n",
+ " \n",
+ " 8 | \n",
+ " Categorical imputation | \n",
+ " mode | \n",
+ "
\n",
+ " \n",
+ " 9 | \n",
+ " CPU Jobs | \n",
+ " -1 | \n",
+ "
\n",
+ " \n",
+ " 10 | \n",
+ " Use GPU | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 11 | \n",
+ " Log Experiment | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 12 | \n",
+ " Experiment Name | \n",
+ " anomaly-default-name | \n",
+ "
\n",
+ " \n",
+ " 13 | \n",
+ " USI | \n",
+ " ca74 | \n",
+ "
\n",
+ " \n",
+ "
\n"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Initialise the PyCaret anomaly-detection experiment; the raw-text 'tweet' column is excluded from the features.\n",
+ "# session_id pins PyCaret's random seed (5272 is the id the recorded run drew) so a Restart & Run All is reproducible.\n",
+ "# NOTE(review): 'labels' is still passed in as a numeric feature — confirm that is intended.\n",
+ "ano1 = setup(embeding_df, ignore_features=['tweet'], session_id=5272)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Name | \n",
+ " Reference | \n",
+ "
\n",
+ " \n",
+ " ID | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " abod | \n",
+ " Angle-base Outlier Detection | \n",
+ " pyod.models.abod.ABOD | \n",
+ "
\n",
+ " \n",
+ " cluster | \n",
+ " Clustering-Based Local Outlier | \n",
+ " pycaret.internal.patches.pyod.CBLOFForceToDouble | \n",
+ "
\n",
+ " \n",
+ " cof | \n",
+ " Connectivity-Based Local Outlier | \n",
+ " pyod.models.cof.COF | \n",
+ "
\n",
+ " \n",
+ " iforest | \n",
+ " Isolation Forest | \n",
+ " pyod.models.iforest.IForest | \n",
+ "
\n",
+ " \n",
+ " histogram | \n",
+ " Histogram-based Outlier Detection | \n",
+ " pyod.models.hbos.HBOS | \n",
+ "
\n",
+ " \n",
+ " knn | \n",
+ " K-Nearest Neighbors Detector | \n",
+ " pyod.models.knn.KNN | \n",
+ "
\n",
+ " \n",
+ " lof | \n",
+ " Local Outlier Factor | \n",
+ " pyod.models.lof.LOF | \n",
+ "
\n",
+ " \n",
+ " svm | \n",
+ " One-class SVM detector | \n",
+ " pyod.models.ocsvm.OCSVM | \n",
+ "
\n",
+ " \n",
+ " pca | \n",
+ " Principal Component Analysis | \n",
+ " pyod.models.pca.PCA | \n",
+ "
\n",
+ " \n",
+ " mcd | \n",
+ " Minimum Covariance Determinant | \n",
+ " pyod.models.mcd.MCD | \n",
+ "
\n",
+ " \n",
+ " sod | \n",
+ " Subspace Outlier Detection | \n",
+ " pyod.models.sod.SOD | \n",
+ "
\n",
+ " \n",
+ " sos | \n",
+ " Stochastic Outlier Selection | \n",
+ " pyod.models.sos.SOS | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Name \\\n",
+ "ID \n",
+ "abod Angle-base Outlier Detection \n",
+ "cluster Clustering-Based Local Outlier \n",
+ "cof Connectivity-Based Local Outlier \n",
+ "iforest Isolation Forest \n",
+ "histogram Histogram-based Outlier Detection \n",
+ "knn K-Nearest Neighbors Detector \n",
+ "lof Local Outlier Factor \n",
+ "svm One-class SVM detector \n",
+ "pca Principal Component Analysis \n",
+ "mcd Minimum Covariance Determinant \n",
+ "sod Subspace Outlier Detection \n",
+ "sos Stochastic Outlier Selection \n",
+ "\n",
+ " Reference \n",
+ "ID \n",
+ "abod pyod.models.abod.ABOD \n",
+ "cluster pycaret.internal.patches.pyod.CBLOFForceToDouble \n",
+ "cof pyod.models.cof.COF \n",
+ "iforest pyod.models.iforest.IForest \n",
+ "histogram pyod.models.hbos.HBOS \n",
+ "knn pyod.models.knn.KNN \n",
+ "lof pyod.models.lof.LOF \n",
+ "svm pyod.models.ocsvm.OCSVM \n",
+ "pca pyod.models.pca.PCA \n",
+ "mcd pyod.models.mcd.MCD \n",
+ "sod pyod.models.sod.SOD \n",
+ "sos pyod.models.sos.SOS "
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "models()"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Isolation Forest (iforest)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# iforest = create_model('iforest')\n",
+ "# iforest_anomalies = assign_model(iforest)\n",
+ "# # iso_df=embeding_df.drop(['tweet'], axis=1)\n",
+ "# iforest_pred = predict_model(iforest, data=iso_df)\n",
+ "# iforest_pred"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# iforest_pred['Anomaly'].value_counts()"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# K-Nearest Neighbors detector (knn)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 4 | \n",
+ " 5 | \n",
+ " 6 | \n",
+ " 7 | \n",
+ " 8 | \n",
+ " 9 | \n",
+ " ... | \n",
+ " 57 | \n",
+ " 58 | \n",
+ " 59 | \n",
+ " 60 | \n",
+ " 61 | \n",
+ " 62 | \n",
+ " 63 | \n",
+ " labels | \n",
+ " Anomaly | \n",
+ " Anomaly_Score | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 101.0 | \n",
+ " 10110.0 | \n",
+ " 175.0 | \n",
+ " 78653.0 | \n",
+ " 189.0 | \n",
+ " 25285.0 | \n",
+ " 15976.0 | \n",
+ " 40840.0 | \n",
+ " 276.0 | \n",
+ " 31623.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0 | \n",
+ " 70971.171936 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 101.0 | \n",
+ " 11589.0 | \n",
+ " 10706.0 | \n",
+ " 10713.0 | \n",
+ " 10794.0 | \n",
+ " 94698.0 | \n",
+ " 30668.0 | \n",
+ " 24883.0 | \n",
+ " 117.0 | \n",
+ " 23763.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0 | \n",
+ " 77147.550363 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 101.0 | \n",
+ " 148.0 | \n",
+ " 30471.0 | \n",
+ " 10774.0 | \n",
+ " 13785.0 | \n",
+ " 13779.0 | \n",
+ " 33642.0 | \n",
+ " 14399.0 | \n",
+ " 48271.0 | \n",
+ " 76686.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0 | \n",
+ " 118676.465801 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 101.0 | \n",
+ " 19319.0 | \n",
+ " 16724.0 | \n",
+ " 10118.0 | \n",
+ " 10107.0 | \n",
+ " 78323.0 | \n",
+ " 12407.0 | \n",
+ " 38959.0 | \n",
+ " 22934.0 | \n",
+ " 10147.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0 | \n",
+ " 94310.765409 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 101.0 | \n",
+ " 30932.0 | \n",
+ " 58706.0 | \n",
+ " 58054.0 | \n",
+ " 44907.0 | \n",
+ " 10224.0 | \n",
+ " 106583.0 | \n",
+ " 10288.0 | \n",
+ " 12524.0 | \n",
+ " 13878.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0 | \n",
+ " 63569.489655 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 43344 | \n",
+ " 101.0 | \n",
+ " 20065.0 | \n",
+ " 10161.0 | \n",
+ " 115.0 | \n",
+ " 115.0 | \n",
+ " 103784.0 | \n",
+ " 10774.0 | \n",
+ " 21388.0 | \n",
+ " 10245.0 | \n",
+ " 92067.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0 | \n",
+ " 183310.474995 | \n",
+ "
\n",
+ " \n",
+ " 43345 | \n",
+ " 101.0 | \n",
+ " 139.0 | \n",
+ " 80839.0 | \n",
+ " 24109.0 | \n",
+ " 13406.0 | \n",
+ " 18985.0 | \n",
+ " 16285.0 | \n",
+ " 10163.0 | \n",
+ " 11062.0 | \n",
+ " 276.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0 | \n",
+ " 140717.435036 | \n",
+ "
\n",
+ " \n",
+ " 43346 | \n",
+ " 101.0 | \n",
+ " 105549.0 | \n",
+ " 102635.0 | \n",
+ " 10140.0 | \n",
+ " 26943.0 | \n",
+ " 11499.0 | \n",
+ " 110516.0 | \n",
+ " 21899.0 | \n",
+ " 11861.0 | \n",
+ " 10561.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0 | \n",
+ " 98954.428628 | \n",
+ "
\n",
+ " \n",
+ " 43347 | \n",
+ " 101.0 | \n",
+ " 81424.0 | \n",
+ " 26398.0 | \n",
+ " 92017.0 | \n",
+ " 109620.0 | \n",
+ " 10941.0 | \n",
+ " 76010.0 | \n",
+ " 10115.0 | \n",
+ " 19830.0 | \n",
+ " 26083.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0 | \n",
+ " 65424.117159 | \n",
+ "
\n",
+ " \n",
+ " 43348 | \n",
+ " 101.0 | \n",
+ " 39774.0 | \n",
+ " 11127.0 | \n",
+ " 45989.0 | \n",
+ " 24596.0 | \n",
+ " 11933.0 | \n",
+ " 170.0 | \n",
+ " 17145.0 | \n",
+ " 10710.0 | \n",
+ " 39125.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0 | \n",
+ " 182332.274049 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
43349 rows × 67 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " 0 1 2 3 4 5 6 \\\n",
+ "0 101.0 10110.0 175.0 78653.0 189.0 25285.0 15976.0 \n",
+ "1 101.0 11589.0 10706.0 10713.0 10794.0 94698.0 30668.0 \n",
+ "2 101.0 148.0 30471.0 10774.0 13785.0 13779.0 33642.0 \n",
+ "3 101.0 19319.0 16724.0 10118.0 10107.0 78323.0 12407.0 \n",
+ "4 101.0 30932.0 58706.0 58054.0 44907.0 10224.0 106583.0 \n",
+ "... ... ... ... ... ... ... ... \n",
+ "43344 101.0 20065.0 10161.0 115.0 115.0 103784.0 10774.0 \n",
+ "43345 101.0 139.0 80839.0 24109.0 13406.0 18985.0 16285.0 \n",
+ "43346 101.0 105549.0 102635.0 10140.0 26943.0 11499.0 110516.0 \n",
+ "43347 101.0 81424.0 26398.0 92017.0 109620.0 10941.0 76010.0 \n",
+ "43348 101.0 39774.0 11127.0 45989.0 24596.0 11933.0 170.0 \n",
+ "\n",
+ " 7 8 9 ... 57 58 59 60 61 62 63 \\\n",
+ "0 40840.0 276.0 31623.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
+ "1 24883.0 117.0 23763.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
+ "2 14399.0 48271.0 76686.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
+ "3 38959.0 22934.0 10147.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
+ "4 10288.0 12524.0 13878.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
+ "... ... ... ... ... ... ... ... ... ... ... ... \n",
+ "43344 21388.0 10245.0 92067.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
+ "43345 10163.0 11062.0 276.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
+ "43346 21899.0 11861.0 10561.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
+ "43347 10115.0 19830.0 26083.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
+ "43348 17145.0 10710.0 39125.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
+ "\n",
+ " labels Anomaly Anomaly_Score \n",
+ "0 0.0 0 70971.171936 \n",
+ "1 0.0 0 77147.550363 \n",
+ "2 0.0 0 118676.465801 \n",
+ "3 0.0 0 94310.765409 \n",
+ "4 0.0 0 63569.489655 \n",
+ "... ... ... ... \n",
+ "43344 1.0 0 183310.474995 \n",
+ "43345 1.0 0 140717.435036 \n",
+ "43346 1.0 0 98954.428628 \n",
+ "43347 1.0 0 65424.117159 \n",
+ "43348 1.0 0 182332.274049 \n",
+ "\n",
+ "[43349 rows x 67 columns]"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Fit the K-Nearest Neighbors detector on the data registered by setup().\n",
+ "knn = create_model('knn')\n",
+ "# Attach the fitted detector's results to the setup data (see PyCaret's assign_model docs).\n",
+ "knn_anomalies = assign_model(knn)\n",
+ "\n",
+ "# Score the frame through predict_model as well; 'tweet' is removed first.\n",
+ "# The result carries the extra 'Anomaly' and 'Anomaly_Score' columns.\n",
+ "knn_df = embeding_df.drop(columns=['tweet'])\n",
+ "knn_pred = predict_model(knn, data=knn_df)\n",
+ "knn_pred"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0 41376\n",
+ "1 1973\n",
+ "Name: Anomaly, dtype: int64"
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "knn_pred['Anomaly'].value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 4 | \n",
+ " 5 | \n",
+ " 6 | \n",
+ " 7 | \n",
+ " 8 | \n",
+ " 9 | \n",
+ " ... | \n",
+ " 57 | \n",
+ " 58 | \n",
+ " 59 | \n",
+ " 60 | \n",
+ " 61 | \n",
+ " 62 | \n",
+ " 63 | \n",
+ " labels | \n",
+ " Anomaly | \n",
+ " Anomaly_Score | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 101.0 | \n",
+ " 10110.0 | \n",
+ " 175.0 | \n",
+ " 78653.0 | \n",
+ " 189.0 | \n",
+ " 25285.0 | \n",
+ " 15976.0 | \n",
+ " 40840.0 | \n",
+ " 276.0 | \n",
+ " 31623.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0 | \n",
+ " 70971.171936 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 101.0 | \n",
+ " 11589.0 | \n",
+ " 10706.0 | \n",
+ " 10713.0 | \n",
+ " 10794.0 | \n",
+ " 94698.0 | \n",
+ " 30668.0 | \n",
+ " 24883.0 | \n",
+ " 117.0 | \n",
+ " 23763.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0 | \n",
+ " 77147.550363 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 101.0 | \n",
+ " 148.0 | \n",
+ " 30471.0 | \n",
+ " 10774.0 | \n",
+ " 13785.0 | \n",
+ " 13779.0 | \n",
+ " 33642.0 | \n",
+ " 14399.0 | \n",
+ " 48271.0 | \n",
+ " 76686.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0 | \n",
+ " 118676.465801 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 101.0 | \n",
+ " 19319.0 | \n",
+ " 16724.0 | \n",
+ " 10118.0 | \n",
+ " 10107.0 | \n",
+ " 78323.0 | \n",
+ " 12407.0 | \n",
+ " 38959.0 | \n",
+ " 22934.0 | \n",
+ " 10147.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0 | \n",
+ " 94310.765409 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 101.0 | \n",
+ " 30932.0 | \n",
+ " 58706.0 | \n",
+ " 58054.0 | \n",
+ " 44907.0 | \n",
+ " 10224.0 | \n",
+ " 106583.0 | \n",
+ " 10288.0 | \n",
+ " 12524.0 | \n",
+ " 13878.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0 | \n",
+ " 63569.489655 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 43344 | \n",
+ " 101.0 | \n",
+ " 20065.0 | \n",
+ " 10161.0 | \n",
+ " 115.0 | \n",
+ " 115.0 | \n",
+ " 103784.0 | \n",
+ " 10774.0 | \n",
+ " 21388.0 | \n",
+ " 10245.0 | \n",
+ " 92067.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0 | \n",
+ " 183310.474995 | \n",
+ "
\n",
+ " \n",
+ " 43345 | \n",
+ " 101.0 | \n",
+ " 139.0 | \n",
+ " 80839.0 | \n",
+ " 24109.0 | \n",
+ " 13406.0 | \n",
+ " 18985.0 | \n",
+ " 16285.0 | \n",
+ " 10163.0 | \n",
+ " 11062.0 | \n",
+ " 276.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0 | \n",
+ " 140717.435036 | \n",
+ "
\n",
+ " \n",
+ " 43346 | \n",
+ " 101.0 | \n",
+ " 105549.0 | \n",
+ " 102635.0 | \n",
+ " 10140.0 | \n",
+ " 26943.0 | \n",
+ " 11499.0 | \n",
+ " 110516.0 | \n",
+ " 21899.0 | \n",
+ " 11861.0 | \n",
+ " 10561.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0 | \n",
+ " 98954.428628 | \n",
+ "
\n",
+ " \n",
+ " 43347 | \n",
+ " 101.0 | \n",
+ " 81424.0 | \n",
+ " 26398.0 | \n",
+ " 92017.0 | \n",
+ " 109620.0 | \n",
+ " 10941.0 | \n",
+ " 76010.0 | \n",
+ " 10115.0 | \n",
+ " 19830.0 | \n",
+ " 26083.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0 | \n",
+ " 65424.117159 | \n",
+ "
\n",
+ " \n",
+ " 43348 | \n",
+ " 101.0 | \n",
+ " 39774.0 | \n",
+ " 11127.0 | \n",
+ " 45989.0 | \n",
+ " 24596.0 | \n",
+ " 11933.0 | \n",
+ " 170.0 | \n",
+ " 17145.0 | \n",
+ " 10710.0 | \n",
+ " 39125.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0 | \n",
+ " 182332.274049 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
43349 rows × 67 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " 0 1 2 3 4 5 6 \\\n",
+ "0 101.0 10110.0 175.0 78653.0 189.0 25285.0 15976.0 \n",
+ "1 101.0 11589.0 10706.0 10713.0 10794.0 94698.0 30668.0 \n",
+ "2 101.0 148.0 30471.0 10774.0 13785.0 13779.0 33642.0 \n",
+ "3 101.0 19319.0 16724.0 10118.0 10107.0 78323.0 12407.0 \n",
+ "4 101.0 30932.0 58706.0 58054.0 44907.0 10224.0 106583.0 \n",
+ "... ... ... ... ... ... ... ... \n",
+ "43344 101.0 20065.0 10161.0 115.0 115.0 103784.0 10774.0 \n",
+ "43345 101.0 139.0 80839.0 24109.0 13406.0 18985.0 16285.0 \n",
+ "43346 101.0 105549.0 102635.0 10140.0 26943.0 11499.0 110516.0 \n",
+ "43347 101.0 81424.0 26398.0 92017.0 109620.0 10941.0 76010.0 \n",
+ "43348 101.0 39774.0 11127.0 45989.0 24596.0 11933.0 170.0 \n",
+ "\n",
+ " 7 8 9 ... 57 58 59 60 61 62 63 \\\n",
+ "0 40840.0 276.0 31623.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
+ "1 24883.0 117.0 23763.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
+ "2 14399.0 48271.0 76686.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
+ "3 38959.0 22934.0 10147.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
+ "4 10288.0 12524.0 13878.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
+ "... ... ... ... ... ... ... ... ... ... ... ... \n",
+ "43344 21388.0 10245.0 92067.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
+ "43345 10163.0 11062.0 276.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
+ "43346 21899.0 11861.0 10561.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
+ "43347 10115.0 19830.0 26083.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
+ "43348 17145.0 10710.0 39125.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
+ "\n",
+ " labels Anomaly Anomaly_Score \n",
+ "0 0.0 0 70971.171936 \n",
+ "1 0.0 0 77147.550363 \n",
+ "2 0.0 0 118676.465801 \n",
+ "3 0.0 0 94310.765409 \n",
+ "4 0.0 0 63569.489655 \n",
+ "... ... ... ... \n",
+ "43344 1.0 0 183310.474995 \n",
+ "43345 1.0 0 140717.435036 \n",
+ "43346 1.0 0 98954.428628 \n",
+ "43347 1.0 0 65424.117159 \n",
+ "43348 1.0 0 182332.274049 \n",
+ "\n",
+ "[43349 rows x 67 columns]"
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "knn_pred"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0 41376\n",
+ "1 1973\n",
+ "Name: Anomaly, dtype: int64"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "knn_pred['Anomaly'].value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 4 | \n",
+ " 5 | \n",
+ " 6 | \n",
+ " 7 | \n",
+ " 8 | \n",
+ " 9 | \n",
+ " ... | \n",
+ " 56 | \n",
+ " 57 | \n",
+ " 58 | \n",
+ " 59 | \n",
+ " 60 | \n",
+ " 61 | \n",
+ " 62 | \n",
+ " 63 | \n",
+ " labels | \n",
+ " tweet | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 101 | \n",
+ " 10110 | \n",
+ " 175 | \n",
+ " 78653 | \n",
+ " 189 | \n",
+ " 25285 | \n",
+ " 15976 | \n",
+ " 40840 | \n",
+ " 276 | \n",
+ " 31623 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " en güzel uyuyan insan ödülü jeon jungkook'a g... | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 101 | \n",
+ " 11589 | \n",
+ " 10706 | \n",
+ " 10713 | \n",
+ " 10794 | \n",
+ " 94698 | \n",
+ " 30668 | \n",
+ " 24883 | \n",
+ " 117 | \n",
+ " 23763 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " Mekanı cennet olsun, saygılar sayın avukatımı... | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 101 | \n",
+ " 148 | \n",
+ " 30471 | \n",
+ " 10774 | \n",
+ " 13785 | \n",
+ " 13779 | \n",
+ " 33642 | \n",
+ " 14399 | \n",
+ " 48271 | \n",
+ " 76686 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " Kızlar aranızda kas yığını beylere düşenler ol... | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 101 | \n",
+ " 19319 | \n",
+ " 16724 | \n",
+ " 10118 | \n",
+ " 10107 | \n",
+ " 78323 | \n",
+ " 12407 | \n",
+ " 38959 | \n",
+ " 22934 | \n",
+ " 10147 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " Biraz ders çalışayım. Tembellik ve uyku düşman... | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 101 | \n",
+ " 30932 | \n",
+ " 58706 | \n",
+ " 58054 | \n",
+ " 44907 | \n",
+ " 10224 | \n",
+ " 106583 | \n",
+ " 10288 | \n",
+ " 12524 | \n",
+ " 13878 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " Trezeguet yerine El Sharawy daha iyi olmaz mı | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 43344 | \n",
+ " 101 | \n",
+ " 20065 | \n",
+ " 10161 | \n",
+ " 115 | \n",
+ " 115 | \n",
+ " 103784 | \n",
+ " 10774 | \n",
+ " 21388 | \n",
+ " 10245 | \n",
+ " 92067 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " Hil**adamlar kesinlikle kelimeleri anlamıyorla... | \n",
+ "
\n",
+ " \n",
+ " 43345 | \n",
+ " 101 | \n",
+ " 139 | \n",
+ " 80839 | \n",
+ " 24109 | \n",
+ " 13406 | \n",
+ " 18985 | \n",
+ " 16285 | \n",
+ " 10163 | \n",
+ " 11062 | \n",
+ " 276 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " Böyle piçlerin çok erken ölmemelerini ve çok f... | \n",
+ "
\n",
+ " \n",
+ " 43346 | \n",
+ " 101 | \n",
+ " 105549 | \n",
+ " 102635 | \n",
+ " 10140 | \n",
+ " 26943 | \n",
+ " 11499 | \n",
+ " 110516 | \n",
+ " 21899 | \n",
+ " 11861 | \n",
+ " 10561 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " Turgay denilen bu holigonda bir sorun yok, gur... | \n",
+ "
\n",
+ " \n",
+ " 43347 | \n",
+ " 101 | \n",
+ " 81424 | \n",
+ " 26398 | \n",
+ " 92017 | \n",
+ " 109620 | \n",
+ " 10941 | \n",
+ " 76010 | \n",
+ " 10115 | \n",
+ " 19830 | \n",
+ " 26083 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " Umarım ülkenin düşük zekadan kurtulması ilgile... | \n",
+ "
\n",
+ " \n",
+ " 43348 | \n",
+ " 101 | \n",
+ " 39774 | \n",
+ " 11127 | \n",
+ " 45989 | \n",
+ " 24596 | \n",
+ " 11933 | \n",
+ " 170 | \n",
+ " 17145 | \n",
+ " 10710 | \n",
+ " 39125 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " CHP sandıkları bırakmaz, üzerine oturur, bir c... | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
43349 rows × 66 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " 0 1 2 3 4 5 6 7 8 \\\n",
+ "0 101 10110 175 78653 189 25285 15976 40840 276 \n",
+ "1 101 11589 10706 10713 10794 94698 30668 24883 117 \n",
+ "2 101 148 30471 10774 13785 13779 33642 14399 48271 \n",
+ "3 101 19319 16724 10118 10107 78323 12407 38959 22934 \n",
+ "4 101 30932 58706 58054 44907 10224 106583 10288 12524 \n",
+ "... ... ... ... ... ... ... ... ... ... \n",
+ "43344 101 20065 10161 115 115 103784 10774 21388 10245 \n",
+ "43345 101 139 80839 24109 13406 18985 16285 10163 11062 \n",
+ "43346 101 105549 102635 10140 26943 11499 110516 21899 11861 \n",
+ "43347 101 81424 26398 92017 109620 10941 76010 10115 19830 \n",
+ "43348 101 39774 11127 45989 24596 11933 170 17145 10710 \n",
+ "\n",
+ " 9 ... 56 57 58 59 60 61 62 63 labels \\\n",
+ "0 31623 ... 0 0 0 0 0 0 0 0 0 \n",
+ "1 23763 ... 0 0 0 0 0 0 0 0 0 \n",
+ "2 76686 ... 0 0 0 0 0 0 0 0 0 \n",
+ "3 10147 ... 0 0 0 0 0 0 0 0 0 \n",
+ "4 13878 ... 0 0 0 0 0 0 0 0 0 \n",
+ "... ... ... .. .. .. .. .. .. .. .. ... \n",
+ "43344 92067 ... 0 0 0 0 0 0 0 0 1 \n",
+ "43345 276 ... 0 0 0 0 0 0 0 0 1 \n",
+ "43346 10561 ... 0 0 0 0 0 0 0 0 1 \n",
+ "43347 26083 ... 0 0 0 0 0 0 0 0 1 \n",
+ "43348 39125 ... 0 0 0 0 0 0 0 0 1 \n",
+ "\n",
+ " tweet \n",
+ "0 en güzel uyuyan insan ödülü jeon jungkook'a g... \n",
+ "1 Mekanı cennet olsun, saygılar sayın avukatımı... \n",
+ "2 Kızlar aranızda kas yığını beylere düşenler ol... \n",
+ "3 Biraz ders çalışayım. Tembellik ve uyku düşman... \n",
+ "4 Trezeguet yerine El Sharawy daha iyi olmaz mı \n",
+ "... ... \n",
+ "43344 Hil**adamlar kesinlikle kelimeleri anlamıyorla... \n",
+ "43345 Böyle piçlerin çok erken ölmemelerini ve çok f... \n",
+ "43346 Turgay denilen bu holigonda bir sorun yok, gur... \n",
+ "43347 Umarım ülkenin düşük zekadan kurtulması ilgile... \n",
+ "43348 CHP sandıkları bırakmaz, üzerine oturur, bir c... \n",
+ "\n",
+ "[43349 rows x 66 columns]"
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "embeding_df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Remove the rows the KNN detector flagged as anomalies (Anomaly == 1).\n",
+ "# The frame is rebound instead of using inplace=True, and errors='ignore' is passed,\n",
+ "# so this cell can be re-run: the inplace version raised KeyError on a second run\n",
+ "# because the flagged labels had already been removed.\n",
+ "# embeding_df intentionally stays the filtered frame; the following cells read it.\n",
+ "knn_outlier_idx = knn_pred.loc[knn_pred['Anomaly'] == 1].index\n",
+ "embeding_df = embeding_df.drop(index=knn_outlier_idx, errors='ignore')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 4 | \n",
+ " 5 | \n",
+ " 6 | \n",
+ " 7 | \n",
+ " 8 | \n",
+ " 9 | \n",
+ " ... | \n",
+ " 56 | \n",
+ " 57 | \n",
+ " 58 | \n",
+ " 59 | \n",
+ " 60 | \n",
+ " 61 | \n",
+ " 62 | \n",
+ " 63 | \n",
+ " labels | \n",
+ " tweet | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 101 | \n",
+ " 10110 | \n",
+ " 175 | \n",
+ " 78653 | \n",
+ " 189 | \n",
+ " 25285 | \n",
+ " 15976 | \n",
+ " 40840 | \n",
+ " 276 | \n",
+ " 31623 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " en güzel uyuyan insan ödülü jeon jungkook'a g... | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 101 | \n",
+ " 11589 | \n",
+ " 10706 | \n",
+ " 10713 | \n",
+ " 10794 | \n",
+ " 94698 | \n",
+ " 30668 | \n",
+ " 24883 | \n",
+ " 117 | \n",
+ " 23763 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " Mekanı cennet olsun, saygılar sayın avukatımı... | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 101 | \n",
+ " 148 | \n",
+ " 30471 | \n",
+ " 10774 | \n",
+ " 13785 | \n",
+ " 13779 | \n",
+ " 33642 | \n",
+ " 14399 | \n",
+ " 48271 | \n",
+ " 76686 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " Kızlar aranızda kas yığını beylere düşenler ol... | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 101 | \n",
+ " 19319 | \n",
+ " 16724 | \n",
+ " 10118 | \n",
+ " 10107 | \n",
+ " 78323 | \n",
+ " 12407 | \n",
+ " 38959 | \n",
+ " 22934 | \n",
+ " 10147 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " Biraz ders çalışayım. Tembellik ve uyku düşman... | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 101 | \n",
+ " 30932 | \n",
+ " 58706 | \n",
+ " 58054 | \n",
+ " 44907 | \n",
+ " 10224 | \n",
+ " 106583 | \n",
+ " 10288 | \n",
+ " 12524 | \n",
+ " 13878 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " Trezeguet yerine El Sharawy daha iyi olmaz mı | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 43344 | \n",
+ " 101 | \n",
+ " 20065 | \n",
+ " 10161 | \n",
+ " 115 | \n",
+ " 115 | \n",
+ " 103784 | \n",
+ " 10774 | \n",
+ " 21388 | \n",
+ " 10245 | \n",
+ " 92067 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " Hil**adamlar kesinlikle kelimeleri anlamıyorla... | \n",
+ "
\n",
+ " \n",
+ " 43345 | \n",
+ " 101 | \n",
+ " 139 | \n",
+ " 80839 | \n",
+ " 24109 | \n",
+ " 13406 | \n",
+ " 18985 | \n",
+ " 16285 | \n",
+ " 10163 | \n",
+ " 11062 | \n",
+ " 276 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " Böyle piçlerin çok erken ölmemelerini ve çok f... | \n",
+ "
\n",
+ " \n",
+ " 43346 | \n",
+ " 101 | \n",
+ " 105549 | \n",
+ " 102635 | \n",
+ " 10140 | \n",
+ " 26943 | \n",
+ " 11499 | \n",
+ " 110516 | \n",
+ " 21899 | \n",
+ " 11861 | \n",
+ " 10561 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " Turgay denilen bu holigonda bir sorun yok, gur... | \n",
+ "
\n",
+ " \n",
+ " 43347 | \n",
+ " 101 | \n",
+ " 81424 | \n",
+ " 26398 | \n",
+ " 92017 | \n",
+ " 109620 | \n",
+ " 10941 | \n",
+ " 76010 | \n",
+ " 10115 | \n",
+ " 19830 | \n",
+ " 26083 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " Umarım ülkenin düşük zekadan kurtulması ilgile... | \n",
+ "
\n",
+ " \n",
+ " 43348 | \n",
+ " 101 | \n",
+ " 39774 | \n",
+ " 11127 | \n",
+ " 45989 | \n",
+ " 24596 | \n",
+ " 11933 | \n",
+ " 170 | \n",
+ " 17145 | \n",
+ " 10710 | \n",
+ " 39125 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " CHP sandıkları bırakmaz, üzerine oturur, bir c... | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
41376 rows × 66 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " 0 1 2 3 4 5 6 7 8 \\\n",
+ "0 101 10110 175 78653 189 25285 15976 40840 276 \n",
+ "1 101 11589 10706 10713 10794 94698 30668 24883 117 \n",
+ "2 101 148 30471 10774 13785 13779 33642 14399 48271 \n",
+ "3 101 19319 16724 10118 10107 78323 12407 38959 22934 \n",
+ "4 101 30932 58706 58054 44907 10224 106583 10288 12524 \n",
+ "... ... ... ... ... ... ... ... ... ... \n",
+ "43344 101 20065 10161 115 115 103784 10774 21388 10245 \n",
+ "43345 101 139 80839 24109 13406 18985 16285 10163 11062 \n",
+ "43346 101 105549 102635 10140 26943 11499 110516 21899 11861 \n",
+ "43347 101 81424 26398 92017 109620 10941 76010 10115 19830 \n",
+ "43348 101 39774 11127 45989 24596 11933 170 17145 10710 \n",
+ "\n",
+ " 9 ... 56 57 58 59 60 61 62 63 labels \\\n",
+ "0 31623 ... 0 0 0 0 0 0 0 0 0 \n",
+ "1 23763 ... 0 0 0 0 0 0 0 0 0 \n",
+ "2 76686 ... 0 0 0 0 0 0 0 0 0 \n",
+ "3 10147 ... 0 0 0 0 0 0 0 0 0 \n",
+ "4 13878 ... 0 0 0 0 0 0 0 0 0 \n",
+ "... ... ... .. .. .. .. .. .. .. .. ... \n",
+ "43344 92067 ... 0 0 0 0 0 0 0 0 1 \n",
+ "43345 276 ... 0 0 0 0 0 0 0 0 1 \n",
+ "43346 10561 ... 0 0 0 0 0 0 0 0 1 \n",
+ "43347 26083 ... 0 0 0 0 0 0 0 0 1 \n",
+ "43348 39125 ... 0 0 0 0 0 0 0 0 1 \n",
+ "\n",
+ " tweet \n",
+ "0 en güzel uyuyan insan ödülü jeon jungkook'a g... \n",
+ "1 Mekanı cennet olsun, saygılar sayın avukatımı... \n",
+ "2 Kızlar aranızda kas yığını beylere düşenler ol... \n",
+ "3 Biraz ders çalışayım. Tembellik ve uyku düşman... \n",
+ "4 Trezeguet yerine El Sharawy daha iyi olmaz mı \n",
+ "... ... \n",
+ "43344 Hil**adamlar kesinlikle kelimeleri anlamıyorla... \n",
+ "43345 Böyle piçlerin çok erken ölmemelerini ve çok f... \n",
+ "43346 Turgay denilen bu holigonda bir sorun yok, gur... \n",
+ "43347 Umarım ülkenin düşük zekadan kurtulması ilgile... \n",
+ "43348 CHP sandıkları bırakmaz, üzerine oturur, bir c... \n",
+ "\n",
+ "[41376 rows x 66 columns]"
+ ]
+ },
+ "execution_count": 17,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "embeding_df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Keep only the raw text and its label for export; 'labels' is written out as 'subtas_a'.\n",
+ "# NOTE(review): 'subtas_a' looks like a typo for 'subtask_a' - confirm what downstream readers expect before renaming.\n",
+ "df = embeding_df[['tweet', 'labels']].rename(columns={'labels': 'subtas_a'})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df.to_csv('knn_outliers.csv') "
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# PCA (Principal Component Analysis) outlier detection"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 4 | \n",
+ " 5 | \n",
+ " 6 | \n",
+ " 7 | \n",
+ " 8 | \n",
+ " 9 | \n",
+ " ... | \n",
+ " 57 | \n",
+ " 58 | \n",
+ " 59 | \n",
+ " 60 | \n",
+ " 61 | \n",
+ " 62 | \n",
+ " 63 | \n",
+ " labels | \n",
+ " Anomaly | \n",
+ " Anomaly_Score | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 101.0 | \n",
+ " 10110.0 | \n",
+ " 175.0 | \n",
+ " 78653.0 | \n",
+ " 189.0 | \n",
+ " 25285.0 | \n",
+ " 15976.0 | \n",
+ " 40840.0 | \n",
+ " 276.0 | \n",
+ " 31623.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0 | \n",
+ " 1.354399e+32 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 101.0 | \n",
+ " 11589.0 | \n",
+ " 10706.0 | \n",
+ " 10713.0 | \n",
+ " 10794.0 | \n",
+ " 94698.0 | \n",
+ " 30668.0 | \n",
+ " 24883.0 | \n",
+ " 117.0 | \n",
+ " 23763.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0 | \n",
+ " 1.311723e+32 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 101.0 | \n",
+ " 148.0 | \n",
+ " 30471.0 | \n",
+ " 10774.0 | \n",
+ " 13785.0 | \n",
+ " 13779.0 | \n",
+ " 33642.0 | \n",
+ " 14399.0 | \n",
+ " 48271.0 | \n",
+ " 76686.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0 | \n",
+ " 1.597792e+32 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 101.0 | \n",
+ " 19319.0 | \n",
+ " 16724.0 | \n",
+ " 10118.0 | \n",
+ " 10107.0 | \n",
+ " 78323.0 | \n",
+ " 12407.0 | \n",
+ " 38959.0 | \n",
+ " 22934.0 | \n",
+ " 10147.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0 | \n",
+ " 1.551488e+32 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 101.0 | \n",
+ " 30932.0 | \n",
+ " 58706.0 | \n",
+ " 58054.0 | \n",
+ " 44907.0 | \n",
+ " 10224.0 | \n",
+ " 106583.0 | \n",
+ " 10288.0 | \n",
+ " 12524.0 | \n",
+ " 13878.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0 | \n",
+ " 1.348867e+32 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 43344 | \n",
+ " 101.0 | \n",
+ " 20065.0 | \n",
+ " 10161.0 | \n",
+ " 115.0 | \n",
+ " 115.0 | \n",
+ " 103784.0 | \n",
+ " 10774.0 | \n",
+ " 21388.0 | \n",
+ " 10245.0 | \n",
+ " 92067.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0 | \n",
+ " 2.346619e+32 | \n",
+ "
\n",
+ " \n",
+ " 43345 | \n",
+ " 101.0 | \n",
+ " 139.0 | \n",
+ " 80839.0 | \n",
+ " 24109.0 | \n",
+ " 13406.0 | \n",
+ " 18985.0 | \n",
+ " 16285.0 | \n",
+ " 10163.0 | \n",
+ " 11062.0 | \n",
+ " 276.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0 | \n",
+ " 1.778253e+32 | \n",
+ "
\n",
+ " \n",
+ " 43346 | \n",
+ " 101.0 | \n",
+ " 105549.0 | \n",
+ " 102635.0 | \n",
+ " 10140.0 | \n",
+ " 26943.0 | \n",
+ " 11499.0 | \n",
+ " 110516.0 | \n",
+ " 21899.0 | \n",
+ " 11861.0 | \n",
+ " 10561.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0 | \n",
+ " 1.762300e+32 | \n",
+ "
\n",
+ " \n",
+ " 43347 | \n",
+ " 101.0 | \n",
+ " 81424.0 | \n",
+ " 26398.0 | \n",
+ " 92017.0 | \n",
+ " 109620.0 | \n",
+ " 10941.0 | \n",
+ " 76010.0 | \n",
+ " 10115.0 | \n",
+ " 19830.0 | \n",
+ " 26083.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0 | \n",
+ " 1.564075e+32 | \n",
+ "
\n",
+ " \n",
+ " 43348 | \n",
+ " 101.0 | \n",
+ " 39774.0 | \n",
+ " 11127.0 | \n",
+ " 45989.0 | \n",
+ " 24596.0 | \n",
+ " 11933.0 | \n",
+ " 170.0 | \n",
+ " 17145.0 | \n",
+ " 10710.0 | \n",
+ " 39125.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0 | \n",
+ " 2.685411e+32 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
43349 rows × 67 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " 0 1 2 3 4 5 6 \\\n",
+ "0 101.0 10110.0 175.0 78653.0 189.0 25285.0 15976.0 \n",
+ "1 101.0 11589.0 10706.0 10713.0 10794.0 94698.0 30668.0 \n",
+ "2 101.0 148.0 30471.0 10774.0 13785.0 13779.0 33642.0 \n",
+ "3 101.0 19319.0 16724.0 10118.0 10107.0 78323.0 12407.0 \n",
+ "4 101.0 30932.0 58706.0 58054.0 44907.0 10224.0 106583.0 \n",
+ "... ... ... ... ... ... ... ... \n",
+ "43344 101.0 20065.0 10161.0 115.0 115.0 103784.0 10774.0 \n",
+ "43345 101.0 139.0 80839.0 24109.0 13406.0 18985.0 16285.0 \n",
+ "43346 101.0 105549.0 102635.0 10140.0 26943.0 11499.0 110516.0 \n",
+ "43347 101.0 81424.0 26398.0 92017.0 109620.0 10941.0 76010.0 \n",
+ "43348 101.0 39774.0 11127.0 45989.0 24596.0 11933.0 170.0 \n",
+ "\n",
+ " 7 8 9 ... 57 58 59 60 61 62 63 \\\n",
+ "0 40840.0 276.0 31623.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
+ "1 24883.0 117.0 23763.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
+ "2 14399.0 48271.0 76686.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
+ "3 38959.0 22934.0 10147.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
+ "4 10288.0 12524.0 13878.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
+ "... ... ... ... ... ... ... ... ... ... ... ... \n",
+ "43344 21388.0 10245.0 92067.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
+ "43345 10163.0 11062.0 276.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
+ "43346 21899.0 11861.0 10561.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
+ "43347 10115.0 19830.0 26083.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
+ "43348 17145.0 10710.0 39125.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
+ "\n",
+ " labels Anomaly Anomaly_Score \n",
+ "0 0.0 0 1.354399e+32 \n",
+ "1 0.0 0 1.311723e+32 \n",
+ "2 0.0 0 1.597792e+32 \n",
+ "3 0.0 0 1.551488e+32 \n",
+ "4 0.0 0 1.348867e+32 \n",
+ "... ... ... ... \n",
+ "43344 1.0 0 2.346619e+32 \n",
+ "43345 1.0 0 1.778253e+32 \n",
+ "43346 1.0 0 1.762300e+32 \n",
+ "43347 1.0 0 1.564075e+32 \n",
+ "43348 1.0 0 2.685411e+32 \n",
+ "\n",
+ "[43349 rows x 67 columns]"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Fit PyCaret's 'pca' anomaly detector (presumably on the data registered by an earlier setup() call).\n",
+ "pca = create_model('pca')\n",
+ "# Result of assign_model for the fitted detector; not used again in this section.\n",
+ "pca_anomalies = assign_model(pca)\n",
+ "\n",
+ "# Score every column except the free-text 'tweet' column (token columns plus 'labels').\n",
+ "pca_df = embeding_df.drop(columns=['tweet'])\n",
+ "pca_pred = predict_model(pca, data=pca_df)\n",
+ "pca_pred"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0 41181\n",
+ "1 2168\n",
+ "Name: Anomaly, dtype: int64"
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "pca_pred['Anomaly'].value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Build the PCA-filtered export without mutating embeding_df. The inplace drop used before\n",
+ "# shrank the shared frame, so every detector run after this cell saw fewer rows than its\n",
+ "# recorded output, and re-running the cell raised KeyError because the labels were gone.\n",
+ "pca_outlier_idx = pca_pred.loc[pca_pred['Anomaly'] == 1].index\n",
+ "pca_inliers = embeding_df.drop(index=pca_outlier_idx, errors='ignore')\n",
+ "\n",
+ "# Export only the text and its label ('labels' is written out as 'subtas_a').\n",
+ "# NOTE(review): the file name says outliers, but it contains the rows kept after removing them.\n",
+ "df = pca_inliers[['tweet', 'labels']].rename(columns={'labels': 'subtas_a'})\n",
+ "df.to_csv('pca_outliers.csv')"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# ABOD (Angle-Based Outlier Detection)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 4 | \n",
+ " 5 | \n",
+ " 6 | \n",
+ " 7 | \n",
+ " 8 | \n",
+ " 9 | \n",
+ " ... | \n",
+ " 57 | \n",
+ " 58 | \n",
+ " 59 | \n",
+ " 60 | \n",
+ " 61 | \n",
+ " 62 | \n",
+ " 63 | \n",
+ " labels | \n",
+ " Anomaly | \n",
+ " Anomaly_Score | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 101.0 | \n",
+ " 10110.0 | \n",
+ " 175.0 | \n",
+ " 78653.0 | \n",
+ " 189.0 | \n",
+ " 25285.0 | \n",
+ " 15976.0 | \n",
+ " 40840.0 | \n",
+ " 276.0 | \n",
+ " 31623.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0 | \n",
+ " -7.719921e-22 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 101.0 | \n",
+ " 11589.0 | \n",
+ " 10706.0 | \n",
+ " 10713.0 | \n",
+ " 10794.0 | \n",
+ " 94698.0 | \n",
+ " 30668.0 | \n",
+ " 24883.0 | \n",
+ " 117.0 | \n",
+ " 23763.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0 | \n",
+ " -4.030618e-21 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 101.0 | \n",
+ " 148.0 | \n",
+ " 30471.0 | \n",
+ " 10774.0 | \n",
+ " 13785.0 | \n",
+ " 13779.0 | \n",
+ " 33642.0 | \n",
+ " 14399.0 | \n",
+ " 48271.0 | \n",
+ " 76686.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0 | \n",
+ " -3.558939e-22 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 101.0 | \n",
+ " 19319.0 | \n",
+ " 16724.0 | \n",
+ " 10118.0 | \n",
+ " 10107.0 | \n",
+ " 78323.0 | \n",
+ " 12407.0 | \n",
+ " 38959.0 | \n",
+ " 22934.0 | \n",
+ " 10147.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0 | \n",
+ " -2.895136e-22 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 101.0 | \n",
+ " 30932.0 | \n",
+ " 58706.0 | \n",
+ " 58054.0 | \n",
+ " 44907.0 | \n",
+ " 10224.0 | \n",
+ " 106583.0 | \n",
+ " 10288.0 | \n",
+ " 12524.0 | \n",
+ " 13878.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0 | \n",
+ " -4.832515e-21 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 43344 | \n",
+ " 101.0 | \n",
+ " 20065.0 | \n",
+ " 10161.0 | \n",
+ " 115.0 | \n",
+ " 115.0 | \n",
+ " 103784.0 | \n",
+ " 10774.0 | \n",
+ " 21388.0 | \n",
+ " 10245.0 | \n",
+ " 92067.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0 | \n",
+ " -7.984637e-25 | \n",
+ "
\n",
+ " \n",
+ " 43345 | \n",
+ " 101.0 | \n",
+ " 139.0 | \n",
+ " 80839.0 | \n",
+ " 24109.0 | \n",
+ " 13406.0 | \n",
+ " 18985.0 | \n",
+ " 16285.0 | \n",
+ " 10163.0 | \n",
+ " 11062.0 | \n",
+ " 276.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0 | \n",
+ " -1.059387e-22 | \n",
+ "
\n",
+ " \n",
+ " 43346 | \n",
+ " 101.0 | \n",
+ " 105549.0 | \n",
+ " 102635.0 | \n",
+ " 10140.0 | \n",
+ " 26943.0 | \n",
+ " 11499.0 | \n",
+ " 110516.0 | \n",
+ " 21899.0 | \n",
+ " 11861.0 | \n",
+ " 10561.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0 | \n",
+ " -3.592603e-22 | \n",
+ "
\n",
+ " \n",
+ " 43347 | \n",
+ " 101.0 | \n",
+ " 81424.0 | \n",
+ " 26398.0 | \n",
+ " 92017.0 | \n",
+ " 109620.0 | \n",
+ " 10941.0 | \n",
+ " 76010.0 | \n",
+ " 10115.0 | \n",
+ " 19830.0 | \n",
+ " 26083.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0 | \n",
+ " -2.226002e-21 | \n",
+ "
\n",
+ " \n",
+ " 43348 | \n",
+ " 101.0 | \n",
+ " 39774.0 | \n",
+ " 11127.0 | \n",
+ " 45989.0 | \n",
+ " 24596.0 | \n",
+ " 11933.0 | \n",
+ " 170.0 | \n",
+ " 17145.0 | \n",
+ " 10710.0 | \n",
+ " 39125.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0 | \n",
+ " -2.864757e-23 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
43349 rows × 67 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " 0 1 2 3 4 5 6 \\\n",
+ "0 101.0 10110.0 175.0 78653.0 189.0 25285.0 15976.0 \n",
+ "1 101.0 11589.0 10706.0 10713.0 10794.0 94698.0 30668.0 \n",
+ "2 101.0 148.0 30471.0 10774.0 13785.0 13779.0 33642.0 \n",
+ "3 101.0 19319.0 16724.0 10118.0 10107.0 78323.0 12407.0 \n",
+ "4 101.0 30932.0 58706.0 58054.0 44907.0 10224.0 106583.0 \n",
+ "... ... ... ... ... ... ... ... \n",
+ "43344 101.0 20065.0 10161.0 115.0 115.0 103784.0 10774.0 \n",
+ "43345 101.0 139.0 80839.0 24109.0 13406.0 18985.0 16285.0 \n",
+ "43346 101.0 105549.0 102635.0 10140.0 26943.0 11499.0 110516.0 \n",
+ "43347 101.0 81424.0 26398.0 92017.0 109620.0 10941.0 76010.0 \n",
+ "43348 101.0 39774.0 11127.0 45989.0 24596.0 11933.0 170.0 \n",
+ "\n",
+ " 7 8 9 ... 57 58 59 60 61 62 63 \\\n",
+ "0 40840.0 276.0 31623.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
+ "1 24883.0 117.0 23763.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
+ "2 14399.0 48271.0 76686.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
+ "3 38959.0 22934.0 10147.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
+ "4 10288.0 12524.0 13878.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
+ "... ... ... ... ... ... ... ... ... ... ... ... \n",
+ "43344 21388.0 10245.0 92067.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
+ "43345 10163.0 11062.0 276.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
+ "43346 21899.0 11861.0 10561.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
+ "43347 10115.0 19830.0 26083.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
+ "43348 17145.0 10710.0 39125.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
+ "\n",
+ " labels Anomaly Anomaly_Score \n",
+ "0 0.0 0 -7.719921e-22 \n",
+ "1 0.0 0 -4.030618e-21 \n",
+ "2 0.0 0 -3.558939e-22 \n",
+ "3 0.0 0 -2.895136e-22 \n",
+ "4 0.0 0 -4.832515e-21 \n",
+ "... ... ... ... \n",
+ "43344 1.0 0 -7.984637e-25 \n",
+ "43345 1.0 0 -1.059387e-22 \n",
+ "43346 1.0 0 -3.592603e-22 \n",
+ "43347 1.0 0 -2.226002e-21 \n",
+ "43348 1.0 0 -2.864757e-23 \n",
+ "\n",
+ "[43349 rows x 67 columns]"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Fit PyCaret's 'abod' anomaly detector (presumably on the data registered by an earlier setup() call).\n",
+ "abod = create_model('abod')\n",
+ "# Result of assign_model for the fitted detector; not used again in this section.\n",
+ "abod_anomalies = assign_model(abod)\n",
+ "\n",
+ "# Score every column except the free-text 'tweet' column (token columns plus 'labels').\n",
+ "abod_df = embeding_df.drop(columns=['tweet'])\n",
+ "abod_pred = predict_model(abod, data=abod_df)\n",
+ "abod_pred"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0 43349\n",
+ "Name: Anomaly, dtype: int64"
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "abod_pred['Anomaly'].value_counts()"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Cluster (Clustering-Based Local Outlier) detection"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " Initiated | \n",
+ " . . . . . . . . . . . . . . . . . . | \n",
+ " 16:19:48 | \n",
+ "
\n",
+ " \n",
+ " Status | \n",
+ " . . . . . . . . . . . . . . . . . . | \n",
+ " Fitting 0.05 Fraction | \n",
+ "
\n",
+ " \n",
+ " Estimator | \n",
+ " . . . . . . . . . . . . . . . . . . | \n",
+ " Clustering-Based Local Outlier | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " \n",
+ " \n",
+ "Initiated . . . . . . . . . . . . . . . . . . 16:19:48\n",
+ "Status . . . . . . . . . . . . . . . . . . Fitting 0.05 Fraction\n",
+ "Estimator . . . . . . . . . . . . . . . . . . Clustering-Based Local Outlier"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 4 | \n",
+ " 5 | \n",
+ " 6 | \n",
+ " 7 | \n",
+ " 8 | \n",
+ " 9 | \n",
+ " ... | \n",
+ " 57 | \n",
+ " 58 | \n",
+ " 59 | \n",
+ " 60 | \n",
+ " 61 | \n",
+ " 62 | \n",
+ " 63 | \n",
+ " labels | \n",
+ " Anomaly | \n",
+ " Anomaly_Score | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 101.0 | \n",
+ " 10110.0 | \n",
+ " 175.0 | \n",
+ " 78653.0 | \n",
+ " 189.0 | \n",
+ " 25285.0 | \n",
+ " 15976.0 | \n",
+ " 40840.0 | \n",
+ " 276.0 | \n",
+ " 31623.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0 | \n",
+ " 123828.159076 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 101.0 | \n",
+ " 11589.0 | \n",
+ " 10706.0 | \n",
+ " 10713.0 | \n",
+ " 10794.0 | \n",
+ " 94698.0 | \n",
+ " 30668.0 | \n",
+ " 24883.0 | \n",
+ " 117.0 | \n",
+ " 23763.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0 | \n",
+ " 112972.396566 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 101.0 | \n",
+ " 148.0 | \n",
+ " 30471.0 | \n",
+ " 10774.0 | \n",
+ " 13785.0 | \n",
+ " 13779.0 | \n",
+ " 33642.0 | \n",
+ " 14399.0 | \n",
+ " 48271.0 | \n",
+ " 76686.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0 | \n",
+ " 145701.165368 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 101.0 | \n",
+ " 19319.0 | \n",
+ " 16724.0 | \n",
+ " 10118.0 | \n",
+ " 10107.0 | \n",
+ " 78323.0 | \n",
+ " 12407.0 | \n",
+ " 38959.0 | \n",
+ " 22934.0 | \n",
+ " 10147.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0 | \n",
+ " 141686.216880 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 101.0 | \n",
+ " 30932.0 | \n",
+ " 58706.0 | \n",
+ " 58054.0 | \n",
+ " 44907.0 | \n",
+ " 10224.0 | \n",
+ " 106583.0 | \n",
+ " 10288.0 | \n",
+ " 12524.0 | \n",
+ " 13878.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0 | \n",
+ " 101399.757887 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 43344 | \n",
+ " 101.0 | \n",
+ " 20065.0 | \n",
+ " 10161.0 | \n",
+ " 115.0 | \n",
+ " 115.0 | \n",
+ " 103784.0 | \n",
+ " 10774.0 | \n",
+ " 21388.0 | \n",
+ " 10245.0 | \n",
+ " 92067.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0 | \n",
+ " 193403.127721 | \n",
+ "
\n",
+ " \n",
+ " 43345 | \n",
+ " 101.0 | \n",
+ " 139.0 | \n",
+ " 80839.0 | \n",
+ " 24109.0 | \n",
+ " 13406.0 | \n",
+ " 18985.0 | \n",
+ " 16285.0 | \n",
+ " 10163.0 | \n",
+ " 11062.0 | \n",
+ " 276.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0 | \n",
+ " 154821.530684 | \n",
+ "
\n",
+ " \n",
+ " 43346 | \n",
+ " 101.0 | \n",
+ " 105549.0 | \n",
+ " 102635.0 | \n",
+ " 10140.0 | \n",
+ " 26943.0 | \n",
+ " 11499.0 | \n",
+ " 110516.0 | \n",
+ " 21899.0 | \n",
+ " 11861.0 | \n",
+ " 10561.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0 | \n",
+ " 166024.182457 | \n",
+ "
\n",
+ " \n",
+ " 43347 | \n",
+ " 101.0 | \n",
+ " 81424.0 | \n",
+ " 26398.0 | \n",
+ " 92017.0 | \n",
+ " 109620.0 | \n",
+ " 10941.0 | \n",
+ " 76010.0 | \n",
+ " 10115.0 | \n",
+ " 19830.0 | \n",
+ " 26083.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0 | \n",
+ " 130852.856537 | \n",
+ "
\n",
+ " \n",
+ " 43348 | \n",
+ " 101.0 | \n",
+ " 39774.0 | \n",
+ " 11127.0 | \n",
+ " 45989.0 | \n",
+ " 24596.0 | \n",
+ " 11933.0 | \n",
+ " 170.0 | \n",
+ " 17145.0 | \n",
+ " 10710.0 | \n",
+ " 39125.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0 | \n",
+ " 206109.572124 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
43349 rows × 67 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " 0 1 2 3 4 5 6 \\\n",
+ "0 101.0 10110.0 175.0 78653.0 189.0 25285.0 15976.0 \n",
+ "1 101.0 11589.0 10706.0 10713.0 10794.0 94698.0 30668.0 \n",
+ "2 101.0 148.0 30471.0 10774.0 13785.0 13779.0 33642.0 \n",
+ "3 101.0 19319.0 16724.0 10118.0 10107.0 78323.0 12407.0 \n",
+ "4 101.0 30932.0 58706.0 58054.0 44907.0 10224.0 106583.0 \n",
+ "... ... ... ... ... ... ... ... \n",
+ "43344 101.0 20065.0 10161.0 115.0 115.0 103784.0 10774.0 \n",
+ "43345 101.0 139.0 80839.0 24109.0 13406.0 18985.0 16285.0 \n",
+ "43346 101.0 105549.0 102635.0 10140.0 26943.0 11499.0 110516.0 \n",
+ "43347 101.0 81424.0 26398.0 92017.0 109620.0 10941.0 76010.0 \n",
+ "43348 101.0 39774.0 11127.0 45989.0 24596.0 11933.0 170.0 \n",
+ "\n",
+ " 7 8 9 ... 57 58 59 60 61 62 63 \\\n",
+ "0 40840.0 276.0 31623.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
+ "1 24883.0 117.0 23763.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
+ "2 14399.0 48271.0 76686.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
+ "3 38959.0 22934.0 10147.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
+ "4 10288.0 12524.0 13878.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
+ "... ... ... ... ... ... ... ... ... ... ... ... \n",
+ "43344 21388.0 10245.0 92067.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
+ "43345 10163.0 11062.0 276.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
+ "43346 21899.0 11861.0 10561.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
+ "43347 10115.0 19830.0 26083.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
+ "43348 17145.0 10710.0 39125.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
+ "\n",
+ " labels Anomaly Anomaly_Score \n",
+ "0 0.0 0 123828.159076 \n",
+ "1 0.0 0 112972.396566 \n",
+ "2 0.0 0 145701.165368 \n",
+ "3 0.0 0 141686.216880 \n",
+ "4 0.0 0 101399.757887 \n",
+ "... ... ... ... \n",
+ "43344 1.0 0 193403.127721 \n",
+ "43345 1.0 0 154821.530684 \n",
+ "43346 1.0 0 166024.182457 \n",
+ "43347 1.0 0 130852.856537 \n",
+ "43348 1.0 0 206109.572124 \n",
+ "\n",
+ "[43349 rows x 67 columns]"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Feature frame for scoring: embeding_df without the raw tweet text column.\n",
+ "cluster_df = embeding_df.drop(columns=['tweet'])\n",
+ "# Build the 'cluster' detector with pycaret and keep what assign_model returns for it.\n",
+ "cluster = create_model('cluster')\n",
+ "cluster_anomalies = assign_model(cluster)\n",
+ "# Score cluster_df; the result carries the Anomaly and Anomaly_Score columns.\n",
+ "cluster_pred = predict_model(cluster, data=cluster_df)\n",
+ "cluster_pred"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0 41182\n",
+ "1 2167\n",
+ "Name: Anomaly, dtype: int64"
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Count how many rows received each Anomaly flag from the cluster model.\n",
+ "cluster_pred.loc[:, 'Anomaly'].value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Row labels that the cluster model flagged with Anomaly == 1.\n",
+ "# Assumes cluster_pred still shares its index with embeding_df - TODO confirm.\n",
+ "cluster_outlier_idx = cluster_pred.loc[cluster_pred['Anomaly'] == 1].index\n",
+ "# errors='ignore' keeps this cell re-runnable: once the rows are gone a plain drop raises KeyError.\n",
+ "# NOTE: embeding_df is changed in place, so the model cells below read the filtered frame.\n",
+ "embeding_df.drop(index=cluster_outlier_idx, inplace=True, errors='ignore')\n",
+ "# Export the remaining tweets together with their labels.\n",
+ "# NOTE(review): 'subtas_a' looks like a typo for 'subtask_a' - confirm with the consumers of this CSV before renaming.\n",
+ "df = embeding_df[['tweet', 'labels']].rename(columns={'labels': 'subtas_a'})\n",
+ "df.to_csv('cluster_outliers.csv')"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# cof"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Feature frame for scoring: embeding_df without the raw tweet text column.\n",
+ "cof_df = embeding_df.drop(columns=['tweet'])\n",
+ "# Build the 'cof' detector with pycaret and keep what assign_model returns for it.\n",
+ "cof = create_model('cof')\n",
+ "cof_anomalies = assign_model(cof)\n",
+ "# Score cof_df and show the result as the cell output.\n",
+ "cof_pred = predict_model(cof, data=cof_df)\n",
+ "cof_pred"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Count how many rows received each Anomaly flag from the cof model.\n",
+ "cof_pred.loc[:, 'Anomaly'].value_counts()"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# histogram"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Feature frame for scoring: embeding_df without the raw tweet text column.\n",
+ "histogram_df = embeding_df.drop(columns=['tweet'])\n",
+ "# Build the 'histogram' detector with pycaret and keep what assign_model returns for it.\n",
+ "histogram = create_model('histogram')\n",
+ "histogram_anomalies = assign_model(histogram)\n",
+ "# Score histogram_df and show the result as the cell output.\n",
+ "histogram_pred = predict_model(histogram, data=histogram_df)\n",
+ "histogram_pred"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Count how many rows received each Anomaly flag from the histogram model.\n",
+ "histogram_pred.loc[:, 'Anomaly'].value_counts()"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# lof"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Feature frame for scoring: embeding_df without the raw tweet text column.\n",
+ "lof_df = embeding_df.drop(columns=['tweet'])\n",
+ "# Build the 'lof' detector with pycaret and keep what assign_model returns for it.\n",
+ "lof = create_model('lof')\n",
+ "lof_anomalies = assign_model(lof)\n",
+ "# Score lof_df and show the result as the cell output.\n",
+ "lof_pred = predict_model(lof, data=lof_df)\n",
+ "lof_pred"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Count how many rows received each Anomaly flag from the lof model.\n",
+ "lof_pred.loc[:, 'Anomaly'].value_counts()"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# svm"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "lof = create_model('lof')\n",
+ "lof_anomalies = assign_model(lof)\n",
+ "lof_df=embeding_df.drop(['tweet'], axis=1)\n",
+ "lof_pred = predict_model(lof, data=lof_df)\n",
+ "lof_pred"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "lof_pred['Anomaly'].value_counts()"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# mcd"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Feature frame for scoring: embeding_df without the raw tweet text column.\n",
+ "mcd_df = embeding_df.drop(columns=['tweet'])\n",
+ "# Build the 'mcd' detector with pycaret and keep what assign_model returns for it.\n",
+ "mcd = create_model('mcd')\n",
+ "mcd_anomalies = assign_model(mcd)\n",
+ "# Score mcd_df and show the result as the cell output.\n",
+ "mcd_pred = predict_model(mcd, data=mcd_df)\n",
+ "mcd_pred"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Count how many rows received each Anomaly flag from the mcd model.\n",
+ "mcd_pred.loc[:, 'Anomaly'].value_counts()"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# sod"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Feature frame for scoring: embeding_df without the raw tweet text column.\n",
+ "sod_df = embeding_df.drop(columns=['tweet'])\n",
+ "# Build the 'sod' detector with pycaret and keep what assign_model returns for it.\n",
+ "sod = create_model('sod')\n",
+ "sod_anomalies = assign_model(sod)\n",
+ "# Score sod_df and show the result as the cell output.\n",
+ "sod_pred = predict_model(sod, data=sod_df)\n",
+ "sod_pred"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Count how many rows received each Anomaly flag from the sod model.\n",
+ "sod_pred.loc[:, 'Anomaly'].value_counts()"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# sos"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Feature frame for scoring: embeding_df without the raw tweet text column.\n",
+ "sos_df = embeding_df.drop(columns=['tweet'])\n",
+ "# Build the 'sos' detector with pycaret and keep what assign_model returns for it.\n",
+ "sos = create_model('sos')\n",
+ "sos_anomalies = assign_model(sos)\n",
+ "# Score sos_df and show the result as the cell output.\n",
+ "sos_pred = predict_model(sos, data=sos_df)\n",
+ "sos_pred"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Count how many rows received each Anomaly flag from the sos model.\n",
+ "sos_pred.loc[:, 'Anomaly'].value_counts()"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "dl_env",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.0"
+ },
+ "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}