{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "aqGqpcYIpf_q", "outputId": "a6d87acc-4df9-4abf-973c-c791c5461af9" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Downloading deceptive-opinion-spam-corpus.zip to /content\n", "\r 0% 0.00/456k [00:00, ?B/s]\n", "\r100% 456k/456k [00:00<00:00, 111MB/s]\n" ] } ], "source": [ "!kaggle datasets download -d rtatman/deceptive-opinion-spam-corpus" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "zu4NTMHapmms" }, "outputs": [], "source": [ "import zipfile\n", "zip_ref = zipfile.ZipFile('/content/deceptive-opinion-spam-corpus.zip', 'r')\n", "zip_ref.extractall('/content')\n", "zip_ref.close()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "3hIl0gHep9q6" }, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "from keras.preprocessing import sequence\n", "from keras.layers import TimeDistributed, GlobalAveragePooling1D, GlobalAveragePooling2D, BatchNormalization\n", "from keras.layers import LSTM\n", "from keras.layers import Conv1D, MaxPooling1D, Conv2D, MaxPooling2D, AveragePooling1D\n", "from keras.layers import Embedding\n", "from keras.layers import Dropout, Flatten, Bidirectional, Dense, Activation, TimeDistributed\n", "from keras.models import Model, Sequential\n", "from tensorflow.keras.utils import to_categorical\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.preprocessing import LabelEncoder\n", "from nltk.corpus import stopwords\n", "from nltk.tokenize import word_tokenize, sent_tokenize\n", "from nltk.stem.wordnet import WordNetLemmatizer\n", "from string import ascii_lowercase\n", "from collections import Counter\n", "from gensim.models import Word2Vec\n", "from gensim.models import Doc2Vec\n", "from gensim.models import doc2vec\n", "from gensim.models import KeyedVectors\n", "import itertools, nltk, snowballstemmer, re\n", "import random\n", "\n", "TaggedDocument = doc2vec.TaggedDocument" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "fzwpJb7EqCjc" }, "outputs": [], "source": [ "class LabeledLineSentence(object):\n", " def __init__(self, sources):\n", " self.sources = sources\n", "\n", " flipped = {}\n", "\n", " for key, value in sources.items():\n", " if value not in flipped:\n", " flipped[value] = [key]\n", " else:\n", " raise Exception('Non-unique prefix encountered')\n", "\n", " def __iter__(self):\n", " for source, prefix in self.sources.items():\n", " with utils.smart_open(source) as fin:\n", " for item_no, line in enumerate(fin):\n", " yield TaggedDocument(utils.to_unicode(line).split(), [prefix + '_%s' % item_no])\n", "\n", " def to_array(self):\n", " self.sentences = []\n", " for source, prefix in self.sources.items():\n", " with utils.smart_open(source) as fin:\n", " for item_no, line in enumerate(fin):\n", " self.sentences.append(TaggedDocument(utils.to_unicode(line).split(), [prefix + '_%s' % item_no]))\n", " return self.sentences\n", "\n", " def sentences_perm(self):\n", " shuffled = list(self.sentences)\n", " random.shuffle(shuffled)\n", " return shuffled" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "e_auMclPqmrI" }, "outputs": [], "source": [ "data = pd.read_csv(\"/content/deceptive-opinion.csv\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "G-EIqcKVrbEl" }, "outputs": [], "source": [ "data['polarity'] = np.where(data['polarity']=='positive', 1, 0)\n", "data['deceptive'] = np.where(data['deceptive']=='truthful', 1, 0)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 300 }, "id": "2REEjGj9rck1", "outputId": "a5679d0f-f0b9-4c21-8005-42058e2cc4fc" }, "outputs": [ { "data": { "text/html": [ "\n", "
\n", " | deceptive | \n", "polarity | \n", "
---|---|---|
count | \n", "1600.000000 | \n", "1600.000000 | \n", "
mean | \n", "0.500000 | \n", "0.500000 | \n", "
std | \n", "0.500156 | \n", "0.500156 | \n", "
min | \n", "0.000000 | \n", "0.000000 | \n", "
25% | \n", "0.000000 | \n", "0.000000 | \n", "
50% | \n", "0.500000 | \n", "0.500000 | \n", "
75% | \n", "1.000000 | \n", "1.000000 | \n", "
max | \n", "1.000000 | \n", "1.000000 | \n", "