{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "aqGqpcYIpf_q", "outputId": "a6d87acc-4df9-4abf-973c-c791c5461af9" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Downloading deceptive-opinion-spam-corpus.zip to /content\n", "\r 0% 0.00/456k [00:00\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
deceptivepolarity
count1600.0000001600.000000
mean0.5000000.500000
std0.5001560.500156
min0.0000000.000000
25%0.0000000.000000
50%0.5000000.500000
75%1.0000001.000000
max1.0000001.000000
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "\n", "
# %% Shuffle a copy of the corpus and summarise its two binary label columns
df = data.sample(frac=1)
df.describe()

# %% Combine `polarity` and `deceptive` into joint targets

# (polarity, deceptive) -> (two-flag label, class name).
# The class names are <deceptive flag>_<polarity flag>: the TRUE/FALSE prefix
# follows `deceptive` (1 -> TRUE) and the POSITIVE/NEGATIVE suffix follows
# `polarity` (1 -> POSITIVE).  They are NOT confusion-matrix terms.
_JOINT_CLASSES = {
    (1, 1): ((1, 1), "TRUE_POSITIVE"),
    (1, 0): ((1, 0), "FALSE_POSITIVE"),
    (0, 1): ((0, 1), "TRUE_NEGATIVE"),
}
# Every other combination, (0, 0) included, maps here.
_FALLBACK_CLASS = ((0, 0), "FALSE_NEGATIVE")


def _joint_class(c):
    """Return the ``(flags, name)`` entry for row ``c``'s polarity/deceptive pair."""
    return _JOINT_CLASSES.get((c['polarity'], c['deceptive']), _FALLBACK_CLASS)


def create_class(c):
    """Return the row's label as a two-element list ``[polarity, deceptive]``.

    ``c`` is any mapping-like row exposing ``'polarity'`` and ``'deceptive'``.
    Pairs other than (1, 1), (1, 0) and (0, 1) yield ``[0, 0]``.
    """
    # A new list per call so rows never share one mutable object.
    return list(_joint_class(c)[0])


def specific_class(c):
    """Return the row's four-way class name.

    One of ``"TRUE_POSITIVE"``, ``"FALSE_POSITIVE"``, ``"TRUE_NEGATIVE"`` or
    ``"FALSE_NEGATIVE"`` (the last is also the fallback for any unrecognised
    pair).
    """
    return _joint_class(c)[1]


data['final_class'] = data.apply(create_class, axis=1)
data['given_class'] = data.apply(specific_class, axis=1)

# %% Replace the class names with integer ids
from sklearn import preprocessing

label_encoder = preprocessing.LabelEncoder()

# LabelEncoder numbers classes in sorted order, so the ids are
# FALSE_NEGATIVE=0, FALSE_POSITIVE=1, TRUE_NEGATIVE=2, TRUE_POSITIVE=3
# (not the order in which the branches above are written).
data['given_class'] = label_encoder.fit_transform(data['given_class'])
# %% Inspect the two-flag labels
data['final_class']

# %% One-hot encode the four-way class
Y = data['given_class']
encoder = LabelEncoder()
encoder.fit(Y)
encoded_Y = encoder.transform(Y)
# One column per class id; this is the target matrix used for training.
dummy_y = to_categorical(encoded_Y)

# %%
dummy_y

# %% Raw review text as a single-column frame (column label 0)
textData = pd.DataFrame(list(data['text']))

# %% Build the stop-word set (raw words plus their stems)
# The corpus must be present before `stopwords.words` is called; on a fresh
# runtime it is not, and the lookup raises LookupError.
nltk.download('stopwords')

stemmer = snowballstemmer.EnglishStemmer()

stop = stopwords.words('english')
stop.extend(
    ['may', 'also', 'zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven',
     'eight', 'nine', 'ten', 'across', 'among', 'beside', 'however', 'yet', 'within']
    + list(ascii_lowercase)
)
# `stoplist` holds the stemmed forms only; `stop` is the union of raw and
# stemmed forms, so either spelling is filtered out.
stoplist = set(stemmer.stemWords(stop))
stop = set(stop) | stoplist
# %%
nltk.download('stopwords')

# %% Strip punctuation/digits and drop stop words
# Character class of punctuation, digits and typographic quotes that are
# replaced by a space.  Raw string: the pattern contains regex escapes
# (\[ \] \\) that are invalid escape sequences in a normal string literal.
PUNCT_AND_DIGITS_PATTERN = r'[!"#%\'()*+,-./:;<=>?@\[\]^_`{|}~1234567890’”“′‘\\]'

# Assign the result back instead of `textData[0].replace(..., inplace=True)`:
# an in-place call on the selected column is a chained operation and does not
# update `textData` when pandas copy-on-write is active.
textData[0] = textData[0].replace(PUNCT_AND_DIGITS_PATTERN, ' ', regex=True)

# Lazy iterator over the distinct non-empty tokens of the cleaned text.
wordlist = filter(
    None,
    " ".join(set(itertools.chain.from_iterable(textData[0].str.split(' ')))).split(" "),
)

data['stemmed_text_data'] = [
    ' '.join(word for word in line if word and word not in stop)
    for line in textData[0].str.lower().str.split(' ')
]

# %% Drop rare words, then stem
minimum_count = 1  # words seen fewer times than this are removed (1 removes nothing)

_word_counts = Counter(
    word
    for line in data['stemmed_text_data'].str.split(' ')
    for word in line
    if word
)
str_frequencies = pd.DataFrame(list(_word_counts.items()), columns=['word', 'count'])
low_frequency_words = set(
    str_frequencies.loc[str_frequencies['count'] < minimum_count, 'word']
)

data['stemmed_text_data'] = [
    ' '.join(word for word in line if word and word not in low_frequency_words)
    for line in data['stemmed_text_data'].str.split(' ')
]
data['stemmed_text_data'] = [
    " ".join(stemmer.stemWords(re.sub(PUNCT_AND_DIGITS_PATTERN, ' ', next_text).split(' ')))
    for next_text in data['stemmed_text_data']
]

# %% Doc2Vec helpers
lmtzr = WordNetLemmatizer()
w = re.compile(r"\w+", re.I)


def label_sentences(df, input_point):
    """Tokenise one text column into gensim ``TaggedDocument`` objects.

    Parameters
    ----------
    df : DataFrame
        Frame whose column ``input_point`` holds the text.
    input_point : str
        Name of the text column.

    Returns
    -------
    (labeled_sentences, list_sen)
        ``labeled_sentences`` is a list of ``TaggedDocument`` whose single tag
        is ``'SENT_<index label>'``; ``list_sen`` is the matching list of
        lower-cased ``\\w+`` token lists.
    """
    labeled_sentences = []
    list_sen = []
    for index, datapoint in df.iterrows():
        tokenized_words = w.findall(datapoint[input_point].lower())
        labeled_sentences.append(
            TaggedDocument(words=tokenized_words, tags=['SENT_%s' % index])
        )
        list_sen.append(tokenized_words)
    return labeled_sentences, list_sen


def train_doc2vec_model(labeled_sentences):
    """Train a 512-dimensional Doc2Vec model on ``labeled_sentences``.

    Returns the trained ``Doc2Vec`` instance.
    """
    model = Doc2Vec(min_count=1, window=9, vector_size=512, sample=1e-4, negative=5, workers=7)
    model.build_vocab(labeled_sentences)
    # `total_examples` is the number of documents, which `build_vocab` records
    # as `corpus_count`.  Passing the vocabulary size instead gives gensim the
    # wrong corpus length for its learning-rate schedule.
    model.train(labeled_sentences, total_examples=model.corpus_count, epochs=400)

    return model


# %% Tag every cleaned review
# reset_index gives a 0..n-1 index, so the tags are SENT_0 .. SENT_{n-1}.
textData = data['stemmed_text_data'].to_frame().reset_index()
sen, corpus = label_sentences(textData, 'stemmed_text_data')

# %% Peek at the first few tagged documents rather than dumping all of them
sen[:3]

# %%
doc2vec_model = train_doc2vec_model(sen)

# %%
doc2vec_model.save("doc2vec_model_opinion_corpus.d2v")

# %%
doc2vec_model = Doc2Vec.load("doc2vec_model_opinion_corpus.d2v")
# %% TF-IDF over the pre-tokenised corpus, reduced to 512 dimensions
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD


def _identity_tokenizer(tokens):
    """Return ``tokens`` unchanged; the documents are already token lists."""
    return tokens


def _fit_tfidf(documents, max_ngram):
    """Fit a TF-IDF vectoriser on n-grams of length 1..``max_ngram``.

    Returns ``(vectorizer, matrix)``.
    """
    vectorizer = TfidfVectorizer(
        tokenizer=_identity_tokenizer,
        lowercase=False,
        ngram_range=(1, max_ngram),
        # A custom tokenizer makes token_pattern unused; None silences the
        # UserWarning scikit-learn emits otherwise.
        token_pattern=None,
    )
    return vectorizer, vectorizer.fit_transform(documents)


tfidf1, result_train1 = _fit_tfidf(corpus, 1)
tfidf2, result_train2 = _fit_tfidf(corpus, 2)
tfidf3, result_train3 = _fit_tfidf(corpus, 3)

# The same estimator is refitted for each matrix; afterwards `svd` holds the
# fit for `result_train3` only.
svd = TruncatedSVD(n_components=512, n_iter=40, random_state=34)
tfidf_data1 = svd.fit_transform(result_train1)
tfidf_data2 = svd.fit_transform(result_train2)
tfidf_data3 = svd.fit_transform(result_train3)

# %% spaCy part-of-speech, fine-grained tag and dependency sequences
from sklearn.feature_extraction.text import CountVectorizer
import spacy

# Loaded once, after the import (loading before `import spacy` fails on a
# fresh kernel).
nlp = spacy.load("en_core_web_sm")
temp_textData = pd.DataFrame(list(data['text']))

overall_pos_tags_tokens = []  # "<pos>_<tag>" per token
overall_pos = []              # token.pos_ per token
overall_tokens = []           # token.tag_ per token (name kept for later cells)
overall_dep = []              # token.dep_ per token

# Iterate over every review instead of a hard-coded count.
for i in range(len(temp_textData)):
    doc = nlp(temp_textData[0][i])
    overall_pos_tags_tokens.append(["%s_%s" % (token.pos_, token.tag_) for token in doc])
    overall_pos.append([token.pos_ for token in doc])
    overall_tokens.append([token.tag_ for token in doc])
    overall_dep.append([token.dep_ for token in doc])

# %% Peek at the first few tag sequences rather than dumping all of them
overall_tokens[:3]

# %% Count features for each tag sequence, min-max scaled and zero-padded
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MinMaxScaler


def _count_feature_block(token_lists, vectorizer, scaler, width=512):
    """Vectorise, scale and right-pad one family of tag sequences.

    ``vectorizer`` and ``scaler`` are refitted on ``token_lists``.

    Returns ``(dense_counts, scaled, padded)`` where ``padded`` is a float32
    array of shape ``(n_documents, width)`` holding ``scaled`` in its leading
    columns and zeros elsewhere.

    Raises ``ValueError`` if there are more distinct tags than ``width``.
    """
    dense_counts = vectorizer.fit_transform(token_lists).todense()
    scaled = np.asarray(scaler.fit_transform(np.asarray(dense_counts)))
    if scaled.shape[1] > width:
        raise ValueError(
            "%d count features do not fit into %d columns" % (scaled.shape[1], width)
        )
    padded = np.zeros(shape=(scaled.shape[0], width)).astype(np.float32)
    padded[:, :scaled.shape[1]] = scaled
    return dense_counts, scaled, padded


count = CountVectorizer(tokenizer=_identity_tokenizer, lowercase=False, token_pattern=None)
min_max_scaler = MinMaxScaler()

pos_tags_data, normalized_pos_tags_data, final_pos_tags_data = _count_feature_block(
    overall_pos_tags_tokens, count, min_max_scaler)
pos_data, normalized_pos_data, final_pos_data = _count_feature_block(
    overall_pos, count, min_max_scaler)
tokens_data, normalized_tokens_data, final_tokens_data = _count_feature_block(
    overall_tokens, count, min_max_scaler)
dep_data, normalized_dep_data, final_dep_data = _count_feature_block(
    overall_dep, count, min_max_scaler)
# %% Copy the scaled tag counts into the leading columns of the padded arrays
final_pos_tags_data[:normalized_pos_tags_data.shape[0], :normalized_pos_tags_data.shape[1]] = normalized_pos_tags_data
final_pos_data[:normalized_pos_data.shape[0], :normalized_pos_data.shape[1]] = normalized_pos_data
final_tokens_data[:normalized_tokens_data.shape[0], :normalized_tokens_data.shape[1]] = normalized_tokens_data
final_dep_data[:normalized_dep_data.shape[0], :normalized_dep_data.shape[1]] = normalized_dep_data

# %% Token count of every tagged document; the maximum fixes the sequence length
# Despite its name, `maxlength` is the list of all lengths, not the maximum.
maxlength = [len(tagged[0]) for tagged in sen]

print(max(maxlength))

# %% Attach the learned document vector to every review
doc2vec_model = Doc2Vec.load("doc2vec_model_opinion_corpus.d2v")


def vectorize_comments(df, d2v_model):
    """Add a ``vectorized_comments`` column holding each row's document vector.

    Row ``i`` (by position) receives the vector stored under tag ``'SENT_i'``.
    ``df`` is modified in place and also returned.
    """
    # gensim 4 exposes document vectors as `dv`; `docvecs` is the older,
    # deprecated name.  Fall back to it when `dv` is absent.
    doc_vectors = d2v_model.dv if hasattr(d2v_model, 'dv') else d2v_model.docvecs
    df['vectorized_comments'] = [
        doc_vectors['SENT_%s' % i] for i in range(df.shape[0])
    ]

    return df


textData = vectorize_comments(textData, doc2vec_model)
textData.head(2)

# %% Simple 90/10 hold-out split on the document vectors
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate,GridSearchCV

X_train, X_test, y_train, y_test = train_test_split(
    textData["vectorized_comments"].tolist(),
    dummy_y,
    test_size=0.1,
    random_state=56,
)
# %% Document-vector tensors with a leading axis of size 1
# -1 lets numpy infer the row count instead of hard-coding 1600 / 1440 / 160.
X = np.array(textData["vectorized_comments"].tolist()).reshape((1, -1, 512))
y = np.array(dummy_y).reshape((-1, 4))
X_train2 = np.array(X_train).reshape((1, -1, 512))
y_train2 = np.array(y_train).reshape((1, -1, 4))
X_test2 = np.array(X_test).reshape((1, -1, 512))
y_test2 = np.array(y_test).reshape((1, -1, 4))

# %% Ten stratified folds over the reviews
from sklearn.model_selection import StratifiedKFold

# The vectors live on `textData` (added by vectorize_comments); `df` is only a
# shuffled copy of `data` and has no such column.
Xtemp = textData["vectorized_comments"].tolist()
ytemp = data['given_class']
training_indices = []
testing_indices = []

skf = StratifiedKFold(n_splits=10)
skf.get_n_splits(Xtemp, ytemp)

for train_index, test_index in skf.split(Xtemp, ytemp):
    training_indices.append(train_index)
    testing_indices.append(test_index)

# %%
len(testing_indices[2])

# %% Fold sizes (instead of dumping every index array)
[len(fold) for fold in training_indices]

# %%
result_train1

# %% Per-fold feature tensors
# Number of summary rows appended after a document's word vectors.
_N_SUMMARY_ROWS = 10


def _encode_fold(indices, seq_len, dim=512):
    """Build the ``(X, Y)`` tensors for the documents listed in ``indices``.

    For document ``i`` with ``n`` tokens, ``X[row]`` contains:

    * rows ``0 .. n-1``   - the model vector of each token, in order;
    * rows ``n .. n+2``   - the SVD-reduced TF-IDF vectors (1-, 1-2-, 1-3-gram);
    * rows ``n+3 .. n+5`` - the TF-IDF-weighted sums of the token vectors, one
      per TF-IDF model (each token is looked up as a unigram);
    * rows ``n+6 .. n+9`` - the padded pos_tag, pos, tag and dep count vectors;
    * all remaining rows  - zeros.

    ``Y[row]`` is the one-hot target ``dummy_y[i]``.
    """
    features = np.zeros(shape=(len(indices), seq_len, dim)).astype(np.float32)
    targets = np.zeros(shape=(len(indices), dummy_y.shape[1])).astype(np.float32)
    tfidf_views = (
        (result_train1, tfidf1),
        (result_train2, tfidf2),
        (result_train3, tfidf3),
    )

    for row, i in enumerate(indices):
        words = sen[i][0]
        n_words = len(words)
        weighted_sums = [np.zeros(dim).astype(np.float32) for _ in tfidf_views]

        for j, word in enumerate(words):
            word_vector = doc2vec_model[word]
            features[row, j, :] = word_vector
            for total, (matrix, vectorizer) in zip(weighted_sums, tfidf_views):
                # In-place add so the array held in `weighted_sums` is updated.
                total += matrix[i, vectorizer.vocabulary_[word]] * word_vector

        summary_rows = (
            tfidf_data1[i],
            tfidf_data2[i],
            tfidf_data3[i],
            weighted_sums[0],
            weighted_sums[1],
            weighted_sums[2],
            final_pos_tags_data[i],
            final_pos_data[i],
            final_tokens_data[i],
            final_dep_data[i],
        )
        for offset, vector in enumerate(summary_rows):
            features[row, n_words + offset, :] = vector

        targets[row, :] = dummy_y[i]

    return features, targets


def extractTrainingAndTestingData(givenIndex):
    """Return ``(X_train3, X_test3, Y_train3, Y_test3)`` for fold ``givenIndex``.

    The row sets come from ``training_indices[givenIndex]`` and
    ``testing_indices[givenIndex]``; array sizes follow the fold sizes rather
    than fixed constants.  Every sequence is ``max(maxlength) + 10`` rows long.
    """
    seq_len = max(maxlength) + _N_SUMMARY_ROWS
    X_train3, Y_train3 = _encode_fold(training_indices[givenIndex], seq_len)
    X_test3, Y_test3 = _encode_fold(testing_indices[givenIndex], seq_len)

    return X_train3, X_test3, Y_train3, Y_test3


# %% Network definition
def build_model(sequence_length, embedding_dim=512, n_classes=4):
    """Return a freshly initialised, compiled Conv1D + BiLSTM classifier."""
    net = Sequential()
    net.add(Conv1D(filters=128, kernel_size=9, padding='same', activation='relu',
                   input_shape=(sequence_length, embedding_dim)))
    net.add(Dropout(0.25))
    net.add(MaxPooling1D(pool_size=2))
    net.add(Dropout(0.25))
    net.add(Conv1D(filters=128, kernel_size=7, padding='same', activation='relu'))
    net.add(Dropout(0.25))
    net.add(MaxPooling1D(pool_size=2))
    net.add(Dropout(0.25))
    net.add(Conv1D(filters=128, kernel_size=5, padding='same', activation='relu'))
    net.add(Dropout(0.25))
    net.add(Bidirectional(LSTM(50, dropout=0.25, recurrent_dropout=0.2)))
    net.add(Dense(n_classes, activation='softmax'))
    # One-hot targets with a softmax head call for categorical cross-entropy.
    # With binary cross-entropy the four outputs are scored as independent
    # yes/no decisions, and Keras' 'accuracy' follows that choice.
    net.compile(loss='categorical_crossentropy', optimizer='Adam', metrics=['accuracy'])
    return net


model = build_model(max(maxlength) + _N_SUMMARY_ROWS)
model.summary()

# %% Single-fold run (fold 9)
from sklearn.metrics import accuracy_score
from keras.callbacks import ModelCheckpoint

final_accuracies = []

filename = 'weights.best.from_scratch%s.hdf5' % 9
# NOTE(review): the checkpoint picks the best epoch on the validation data
# passed to fit(), which is the held-out fold itself, so scores on that fold
# are optimistic.
checkpointer = ModelCheckpoint(filepath=filename, verbose=1, save_best_only=True)
X_train3, X_test3, Y_train3, Y_test3 = extractTrainingAndTestingData(9)

# %%
history = model.fit(X_train3, Y_train3, epochs=15, batch_size=512,
                    callbacks=[checkpointer], validation_data=(X_test3, Y_test3), verbose=1)

# %%
model.evaluate(X_test3, Y_test3)

# %%
import matplotlib.pyplot as plt

# %%
model.load_weights(filename)

# %% Ten-fold cross-validation
for i in range(len(training_indices)):
    filename = 'weights.best.from_scratch%s.hdf5' % i
    checkpointer = ModelCheckpoint(filepath=filename, verbose=1, save_best_only=True)
    X_train3, X_test3, Y_train3, Y_test3 = extractTrainingAndTestingData(i)

    # Start every fold from new weights.  A model carried over from another
    # fold has already been trained on rows that form this fold's test set.
    model = build_model(max(maxlength) + _N_SUMMARY_ROWS)
    model.fit(X_train3, Y_train3, epochs=10, batch_size=512,
              callbacks=[checkpointer], validation_data=(X_test3, Y_test3))
    model.load_weights(filename)

    # Predicted class = most probable output.  Rounding the softmax output
    # instead yields an all-zero row whenever no class reaches 0.5.
    predicted = model.predict(X_test3).argmax(axis=1)
    fold_accuracy = accuracy_score(Y_test3.argmax(axis=1), predicted)
    final_accuracies.append(fold_accuracy)
    print(fold_accuracy)

# %%
len(X_test3[0])

# %%
len(Y_test3)

# %% Mean accuracy over the folds
print(sum(final_accuracies) / len(final_accuracies))