ak0601 committed on
Commit
ad3e56b
1 Parent(s): 8e7903b

Upload 3 files

Files changed (3)
  1. model.ipynb +444 -0
  2. story_gen.h5 +3 -0
  3. tokenizer.json +0 -0
model.ipynb ADDED
@@ -0,0 +1,444 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import opendatasets as od"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Downloading mpst-movie-plot-synopses-with-tags.zip to .\\mpst-movie-plot-synopses-with-tags\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "100%|██████████| 28.8M/28.8M [00:07<00:00, 3.81MB/s]\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "od.download('https://www.kaggle.com/datasets/cryptexcode/mpst-movie-plot-synopses-with-tags')"
+ ]
+ },
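+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "*Editor's note (added cell):* `opendatasets` fetches Kaggle datasets through the Kaggle API, so it typically prompts for a Kaggle username and API key (or reads a local `kaggle.json`) before the download above succeeds."
+ ]
+ },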
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "df = pd.read_csv('mpst-movie-plot-synopses-with-tags/mpst_full_data.csv')  # forward slash works on Windows and POSIX alike"
+ ]
+ },
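+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "*Editor's sketch (added cell):* a quick sanity check on the loaded frame (row count, nulls, distinct tag count) before modeling. These are standard pandas calls; the cell was not in the original notebook."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Basic sanity checks on the MPST data (editor's addition)\n",
+ "print(df.shape)                      # (rows, columns)\n",
+ "print(df.isna().sum())               # missing values per column\n",
+ "print(df['tags'].str.split(', ').explode().nunique())  # distinct tags"
+ ]
+ },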
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "<div>\n",
+ "<style scoped>\n",
+ " .dataframe tbody tr th:only-of-type {\n",
+ " vertical-align: middle;\n",
+ " }\n",
+ "\n",
+ " .dataframe tbody tr th {\n",
+ " vertical-align: top;\n",
+ " }\n",
+ "\n",
+ " .dataframe thead th {\n",
+ " text-align: right;\n",
+ " }\n",
+ "</style>\n",
+ "<table border=\"1\" class=\"dataframe\">\n",
+ " <thead>\n",
+ " <tr style=\"text-align: right;\">\n",
+ " <th></th>\n",
+ " <th>imdb_id</th>\n",
+ " <th>title</th>\n",
+ " <th>plot_synopsis</th>\n",
+ " <th>tags</th>\n",
+ " <th>split</th>\n",
+ " <th>synopsis_source</th>\n",
+ " </tr>\n",
+ " </thead>\n",
+ " <tbody>\n",
+ " <tr>\n",
+ " <th>0</th>\n",
+ " <td>tt0057603</td>\n",
+ " <td>I tre volti della paura</td>\n",
+ " <td>Note: this synopsis is for the orginal Italian...</td>\n",
+ " <td>cult, horror, gothic, murder, atmospheric</td>\n",
+ " <td>train</td>\n",
+ " <td>imdb</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>1</th>\n",
+ " <td>tt1733125</td>\n",
+ " <td>Dungeons &amp; Dragons: The Book of Vile Darkness</td>\n",
+ " <td>Two thousand years ago, Nhagruul the Foul, a s...</td>\n",
+ " <td>violence</td>\n",
+ " <td>train</td>\n",
+ " <td>imdb</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>2</th>\n",
+ " <td>tt0033045</td>\n",
+ " <td>The Shop Around the Corner</td>\n",
+ " <td>Matuschek's, a gift store in Budapest, is the ...</td>\n",
+ " <td>romantic</td>\n",
+ " <td>test</td>\n",
+ " <td>imdb</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>3</th>\n",
+ " <td>tt0113862</td>\n",
+ " <td>Mr. Holland's Opus</td>\n",
+ " <td>Glenn Holland, not a morning person by anyone'...</td>\n",
+ " <td>inspiring, romantic, stupid, feel-good</td>\n",
+ " <td>train</td>\n",
+ " <td>imdb</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>4</th>\n",
+ " <td>tt0086250</td>\n",
+ " <td>Scarface</td>\n",
+ " <td>In May 1980, a Cuban man named Tony Montana (A...</td>\n",
+ " <td>cruelty, murder, dramatic, cult, violence, atm...</td>\n",
+ " <td>val</td>\n",
+ " <td>imdb</td>\n",
+ " </tr>\n",
+ " </tbody>\n",
+ "</table>\n",
+ "</div>"
+ ],
+ "text/plain": [
+ " imdb_id title \\\n",
+ "0 tt0057603 I tre volti della paura \n",
+ "1 tt1733125 Dungeons & Dragons: The Book of Vile Darkness \n",
+ "2 tt0033045 The Shop Around the Corner \n",
+ "3 tt0113862 Mr. Holland's Opus \n",
+ "4 tt0086250 Scarface \n",
+ "\n",
+ " plot_synopsis \\\n",
+ "0 Note: this synopsis is for the orginal Italian... \n",
+ "1 Two thousand years ago, Nhagruul the Foul, a s... \n",
+ "2 Matuschek's, a gift store in Budapest, is the ... \n",
+ "3 Glenn Holland, not a morning person by anyone'... \n",
+ "4 In May 1980, a Cuban man named Tony Montana (A... \n",
+ "\n",
+ " tags split synopsis_source \n",
+ "0 cult, horror, gothic, murder, atmospheric train imdb \n",
+ "1 violence train imdb \n",
+ "2 romantic test imdb \n",
+ "3 inspiring, romantic, stupid, feel-good train imdb \n",
+ "4 cruelty, murder, dramatic, cult, violence, atm... val imdb "
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!pip install gpt-2-simple"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Index(['imdb_id', 'title', 'plot_synopsis', 'tags', 'split',\n",
+ " 'synopsis_source'],\n",
+ " dtype='object')"
+ ]
+ },
+ "execution_count": 21,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.columns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 40,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import numpy as np\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "from tensorflow.keras.preprocessing.text import Tokenizer\n",
+ "from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
+ "from tensorflow.keras.models import Sequential\n",
+ "from tensorflow.keras.layers import Embedding, LSTM, Dense, Flatten\n",
+ "from sklearn.preprocessing import MultiLabelBinarizer"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df = df[['title', 'plot_synopsis', 'tags']]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 63,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "tokenizer = Tokenizer()\n",
+ "tokenizer.fit_on_texts(df['title'])\n",
+ "title_sequences = tokenizer.texts_to_sequences(df['title'])\n",
+ "max_title_length = max(len(seq) for seq in title_sequences)\n",
+ "title_sequences = pad_sequences(title_sequences, maxlen=max_title_length)"
+ ]
+ },
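+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "*Editor's sketch (added cell):* a quick look at what the tokenizer produced. `word_index` maps words to integer ids (0 is reserved), and `pad_sequences` left-pads by default. This check cell was not in the original notebook."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Inspect the fitted tokenizer and the padded sequences (editor's addition)\n",
+ "print('vocabulary size:', len(tokenizer.word_index))\n",
+ "print('max title length:', max_title_length)\n",
+ "print('example:', df['title'].iloc[0], '->', title_sequences[0])"
+ ]
+ },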
+ {
+ "cell_type": "code",
+ "execution_count": 64,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "tags = [tag.split(', ') for tag in df['tags']]\n",
+ "mlb = MultiLabelBinarizer()\n",
+ "tags = mlb.fit_transform(tags)"
+ ]
+ },
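+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "*Editor's sketch (added cell):* `MultiLabelBinarizer` turns each comma-separated tag list into a fixed-length 0/1 vector, one column per distinct tag. This check cell was not in the original notebook."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Inspect the binarized labels (editor's addition)\n",
+ "print('number of distinct tags:', len(mlb.classes_))\n",
+ "print('label matrix shape:', tags.shape)\n",
+ "print('first row tags:', mlb.inverse_transform(tags[:1]))"
+ ]
+ },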
+ {
+ "cell_type": "code",
+ "execution_count": 65,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "tokenizer_json = tokenizer.to_json()\n",
+ "with open('tokenizer.json', 'w') as json_file:\n",
+ " json_file.write(tokenizer_json)"
+ ]
+ },
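+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "*Editor's sketch (added cell):* it is worth verifying that the saved tokenizer round-trips; Keras ships `tokenizer_from_json` for exactly this. The cell was not in the original notebook."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Reload the tokenizer and confirm it matches the in-memory one (editor's addition)\n",
+ "from tensorflow.keras.preprocessing.text import tokenizer_from_json\n",
+ "with open('tokenizer.json') as f:\n",
+ " reloaded = tokenizer_from_json(f.read())\n",
+ "assert reloaded.word_index == tokenizer.word_index"
+ ]
+ },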
+ {
+ "cell_type": "code",
+ "execution_count": 42,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "X_train, X_test, y_train, y_test = train_test_split(title_sequences, tags, test_size=0.2, random_state=42)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 43,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "vocab_size = len(tokenizer.word_index) + 1\n",
+ "embedding_dim = 100"
+ ]
+ },
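+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "*Editor's note (added cell):* the `+ 1` accounts for index 0, which Keras' `Tokenizer` never assigns to a word and `pad_sequences` uses for padding, so the `Embedding` layer needs `len(word_index) + 1` rows."
+ ]
+ },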
+ {
+ "cell_type": "code",
+ "execution_count": 46,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Train on 11862 samples, validate on 2966 samples\n",
+ "Epoch 1/15\n",
+ "11862/11862 [==============================] - 10s 826us/sample - loss: 0.1911 - accuracy: 0.9457 - val_loss: 0.1417 - val_accuracy: 0.9569\n",
+ "Epoch 2/15\n",
+ "11862/11862 [==============================] - 11s 887us/sample - loss: 0.1390 - accuracy: 0.9583 - val_loss: 0.1416 - val_accuracy: 0.9569\n",
+ "Epoch 3/15\n",
+ "11862/11862 [==============================] - 11s 941us/sample - loss: 0.1388 - accuracy: 0.9583 - val_loss: 0.1415 - val_accuracy: 0.9569\n",
+ "Epoch 4/15\n",
+ "11862/11862 [==============================] - 11s 916us/sample - loss: 0.1367 - accuracy: 0.9583 - val_loss: 0.1420 - val_accuracy: 0.9568\n",
+ "Epoch 5/15\n",
+ "11862/11862 [==============================] - 11s 906us/sample - loss: 0.1310 - accuracy: 0.9595 - val_loss: 0.1433 - val_accuracy: 0.9567\n",
+ "Epoch 6/15\n",
+ "11862/11862 [==============================] - 11s 909us/sample - loss: 0.1248 - accuracy: 0.9608 - val_loss: 0.1444 - val_accuracy: 0.9569\n",
+ "Epoch 7/15\n",
+ "11862/11862 [==============================] - 11s 911us/sample - loss: 0.1184 - accuracy: 0.9624 - val_loss: 0.1461 - val_accuracy: 0.9564\n",
+ "Epoch 8/15\n",
+ "11862/11862 [==============================] - 11s 948us/sample - loss: 0.1123 - accuracy: 0.9649 - val_loss: 0.1484 - val_accuracy: 0.9562\n",
+ "Epoch 9/15\n",
+ "11862/11862 [==============================] - 11s 916us/sample - loss: 0.1069 - accuracy: 0.9668 - val_loss: 0.1509 - val_accuracy: 0.9552\n",
+ "Epoch 10/15\n",
+ "11862/11862 [==============================] - 11s 921us/sample - loss: 0.1021 - accuracy: 0.9682 - val_loss: 0.1537 - val_accuracy: 0.9550\n",
+ "Epoch 11/15\n",
+ "11862/11862 [==============================] - 11s 932us/sample - loss: 0.0978 - accuracy: 0.9692 - val_loss: 0.1566 - val_accuracy: 0.9541\n",
+ "Epoch 12/15\n",
+ "11862/11862 [==============================] - 11s 927us/sample - loss: 0.0937 - accuracy: 0.9700 - val_loss: 0.1591 - val_accuracy: 0.9540\n",
+ "Epoch 13/15\n",
+ "11862/11862 [==============================] - 11s 927us/sample - loss: 0.0896 - accuracy: 0.9710 - val_loss: 0.1621 - val_accuracy: 0.9536\n",
+ "Epoch 14/15\n",
+ "11862/11862 [==============================] - 11s 954us/sample - loss: 0.0857 - accuracy: 0.9719 - val_loss: 0.1660 - val_accuracy: 0.9536\n",
+ "Epoch 15/15\n",
+ "11862/11862 [==============================] - 12s 1ms/sample - loss: 0.0820 - accuracy: 0.9729 - val_loss: 0.1690 - val_accuracy: 0.9538\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "<keras.callbacks.History at 0x1cc31c0b250>"
+ ]
+ },
+ "execution_count": 46,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "model = Sequential()\n",
+ "model.add(Embedding(vocab_size, embedding_dim, input_length=max_title_length))  # map token ids to dense vectors\n",
+ "model.add(LSTM(100))  # encode the title sequence into a single 100-d state\n",
+ "model.add(Dense(tags.shape[1], activation='sigmoid'))  # one independent probability per tag (multi-label)\n",
+ "\n",
+ "model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])\n",
+ "\n",
+ "model.fit(X_train, y_train, batch_size=64, epochs=15, validation_data=(X_test, y_test))\n"
+ ]
+ },
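+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "*Editor's sketch (added cell):* the log above shows `val_loss` bottoming out around epoch 3 (0.1415) and rising steadily afterwards while training loss keeps falling, i.e. the model starts overfitting. A standard remedy, not used in the original notebook, is early stopping on validation loss:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Stop when val_loss stops improving and restore the best weights (editor's addition)\n",
+ "from tensorflow.keras.callbacks import EarlyStopping\n",
+ "early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)\n",
+ "model.fit(X_train, y_train, batch_size=64, epochs=15,\n",
+ " validation_data=(X_test, y_test), callbacks=[early_stop])"
+ ]
+ },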
+ {
+ "cell_type": "code",
+ "execution_count": 47,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "model.save('story_gen.h5')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 59,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "title = \"An oversized t-shirt\"\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 60,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "title_sequences = tokenizer.texts_to_sequences([title])  # list-wrap: a bare string is split per character\n",
+ "title_sequences = pad_sequences(title_sequences, maxlen=max_title_length)  # match the model's input length"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "predictions = model.predict(title_sequences)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 75,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Input Title: Spider Man\n",
+ "Predicted Tags: [('murder',)]\n"
+ ]
+ }
+ ],
+ "source": [
+ "from tensorflow.keras.models import load_model\n",
+ "from tensorflow.keras.preprocessing.text import tokenizer_from_json  # used below; this import was missing\n",
+ "with open('tokenizer.json', 'r') as f:\n",
+ " tokenizer = tokenizer_from_json(f.read())\n",
+ "\n",
+ "model = load_model('story_gen.h5')\n",
+ "\n",
+ "example_title = \"Spider Man\"\n",
+ "\n",
+ "example_sequence = tokenizer.texts_to_sequences([example_title])\n",
+ "example_sequence = pad_sequences(example_sequence, maxlen=max_title_length)\n",
+ "\n",
+ "predictions = model.predict(example_sequence)\n",
+ "\n",
+ "predicted_tags = mlb.inverse_transform((predictions > 0.5).astype(int))\n",
+ "\n",
+ "print(\"Input Title:\", example_title)\n",
+ "print(\"Predicted Tags:\", predicted_tags)"
+ ]
+ },
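+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "*Editor's sketch (added cell):* a fixed 0.5 threshold often yields zero or one tag for rare labels, as in the `[('murder',)]` output above. An alternative, not in the original notebook, is to take the k highest-probability tags instead."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Top-k decoding instead of a hard threshold (editor's addition)\n",
+ "import numpy as np\n",
+ "k = 3\n",
+ "top_idx = np.argsort(predictions[0])[::-1][:k]   # indices of the k largest scores\n",
+ "top_tags = [(mlb.classes_[i], float(predictions[0][i])) for i in top_idx]\n",
+ "print(\"Top\", k, \"tags:\", top_tags)"
+ ]
+ },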
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "base",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.9"
+ },
+ "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+ }
story_gen.h5 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cb406194f44af2207635abf33c6c54cbbdb4e06ed14bdb3ef434b98fa806ecfb
+ size 15675428
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff