{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "5d2e7a10",
   "metadata": {},
   "source": [
    "# Poster and plot dataset preparation\n",
    "\n",
    "Pairs every movie description in `IMDb movies.csv` with its poster image, copies the matching posters from `datasets/Poster` into `datasets/images`, and writes `data.csv`, `train.csv` and `valid.csv` into `datasets`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0fbed7bc",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-12-09T16:46:29.851016Z",
     "start_time": "2021-12-09T16:46:29.841794Z"
    }
   },
   "outputs": [],
   "source": [
    "%reload_ext autoreload\n",
    "%autoreload 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "99d6f14d",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-12-09T16:46:30.336104Z",
     "start_time": "2021-12-09T16:46:29.852308Z"
    }
   },
   "outputs": [],
   "source": [
    "import shutil\n",
    "from pathlib import Path\n",
    "\n",
    "import pandas as pd\n",
    "from sklearn.model_selection import train_test_split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c8fcf96c",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-12-09T16:46:30.349125Z",
     "start_time": "2021-12-09T16:46:30.337223Z"
    },
    "code_folding": []
   },
   "outputs": [],
   "source": [
    "def copy_images(\n",
    "    src_dir: Path,\n",
    "    des_dir: Path,\n",
    "    ids_with_plots: list,\n",
    "    delete_existing_files: bool = False,\n",
    ") -> list:\n",
    "    \"\"\"Copy the posters whose file stem is a wanted id from src_dir into des_dir.\n",
    "\n",
    "    Args:\n",
    "        src_dir: Folder searched recursively for poster images\n",
    "            (.jpg, .jpeg or .png, matched case-insensitively).\n",
    "        des_dir: Flat destination folder; created if it does not exist.\n",
    "        ids_with_plots: Ids to keep; a poster is copied when its file name\n",
    "            without the extension is one of them.\n",
    "        delete_existing_files: If True, remove des_dir and its contents first.\n",
    "\n",
    "    Returns:\n",
    "        A list of (id, filename) tuples, one per copied poster, ordered by\n",
    "        source path.\n",
    "\n",
    "    A poster that cannot be copied is reported with print() and skipped.\n",
    "    Posters that share a file name overwrite each other in des_dir, but each\n",
    "    one is still listed in the result.\n",
    "    \"\"\"\n",
    "    if delete_existing_files and des_dir.exists():\n",
    "        # rmtree raises on a missing folder, so only wipe one that is there.\n",
    "        shutil.rmtree(des_dir)\n",
    "    des_dir.mkdir(parents=True, exist_ok=True)\n",
    "\n",
    "    image_suffixes = (\".jpg\", \".jpeg\", \".png\")\n",
    "    # A set gives constant-time lookups instead of scanning the list per file.\n",
    "    wanted_ids = set(ids_with_plots)\n",
    "    images_list = []\n",
    "    # sorted() lists the tree before anything is copied and makes the result\n",
    "    # order repeatable across runs and file systems.\n",
    "    for f in sorted(src_dir.rglob(\"*\")):\n",
    "        # lower() so that upper-case extensions such as .JPG are picked up too.\n",
    "        if f.suffix.lower() not in image_suffixes or f.stem not in wanted_ids:\n",
    "            continue\n",
    "        try:\n",
    "            if not f.is_file():\n",
    "                continue\n",
    "            shutil.copy(f, des_dir / f.name)\n",
    "        except OSError as e:\n",
    "            # Best effort: report the failing poster and carry on with the rest.\n",
    "            print(f, e)\n",
    "            continue\n",
    "        images_list.append((f.stem, f.name))\n",
    "    return images_list"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a34124b2",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-12-09T16:46:30.359361Z",
     "start_time": "2021-12-09T16:46:30.350299Z"
    }
   },
   "outputs": [],
   "source": [
    "data_dir = Path(\"datasets\").resolve()\n",
    "images_dir = data_dir / \"images\"\n",
    "\n",
    "SEED = 42  # fixes the train/validation split so that re-runs write the same files\n",
    "VALID_FRACTION = 0.1  # share of the rows written to valid.csv"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6c3f8b21",
   "metadata": {},
   "source": [
    "## Load the movie descriptions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8714ea01",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-12-09T16:46:30.781046Z",
     "start_time": "2021-12-09T16:46:30.360608Z"
    }
   },
   "outputs": [],
   "source": [
    "movies_df = (\n",
    "    pd.read_csv(data_dir / \"IMDb movies.csv\", usecols=[\"imdb_title_id\", \"description\"])\n",
    "    .rename(columns={\"imdb_title_id\": \"id\", \"description\": \"text\"})\n",
    "    .dropna(subset=[\"text\"])  # drop rows where text is missing\n",
    ")\n",
    "movies_df.head()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "27f7fd94",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-12-09T16:46:30.792761Z",
     "start_time": "2021-12-09T16:46:30.781964Z"
    }
   },
   "outputs": [],
   "source": [
    "ids_with_plots = movies_df[\"id\"].tolist()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7e4a9c32",
   "metadata": {},
   "source": [
    "## Copy the posters that have a description"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ebaa042a",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-12-09T16:47:04.704390Z",
     "start_time": "2021-12-09T16:46:30.794094Z"
    }
   },
   "outputs": [],
   "source": [
    "images_list = copy_images(data_dir / \"Poster\", images_dir, ids_with_plots)\n",
    "# Fail here with a clear message instead of an IndexError on the next line.\n",
    "assert images_list, \"no poster matched a movie id - check the Poster folder\"\n",
    "images_list[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "17e0a874",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-12-09T16:47:04.724427Z",
     "start_time": "2021-12-09T16:47:04.705540Z"
    }
   },
   "outputs": [],
   "source": [
    "images_df = pd.DataFrame(images_list, columns=[\"id\", \"filename\"])\n",
    "images_df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8f5bad43",
   "metadata": {},
   "source": [
    "## Merge descriptions with posters and clean up"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bb1114e6",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-12-09T16:47:04.772775Z",
     "start_time": "2021-12-09T16:47:04.725707Z"
    }
   },
   "outputs": [],
   "source": [
    "# Inner merge (the pandas default): keeps only ids present in both frames.\n",
    "data_df = pd.merge(movies_df, images_df, on=[\"id\"])\n",
    "print(len(data_df))\n",
    "data_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6790815b",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-12-09T16:47:04.796785Z",
     "start_time": "2021-12-09T16:47:04.774932Z"
    }
   },
   "outputs": [],
   "source": [
    "# After the inner merge every row already has a filename, so this is\n",
    "# normally a no-op kept as a safety net.\n",
    "print(len(data_df))\n",
    "data_df = data_df.dropna(subset=[\"filename\"])\n",
    "print(len(data_df))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "40c7205d",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-12-09T16:47:04.818522Z",
     "start_time": "2021-12-09T16:47:04.798063Z"
    }
   },
   "outputs": [],
   "source": [
    "# Rows without text were already removed before the merge; kept as a safety net.\n",
    "print(len(data_df))\n",
    "data_df = data_df.dropna(subset=[\"text\"])\n",
    "print(len(data_df))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9a2d142f",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-12-09T16:47:04.838450Z",
     "start_time": "2021-12-09T16:47:04.819726Z"
    }
   },
   "outputs": [],
   "source": [
    "# The same id can occur more than once (for example a poster found under two\n",
    "# paths or extensions); keep the first row of each id.\n",
    "print(len(data_df))\n",
    "data_df = data_df.drop_duplicates(subset=[\"id\"])\n",
    "assert data_df[\"id\"].is_unique\n",
    "print(len(data_df))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "45f4b970",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-12-09T16:47:04.971652Z",
     "start_time": "2021-12-09T16:47:04.839618Z"
    }
   },
   "outputs": [],
   "source": [
    "data_df.to_csv(data_dir / \"data.csv\", index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9a6cbe54",
   "metadata": {},
   "source": [
    "## Train / validation split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f8019a02",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-12-09T16:47:05.104710Z",
     "start_time": "2021-12-09T16:47:04.972681Z"
    }
   },
   "outputs": [],
   "source": [
    "# random_state pins the shuffle so train.csv / valid.csv are reproducible.\n",
    "train_df, valid_df = train_test_split(\n",
    "    data_df, test_size=VALID_FRACTION, shuffle=True, random_state=SEED\n",
    ")\n",
    "assert len(train_df) + len(valid_df) == len(data_df)\n",
    "train_df.to_csv(data_dir / \"train.csv\", index=False)\n",
    "valid_df.to_csv(data_dir / \"valid.csv\", index=False)\n",
    "print(len(train_df), len(valid_df))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "huggingface",
   "language": "python",
   "name": "huggingface"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}