In [None]:
# Reload the autoreload extension and use mode 2, so edits to imported modules
# are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [None]:
from pathlib import Path
import pandas as pd
import shutil
from sklearn.model_selection import train_test_split

In [None]:
def copy_images(
    src_dir: Path,
    des_dir: Path,
    ids_with_plots: list,
    delete_existing_files: bool = False,
):
    """Copy the posters whose id appears in ``ids_with_plots`` into ``des_dir``.

    ``src_dir`` is searched recursively. A file is copied when its suffix is
    ``.jpg``, ``.jpeg`` or ``.png`` (case-sensitive) and its stem (the file
    name without the extension) is one of ``ids_with_plots``. Copies are
    placed directly in ``des_dir`` under their original file name, so two
    posters with the same name in different sub-folders overwrite each other.

    Args:
        src_dir: Root folder that is searched for poster images.
        des_dir: Folder that receives the copies; created when missing.
        ids_with_plots: Ids to keep, compared against the file stems. The
            ids must be hashable (e.g. strings).
        delete_existing_files: When True, ``des_dir`` and everything in it is
            removed before copying.

    Returns:
        A list of ``(id, filename)`` tuples, one per copied image.
    """
    image_suffixes = {".jpg", ".jpeg", ".png"}
    # Set lookup instead of scanning the whole id list once per image file.
    wanted_ids = set(ids_with_plots)

    # rmtree raises FileNotFoundError for a missing folder, so only wipe a
    # destination that is actually there.
    if delete_existing_files and des_dir.exists():
        shutil.rmtree(des_dir)

    des_dir.mkdir(parents=True, exist_ok=True)

    images_list = []
    for f in src_dir.rglob("*"):
        # Best effort: report a file that cannot be inspected or copied and
        # carry on with the remaining files.
        try:
            if not (f.is_file() and f.suffix in image_suffixes):
                continue
            img_name = f.name
            image_id = f.stem
            if image_id in wanted_ids:
                shutil.copy(f, des_dir / img_name)
                images_list.append((image_id, img_name))
        except Exception as e:
            print(f, e)
    return images_list

In [None]:
# Absolute location of the dataset folder (resolved against the current working
# directory) and of the sub-folder that holds the poster copies.
data_dir = Path("datasets").resolve()
images_dir = data_dir.joinpath("images")

In [None]:
# Read only the id and plot description of each movie, give the two columns
# shorter names, and discard the movies that have no description.
movies_csv = data_dir / "IMDb movies.csv"
column_names = {"imdb_title_id": "id", "description": "text"}
movies_df = (
    pd.read_csv(movies_csv, usecols=list(column_names))
    .rename(columns=column_names)
    .dropna(subset=["text"])
)
movies_df.head()


In [None]:
# Ids of all movies that still have a description after the cleaning above.
ids_with_plots = movies_df["id"].tolist()

In [None]:
# Copy every poster whose id has a plot description into the images folder.
images_list = copy_images(data_dir / "Poster", images_dir, ids_with_plots)
# Peek at the first (id, filename) entry with a slice: indexing with [0] raises
# IndexError when no poster matched.
images_list[:1]

In [None]:
# One row per copied poster: the movie id and the poster's file name.
poster_columns = ["id", "filename"]
images_df = pd.DataFrame(data=images_list, columns=poster_columns)
images_df.head()

In [None]:
# Join descriptions and posters on the movie id. The default inner join keeps
# only the movies that appear in both tables.
data_df = movies_df.merge(images_df, on="id")
print(len(data_df))
data_df

In [None]:
# Row count before and after removing the rows that have no poster file name.
print(len(data_df))
data_df = data_df.dropna(subset=["filename"])
print(len(data_df))

In [None]:
# Row count before and after removing the rows that have no description text.
print(len(data_df))
data_df = data_df.dropna(subset=["text"])
print(len(data_df))

In [None]:
# Row count before and after keeping only the first row of every movie id.
print(len(data_df))
data_df = data_df.drop_duplicates(subset=["id"])
print(len(data_df))

In [None]:
# Save the cleaned table next to the raw data, without the index column.
data_csv = data_dir / "data.csv"
data_df.to_csv(data_csv, index=False)

In [None]:
# Fixed seed: without it the shuffle differs on every run, so train.csv and
# valid.csv would change each time the notebook is re-executed.
SPLIT_SEED = 42

# Hold out 10% of the rows for validation and write both parts to disk.
train_df, valid_df = train_test_split(
    data_df, test_size=0.1, shuffle=True, random_state=SPLIT_SEED
)
train_df.to_csv(data_dir / "train.csv", index=False)
valid_df.to_csv(data_dir / "valid.csv", index=False)
print(len(train_df), len(valid_df))