abdullahmeda committed
Commit a4d40bc
Parent: b9309ba
README.md CHANGED
@@ -1,12 +1,37 @@
  ---
- title: Test
- emoji: 🐠
- colorFrom: indigo
- colorTo: blue
  sdk: gradio
- sdk_version: 3.0.20
  app_file: app.py
  pinned: false
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
  ---
+ title: Poster2plot
+ emoji: 🎬
+ colorFrom: purple
+ colorTo: purple
  sdk: gradio
  app_file: app.py
  pinned: false
  ---

+ # Configuration
+
+ `title`: _string_
+ Display title for the Space
+
+ `emoji`: _string_
+ Space emoji (emoji-only character allowed)
+
+ `colorFrom`: _string_
+ Color for the thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray)
+
+ `colorTo`: _string_
+ Color for the thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray)
+
+ `sdk`: _string_
+ Can be either `gradio` or `streamlit`
+
+ `sdk_version`: _string_
+ Only applicable for `streamlit` SDK.
+ See [doc](https://hf.co/docs/hub/spaces) for more info on supported versions.
+
+ `app_file`: _string_
+ Path to your main application file (which contains either `gradio` or `streamlit` Python code).
+ Path is relative to the root of the repository.
+
+ `pinned`: _boolean_
+ Whether the Space stays on top of your list.
app.py ADDED
@@ -0,0 +1,109 @@
+ import os
+ import torch
+ import re
+ import gradio as gr
+ from pathlib import Path
+ from transformers import AutoTokenizer, AutoFeatureExtractor, VisionEncoderDecoderModel
+
+
+ # Pattern to ignore all the text after 2 or more full stops
+ regex_pattern = "[.]{2,}"
+
+
+ def post_process(text):
+     try:
+         text = text.strip()
+         text = re.split(regex_pattern, text)[0]
+     except Exception as e:
+         print(e)
+         pass
+     return text
+
+
+ def set_example_image(example: list) -> dict:
+     return gr.Image.update(value=example[0])
+
+
+ def predict(image, max_length=64, num_beams=4):
+     pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values
+     pixel_values = pixel_values.to(device)
+
+     with torch.no_grad():
+         output_ids = model.generate(
+             pixel_values,
+             max_length=max_length,
+             num_beams=num_beams,
+             return_dict_in_generate=True,
+         ).sequences
+
+     preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
+     pred = post_process(preds[0])
+
+     return pred
+
+
+ model_name_or_path = "deepklarity/poster2plot"
+ device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+
+ # Load model.
+
+ model = VisionEncoderDecoderModel.from_pretrained(model_name_or_path)
+ model.to(device)
+ print("Loaded model")
+
+ feature_extractor = AutoFeatureExtractor.from_pretrained(model.encoder.name_or_path)
+ print("Loaded feature_extractor")
+
+ tokenizer = AutoTokenizer.from_pretrained(model.decoder.name_or_path, use_fast=True)
+ if model.decoder.name_or_path == "gpt2":
+     tokenizer.pad_token = tokenizer.eos_token
+
+ print("Loaded tokenizer")
+
+ title = "Poster2Plot: Upload a movie/TV show poster to generate a plot"
+ description = ""
+
+ input = gr.inputs.Image(type="pil")
+
+ example_images = sorted(
+     [f.as_posix() for f in Path("examples").glob("*.jpg")]
+ )
+ print(f"Loaded {len(example_images)} example images")
+
+ demo = gr.Blocks()
+ filenames = next(os.walk('examples'), (None, None, []))[2]
+ examples = [[f"examples/{filename}"] for filename in filenames]
+ print(examples)
+
+ with demo:
+     with gr.Column():
+         with gr.Row():
+             with gr.Column():
+                 input_image = gr.Image()
+                 with gr.Row():
+                     clear_button = gr.Button(value="Clear", variant='secondary')
+                     submit_button = gr.Button(value="Submit", variant='primary')
+             with gr.Column():
+                 plot = gr.Textbox()
+         with gr.Row():
+             example_images = gr.Dataset(components=[input_image], samples=examples)
+
+     submit_button.click(fn=predict, inputs=[input_image], outputs=[plot])
+     example_images.click(fn=set_example_image, inputs=[example_images], outputs=example_images.components)
+
+ demo.launch()
+
+
+ interface = gr.Interface(
+     fn=predict,
+     inputs=input,
+     outputs="textbox",
+     title=title,
+     description=description,
+     examples=example_images,
+     examples_per_page=20,
+     live=True,
+     article='<p>Made by: <a href="https://twitter.com/kartik_godawat" target="_blank" rel="noopener noreferrer">dk-crazydiv</a> and <a href="https://twitter.com/dsr_ai" target="_blank" rel="noopener noreferrer">dsr</a></p>'
+ )
+
+ interface.launch()
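Note: `demo.launch()` blocks, so the `gr.Interface` defined after it in `app.py` never runs while the Blocks UI is up; `test.py` below is the same script without that dead tail. For readers who want to exercise the model outside Gradio, here is a minimal standalone sketch that mirrors `predict()` and `post_process()` above. The poster path is one of the files bundled under `examples/`; everything else is taken from the script itself.

```python
# Standalone inference sketch (no Gradio), mirroring predict()/post_process() in app.py.
import re

import torch
from PIL import Image
from transformers import AutoTokenizer, AutoFeatureExtractor, VisionEncoderDecoderModel

model = VisionEncoderDecoderModel.from_pretrained("deepklarity/poster2plot")
feature_extractor = AutoFeatureExtractor.from_pretrained(model.encoder.name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model.decoder.name_or_path, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token  # the GPT-2 decoder has no pad token by default

image = Image.open("examples/tt3521164-moana.jpg").convert("RGB")
pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values

with torch.no_grad():
    output_ids = model.generate(pixel_values, max_length=64, num_beams=4)

plot = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0]
plot = re.split(r"[.]{2,}", plot.strip())[0]  # drop trailing text after 2+ full stops
print(plot)
```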
examples/tt0068646-the-godfather.jpg ADDED
examples/tt0076759-star-wars.jpg ADDED
examples/tt0108778-friends.jpg ADDED
examples/tt0109830-forrest-gump.jpg ADDED
examples/tt0434409-v-for-vendetta.jpg ADDED
examples/tt10062292-never-have-i-ever.jpg ADDED
examples/tt10919420-squid-games.jpg ADDED
examples/tt3521164-moana.jpg ADDED
examples/tt6468322-money-heist.jpg ADDED
examples/tt7991608-red-notice.jpg ADDED
examples/tt8366590-baaghi3.jpg ADDED
flagged/image/0.jpg ADDED
flagged/image/1.jpg ADDED
flagged/log.csv ADDED
@@ -0,0 +1,3 @@
+ 'image','output','flag','username','timestamp'
+ 'image/0.jpg','A young woman is forced to deal with her past when she is accused of murder. She tries to find out what happened to her husband, who is also accused of the crime. Will she be able to solve the case or will she be the one to save her husband''s life? Based on the true story of','','','2022-06-23 18:30:55.658016'
+ 'image/1.jpg','A young woman is forced to deal with her past when she is accused of murder. She tries to find out what happened to her husband, who is also accused of the crime. Will she be able to solve the case or will she be the one to save her husband''s life? Based on the true story of','','','2022-06-23 18:30:57.352462'
requirements.txt ADDED
@@ -0,0 +1,4 @@
+ --find-links https://download.pytorch.org/whl/torch_stable.html
+ gradio==2.9.0
+ transformers==4.12.5
+ torch==1.10.0+cpu
test.py ADDED
@@ -0,0 +1,94 @@
+ import os
+ import torch
+ import re
+ import gradio as gr
+ from pathlib import Path
+ from transformers import AutoTokenizer, AutoFeatureExtractor, VisionEncoderDecoderModel
+
+
+ # Pattern to ignore all the text after 2 or more full stops
+ regex_pattern = "[.]{2,}"
+
+
+ def post_process(text):
+     try:
+         text = text.strip()
+         text = re.split(regex_pattern, text)[0]
+     except Exception as e:
+         print(e)
+         pass
+     return text
+
+
+ def set_example_image(example: list) -> dict:
+     return gr.Image.update(value=example[0])
+
+
+ def predict(image, max_length=64, num_beams=4):
+     pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values
+     pixel_values = pixel_values.to(device)
+
+     with torch.no_grad():
+         output_ids = model.generate(
+             pixel_values,
+             max_length=max_length,
+             num_beams=num_beams,
+             return_dict_in_generate=True,
+         ).sequences
+
+     preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
+     pred = post_process(preds[0])
+
+     return pred
+
+
+ model_name_or_path = "deepklarity/poster2plot"
+ device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+
+ # Load model.
+
+ model = VisionEncoderDecoderModel.from_pretrained(model_name_or_path)
+ model.to(device)
+ print("Loaded model")
+
+ feature_extractor = AutoFeatureExtractor.from_pretrained(model.encoder.name_or_path)
+ print("Loaded feature_extractor")
+
+ tokenizer = AutoTokenizer.from_pretrained(model.decoder.name_or_path, use_fast=True)
+ if model.decoder.name_or_path == "gpt2":
+     tokenizer.pad_token = tokenizer.eos_token
+
+ print("Loaded tokenizer")
+
+ title = "Poster2Plot: Upload a movie/TV show poster to generate a plot"
+ description = ""
+
+ input = gr.inputs.Image(type="pil")
+
+ example_images = sorted(
+     [f.as_posix() for f in Path("examples").glob("*.jpg")]
+ )
+ print(f"Loaded {len(example_images)} example images")
+
+ demo = gr.Blocks()
+ filenames = next(os.walk('examples'), (None, None, []))[2]
+ examples = [[f"examples/{filename}"] for filename in filenames]
+ print(examples)
+
+ with demo:
+     with gr.Column():
+         with gr.Row():
+             with gr.Column():
+                 input_image = gr.Image()
+                 with gr.Row():
+                     clear_button = gr.Button(value="Clear", variant='secondary')
+                     submit_button = gr.Button(value="Submit", variant='primary')
+             with gr.Column():
+                 plot = gr.Textbox()
+         with gr.Row():
+             example_images = gr.Dataset(components=[input_image], samples=examples)
+
+     submit_button.click(fn=predict, inputs=[input_image], outputs=[plot])
+     example_images.click(fn=set_example_image, inputs=[example_images], outputs=example_images.components)
+
+ demo.launch()
train/README.md ADDED
@@ -0,0 +1,9 @@
+ # Train a new model
+
+ - Download and extract the following datasets into a new folder called `datasets`:
+
+   1. [IMDb movies extensive dataset](https://www.kaggle.com/stefanoleone992/imdb-extensive-dataset)
+   2. [48K IMDB Movies With Posters](https://www.kaggle.com/rezaunderfit/48k-imdb-movies-with-posters)
+
+ - Run `create_dataset.ipynb` to create `train.csv` and `valid.csv`
+ - Run `train.ipynb` to train the model
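Of the two downloads, the first supplies `IMDb movies.csv` (title ids and plot descriptions) and the second supplies the `Poster` image folder; `create_dataset.ipynb` below reads both from `datasets/`. A minimal pre-flight sketch, assuming that layout:

```python
# Pre-flight check: verify the extracted Kaggle files sit where
# create_dataset.ipynb expects them (paths taken from the notebook below).
from pathlib import Path

data_dir = Path("datasets")
assert (data_dir / "IMDb movies.csv").is_file(), "missing plots CSV (dataset 1)"
assert (data_dir / "Poster").is_dir(), "missing Poster folder (dataset 2)"
print("dataset layout looks OK")
```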
train/create_dataset.ipynb ADDED
@@ -0,0 +1,326 @@
+ {
+  "cells": [
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "0fbed7bc",
+    "metadata": {
+     "ExecuteTime": {
+      "end_time": "2021-12-09T16:46:29.851016Z",
+      "start_time": "2021-12-09T16:46:29.841794Z"
+     },
+     "pycharm": {
+      "name": "#%%\n"
+     }
+    },
+    "outputs": [],
+    "source": [
+     "%reload_ext autoreload\n",
+     "%autoreload 2"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "99d6f14d",
+    "metadata": {
+     "ExecuteTime": {
+      "end_time": "2021-12-09T16:46:30.336104Z",
+      "start_time": "2021-12-09T16:46:29.852308Z"
+     },
+     "pycharm": {
+      "name": "#%%\n"
+     }
+    },
+    "outputs": [],
+    "source": [
+     "from pathlib import Path\n",
+     "import pandas as pd\n",
+     "import shutil\n",
+     "from sklearn.model_selection import train_test_split"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "c8fcf96c",
+    "metadata": {
+     "ExecuteTime": {
+      "end_time": "2021-12-09T16:46:30.349125Z",
+      "start_time": "2021-12-09T16:46:30.337223Z"
+     },
+     "code_folding": [],
+     "pycharm": {
+      "name": "#%%\n"
+     }
+    },
+    "outputs": [],
+    "source": [
+     "def copy_images(\n",
+     "    src_dir: Path,\n",
+     "    des_dir: Path,\n",
+     "    ids_with_plots: list,\n",
+     "    delete_existing_files: bool = False,\n",
+     "):\n",
+     "    \"\"\"Copy a poster into the images folder if its id is present in the ids_with_plots list.\"\"\"\n",
+     "\n",
+     "    images_list = []\n",
+     "    if delete_existing_files:\n",
+     "        shutil.rmtree(des_dir)\n",
+     "\n",
+     "    des_dir.mkdir(parents=True, exist_ok=True)\n",
+     "\n",
+     "    for f in src_dir.rglob(\"*\"):\n",
+     "        try:\n",
+     "            if f.is_file() and f.suffix in [\".jpg\", \".jpeg\", \".png\"]:\n",
+     "                img_name = f.name\n",
+     "                id = Path(img_name).stem\n",
+     "                if id in ids_with_plots:\n",
+     "                    desc_file = des_dir / img_name\n",
+     "                    shutil.copy(f, desc_file)\n",
+     "                    images_list.append((id, img_name))\n",
+     "        except Exception as e:\n",
+     "            print(f, e)\n",
+     "    return images_list"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "a34124b2",
+    "metadata": {
+     "ExecuteTime": {
+      "end_time": "2021-12-09T16:46:30.359361Z",
+      "start_time": "2021-12-09T16:46:30.350299Z"
+     },
+     "pycharm": {
+      "name": "#%%\n"
+     }
+    },
+    "outputs": [],
+    "source": [
+     "data_dir = Path(\"datasets\").resolve()\n",
+     "images_dir = data_dir / \"images\""
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "8714ea01",
+    "metadata": {
+     "ExecuteTime": {
+      "end_time": "2021-12-09T16:46:30.781046Z",
+      "start_time": "2021-12-09T16:46:30.360608Z"
+     },
+     "pycharm": {
+      "name": "#%%\n"
+     }
+    },
+    "outputs": [],
+    "source": [
+     "movies_df = pd.read_csv(\n",
+     "    data_dir / \"IMDb movies.csv\", usecols=[\"imdb_title_id\", \"description\"]\n",
+     ")\n",
+     "movies_df = movies_df.rename(columns={\"imdb_title_id\": \"id\", \"description\": \"text\"})\n",
+     "movies_df.dropna(subset=[\"text\"], inplace=True)  # Drop rows where text is empty\n",
+     "movies_df.head()\n"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "27f7fd94",
+    "metadata": {
+     "ExecuteTime": {
+      "end_time": "2021-12-09T16:46:30.792761Z",
+      "start_time": "2021-12-09T16:46:30.781964Z"
+     },
+     "pycharm": {
+      "name": "#%%\n"
+     }
+    },
+    "outputs": [],
+    "source": [
+     "ids_with_plots = movies_df.id.tolist()"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "ebaa042a",
+    "metadata": {
+     "ExecuteTime": {
+      "end_time": "2021-12-09T16:47:04.704390Z",
+      "start_time": "2021-12-09T16:46:30.794094Z"
+     },
+     "pycharm": {
+      "name": "#%%\n"
+     }
+    },
+    "outputs": [],
+    "source": [
+     "images_list = copy_images(data_dir / \"Poster\", images_dir, ids_with_plots)\n",
+     "images_list[0]"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "17e0a874",
+    "metadata": {
+     "ExecuteTime": {
+      "end_time": "2021-12-09T16:47:04.724427Z",
+      "start_time": "2021-12-09T16:47:04.705540Z"
+     },
+     "pycharm": {
+      "name": "#%%\n"
+     }
+    },
+    "outputs": [],
+    "source": [
+     "images_df = pd.DataFrame(images_list, columns=[\"id\", \"filename\"])\n",
+     "images_df.head()"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "bb1114e6",
+    "metadata": {
+     "ExecuteTime": {
+      "end_time": "2021-12-09T16:47:04.772775Z",
+      "start_time": "2021-12-09T16:47:04.725707Z"
+     },
+     "pycharm": {
+      "name": "#%%\n"
+     }
+    },
+    "outputs": [],
+    "source": [
+     "data_df = pd.merge(movies_df, images_df, on=[\"id\"])\n",
+     "print(len(data_df))\n",
+     "data_df"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "6790815b",
+    "metadata": {
+     "ExecuteTime": {
+      "end_time": "2021-12-09T16:47:04.796785Z",
+      "start_time": "2021-12-09T16:47:04.774932Z"
+     },
+     "pycharm": {
+      "name": "#%%\n"
+     }
+    },
+    "outputs": [],
+    "source": [
+     "print(len(data_df))\n",
+     "data_df.dropna(subset=[\"filename\"], inplace=True)\n",
+     "print(len(data_df))"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "40c7205d",
+    "metadata": {
+     "ExecuteTime": {
+      "end_time": "2021-12-09T16:47:04.818522Z",
+      "start_time": "2021-12-09T16:47:04.798063Z"
+     },
+     "pycharm": {
+      "name": "#%%\n"
+     }
+    },
+    "outputs": [],
+    "source": [
+     "print(len(data_df))\n",
+     "data_df.dropna(subset=[\"text\"], inplace=True)\n",
+     "print(len(data_df))"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "9a2d142f",
+    "metadata": {
+     "ExecuteTime": {
+      "end_time": "2021-12-09T16:47:04.838450Z",
+      "start_time": "2021-12-09T16:47:04.819726Z"
+     },
+     "pycharm": {
+      "name": "#%%\n"
+     }
+    },
+    "outputs": [],
+    "source": [
+     "print(len(data_df))\n",
+     "data_df.drop_duplicates(subset=[\"id\"], inplace=True)\n",
+     "print(len(data_df))"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "45f4b970",
+    "metadata": {
+     "ExecuteTime": {
+      "end_time": "2021-12-09T16:47:04.971652Z",
+      "start_time": "2021-12-09T16:47:04.839618Z"
+     },
+     "pycharm": {
+      "name": "#%%\n"
+     }
+    },
+    "outputs": [],
+    "source": [
+     "data_df.to_csv(data_dir / \"data.csv\", index=False)"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "f8019a02",
+    "metadata": {
+     "ExecuteTime": {
+      "end_time": "2021-12-09T16:47:05.104710Z",
+      "start_time": "2021-12-09T16:47:04.972681Z"
+     },
+     "pycharm": {
+      "name": "#%%\n"
+     }
+    },
+    "outputs": [],
+    "source": [
+     "train_df, valid_df = train_test_split(data_df, test_size=0.1, shuffle=True)\n",
+     "train_df.to_csv(data_dir / \"train.csv\", index=False)\n",
+     "valid_df.to_csv(data_dir / \"valid.csv\", index=False)\n",
+     "print(len(train_df), len(valid_df))"
+    ]
+   }
+  ],
+  "metadata": {
+   "kernelspec": {
+    "display_name": "huggingface",
+    "language": "python",
+    "name": "huggingface"
+   },
+   "language_info": {
+    "codemirror_mode": {
+     "name": "ipython",
+     "version": 3
+    },
+    "file_extension": ".py",
+    "mimetype": "text/x-python",
+    "name": "python",
+    "nbconvert_exporter": "python",
+    "pygments_lexer": "ipython3",
+    "version": "3.9.7"
+   }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 5
+ }
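The notebook leaves `data.csv`, `train.csv`, and `valid.csv` in `datasets/`, each with `id`, `text` (plot), and `filename` (poster image) columns. A quick sketch for eyeballing one (poster, plot) pair before training:

```python
# Peek at one (poster, plot) pair from the generated CSVs, a sanity-check sketch.
from pathlib import Path

import pandas as pd
from PIL import Image

data_dir = Path("datasets")
row = pd.read_csv(data_dir / "train.csv").iloc[0]
print(row["id"], "->", row["text"])
Image.open(data_dir / "images" / row["filename"]).convert("RGB").show()
```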
train/requirements.txt ADDED
@@ -0,0 +1,7 @@
+ --find-links https://download.pytorch.org/whl/torch_stable.html
+ pandas==1.3.4
+ scikit-learn==1.0.1
+ python-box==5.4.1
+ transformers==4.12.5
+ torch==1.10.0+cu113
+ Pillow==8.4.0
train/train.ipynb ADDED
@@ -0,0 +1,474 @@
+ {
+  "cells": [
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "0fbed7bc",
+    "metadata": {
+     "ExecuteTime": {
+      "end_time": "2021-12-09T15:34:14.921553Z",
+      "start_time": "2021-12-09T15:34:14.911112Z"
+     }
+    },
+    "outputs": [],
+    "source": [
+     "%reload_ext autoreload\n",
+     "%autoreload 2"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "c4b60ef3",
+    "metadata": {
+     "ExecuteTime": {
+      "end_time": "2021-12-09T15:34:15.961098Z",
+      "start_time": "2021-12-09T15:34:14.922771Z"
+     },
+     "code_folding": []
+    },
+    "outputs": [],
+    "source": [
+     "# imports\n",
+     "\n",
+     "from pathlib import Path\n",
+     "\n",
+     "import pandas as pd\n",
+     "import torch\n",
+     "from box import Box\n",
+     "from PIL import Image\n",
+     "from torch.utils.data import Dataset\n",
+     "from transformers import (\n",
+     "    Seq2SeqTrainer,\n",
+     "    Seq2SeqTrainingArguments,\n",
+     "    get_linear_schedule_with_warmup,\n",
+     "    AutoFeatureExtractor,\n",
+     "    AutoTokenizer,\n",
+     "    ViTFeatureExtractor,\n",
+     "    VisionEncoderDecoderModel,\n",
+     "    default_data_collator,\n",
+     ")\n",
+     "from transformers.optimization import AdamW\n"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "99d6f14d",
+    "metadata": {
+     "ExecuteTime": {
+      "end_time": "2021-12-09T15:34:15.979191Z",
+      "start_time": "2021-12-09T15:34:15.962078Z"
+     },
+     "code_folding": []
+    },
+    "outputs": [],
+    "source": [
+     "# custom functions\n",
+     "\n",
+     "class ImageCaptionDataset(Dataset):\n",
+     "    def __init__(\n",
+     "        self, df, feature_extractor, tokenizer, images_dir, max_target_length=128\n",
+     "    ):\n",
+     "        self.df = df\n",
+     "        self.feature_extractor = feature_extractor\n",
+     "        self.tokenizer = tokenizer\n",
+     "        self.images_dir = images_dir\n",
+     "        self.max_target_length = max_target_length\n",
+     "\n",
+     "    def __len__(self):\n",
+     "        return len(self.df)\n",
+     "\n",
+     "    def __getitem__(self, idx):\n",
+     "        filename = self.df[\"filename\"][idx]\n",
+     "        text = self.df[\"text\"][idx]\n",
+     "        # prepare image (i.e. resize + normalize)\n",
+     "        image = Image.open(self.images_dir / filename).convert(\"RGB\")\n",
+     "        pixel_values = self.feature_extractor(image, return_tensors=\"pt\").pixel_values\n",
+     "        # add labels (input_ids) by encoding the text\n",
+     "        labels = self.tokenizer(\n",
+     "            text,\n",
+     "            padding=\"max_length\",\n",
+     "            truncation=True,\n",
+     "            max_length=self.max_target_length,\n",
+     "        ).input_ids\n",
+     "        # important: make sure that PAD tokens are ignored by the loss function\n",
+     "        labels = [\n",
+     "            label if label != self.tokenizer.pad_token_id else -100 for label in labels\n",
+     "        ]\n",
+     "\n",
+     "        encoding = {\n",
+     "            \"pixel_values\": pixel_values.squeeze(),\n",
+     "            \"labels\": torch.tensor(labels),\n",
+     "        }\n",
+     "        return encoding\n",
+     "\n",
+     "\n",
+     "def predict(image, max_length=64, num_beams=4):\n",
+     "    pixel_values = feature_extractor(images=image, return_tensors=\"pt\").pixel_values\n",
+     "    pixel_values = pixel_values.to(device)\n",
+     "\n",
+     "    with torch.no_grad():\n",
+     "        output_ids = model.generate(\n",
+     "            pixel_values,\n",
+     "            max_length=max_length,\n",
+     "            num_beams=num_beams,\n",
+     "            return_dict_in_generate=True,\n",
+     "        ).sequences\n",
+     "\n",
+     "    preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)\n",
+     "    preds = [pred.strip() for pred in preds]\n",
+     "\n",
+     "    return preds\n"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "ea66826b",
+    "metadata": {
+     "ExecuteTime": {
+      "end_time": "2021-12-09T15:34:16.042990Z",
+      "start_time": "2021-12-09T15:34:15.980557Z"
+     }
+    },
+    "outputs": [],
+    "source": [
+     "data_dir = Path(\"datasets\").resolve()\n",
+     "device = torch.device(\"cuda:0\" if torch.cuda.is_available() else \"cpu\")\n",
+     "print(device)"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "17cfb2c2",
+    "metadata": {
+     "ExecuteTime": {
+      "end_time": "2021-12-09T15:34:16.058421Z",
+      "start_time": "2021-12-09T15:34:16.044111Z"
+     }
+    },
+    "outputs": [],
+    "source": [
+     "# Arguments pertaining to the data we feed the model for training and eval.\n",
+     "\n",
+     "data_training_args = {\n",
+     "    # The maximum total sequence length for target text after tokenization. Sequences longer than this will be truncated, sequences shorter will be padded.\n",
+     "    \"max_target_length\": 64,\n",
+     "\n",
+     "    # Number of beams to use for evaluation. This argument will be passed to model.generate, which is used during evaluate and predict.\n",
+     "    \"num_beams\": 4,\n",
+     "\n",
+     "    # Folder with all the images\n",
+     "    \"images_dir\": data_dir / \"images\",\n",
+     "}\n",
+     "\n",
+     "data_training_args = Box(data_training_args)"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "adc4839a",
+    "metadata": {
+     "ExecuteTime": {
+      "end_time": "2021-12-09T15:34:16.073242Z",
+      "start_time": "2021-12-09T15:34:16.059354Z"
+     }
+    },
+    "outputs": [],
+    "source": [
+     "# Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.\n",
+     "\n",
+     "model_args = {\n",
+     "    # Path to pretrained model or model identifier from huggingface.co/models\n",
+     "    \"encoder_model_name_or_path\": \"google/vit-base-patch16-224-in21k\",\n",
+     "\n",
+     "    # Path to pretrained model or model identifier from huggingface.co/models\n",
+     "    \"decoder_model_name_or_path\": \"gpt2\",\n",
+     "\n",
+     "    # If set to int > 0, all ngrams of that size can only occur once.\n",
+     "    \"no_repeat_ngram_size\": 3,\n",
+     "\n",
+     "    # Exponential penalty to the length that will be used by default in the generate method of the model.\n",
+     "    \"length_penalty\": 2.0,\n",
+     "}\n",
+     "\n",
+     "model_args = Box(model_args)"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "22b8c9e3",
+    "metadata": {
+     "ExecuteTime": {
+      "end_time": "2021-12-09T15:34:16.089201Z",
+      "start_time": "2021-12-09T15:34:16.074223Z"
+     }
+    },
+    "outputs": [],
+    "source": [
+     "# Arguments for the Trainer class. Refer: https://huggingface.co/docs/transformers/main_classes/trainer#transformers.Seq2SeqTrainingArguments\n",
+     "\n",
+     "training_args = {\n",
+     "    \"num_train_epochs\": 5,\n",
+     "    \"per_device_train_batch_size\": 32,\n",
+     "    \"per_device_eval_batch_size\": 32,\n",
+     "    \"output_dir\": \"output_dir\",\n",
+     "    \"do_train\": True,\n",
+     "    \"do_eval\": True,\n",
+     "    \"fp16\": True,\n",
+     "    \"learning_rate\": 1e-5,\n",
+     "    \"load_best_model_at_end\": True,\n",
+     "    \"evaluation_strategy\": \"epoch\",\n",
+     "    \"save_strategy\": \"epoch\",\n",
+     "    \"report_to\": \"none\"\n",
+     "}\n",
+     "\n",
+     "seq2seq_training_args = Seq2SeqTrainingArguments(**training_args)"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "d0023eac",
+    "metadata": {
+     "ExecuteTime": {
+      "end_time": "2021-12-09T15:34:37.844396Z",
+      "start_time": "2021-12-09T15:34:16.090085Z"
+     }
+    },
+    "outputs": [],
+    "source": [
+     "feature_extractor = ViTFeatureExtractor.from_pretrained(\n",
+     "    model_args.encoder_model_name_or_path\n",
+     ")\n",
+     "tokenizer = AutoTokenizer.from_pretrained(\n",
+     "    model_args.decoder_model_name_or_path, use_fast=True\n",
+     ")\n",
+     "tokenizer.pad_token = tokenizer.eos_token\n",
+     "\n",
+     "model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained(\n",
+     "    model_args.encoder_model_name_or_path, model_args.decoder_model_name_or_path\n",
+     ")\n",
+     "\n",
+     "# set special tokens used for creating the decoder_input_ids from the labels\n",
+     "model.config.decoder_start_token_id = tokenizer.bos_token_id\n",
+     "model.config.pad_token_id = tokenizer.pad_token_id\n",
+     "# make sure vocab size is set correctly\n",
+     "model.config.vocab_size = model.config.decoder.vocab_size\n",
+     "\n",
+     "# set beam search parameters\n",
+     "model.config.eos_token_id = tokenizer.sep_token_id\n",
+     "model.config.max_length = data_training_args.max_target_length\n",
+     "model.config.no_repeat_ngram_size = model_args.no_repeat_ngram_size\n",
+     "model.config.length_penalty = model_args.length_penalty\n",
+     "model.config.num_beams = data_training_args.num_beams\n",
+     "model.decoder.resize_token_embeddings(len(tokenizer))\n"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "6428ea08",
+    "metadata": {
+     "ExecuteTime": {
+      "end_time": "2021-12-09T15:34:37.933804Z",
+      "start_time": "2021-12-09T15:34:37.845607Z"
+     }
+    },
+    "outputs": [],
+    "source": [
+     "train_df = pd.read_csv(data_dir / \"train.csv\")\n",
+     "valid_df = pd.read_csv(data_dir / \"valid.csv\")\n",
+     "\n",
+     "train_dataset = ImageCaptionDataset(\n",
+     "    df=train_df,\n",
+     "    feature_extractor=feature_extractor,\n",
+     "    tokenizer=tokenizer,\n",
+     "    images_dir=data_training_args.images_dir,\n",
+     "    max_target_length=data_training_args.max_target_length,\n",
+     ")\n",
+     "eval_dataset = ImageCaptionDataset(\n",
+     "    df=valid_df,\n",
+     "    feature_extractor=feature_extractor,\n",
+     "    tokenizer=tokenizer,\n",
+     "    images_dir=data_training_args.images_dir,\n",
+     "    max_target_length=data_training_args.max_target_length,\n",
+     ")\n",
+     "\n",
+     "print(f\"Number of training examples: {len(train_dataset)}\")\n",
+     "print(f\"Number of validation examples: {len(eval_dataset)}\")"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "c8e492a1",
+    "metadata": {
+     "ExecuteTime": {
+      "end_time": "2021-12-09T15:34:37.971630Z",
+      "start_time": "2021-12-09T15:34:37.935339Z"
+     }
+    },
+    "outputs": [],
+    "source": [
+     "# Let's verify an example from the training dataset:\n",
+     "\n",
+     "encoding = train_dataset[0]\n",
+     "for k, v in encoding.items():\n",
+     "    print(k, v.shape)"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "edb4e7a6",
+    "metadata": {
+     "ExecuteTime": {
+      "end_time": "2021-12-09T15:34:38.006980Z",
+      "start_time": "2021-12-09T15:34:37.972483Z"
+     }
+    },
+    "outputs": [],
+    "source": [
+     "# We can also check the original image and decode the labels:\n",
+     "image = Image.open(data_training_args.images_dir / train_df[\"filename\"][0]).convert(\"RGB\")\n",
+     "image"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "25f2cae7",
+    "metadata": {
+     "ExecuteTime": {
+      "end_time": "2021-12-09T15:34:38.031745Z",
+      "start_time": "2021-12-09T15:34:38.008027Z"
+     }
+    },
+    "outputs": [],
+    "source": [
+     "labels = encoding[\"labels\"]\n",
+     "labels[labels == -100] = tokenizer.pad_token_id\n",
+     "label_str = tokenizer.decode(labels, skip_special_tokens=True)\n",
+     "print(label_str)\n"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "b7a009d3",
+    "metadata": {
+     "ExecuteTime": {
+      "end_time": "2021-12-09T15:34:38.049539Z",
+      "start_time": "2021-12-09T15:34:38.032749Z"
+     }
+    },
+    "outputs": [],
+    "source": [
+     "optimizer = AdamW(model.parameters(), lr=seq2seq_training_args.learning_rate)\n",
+     "\n",
+     "steps_per_epoch = len(train_dataset) // seq2seq_training_args.per_device_train_batch_size\n",
+     "num_training_steps = steps_per_epoch * seq2seq_training_args.num_train_epochs\n",
+     "\n",
+     "lr_scheduler = get_linear_schedule_with_warmup(\n",
+     "    optimizer,\n",
+     "    num_warmup_steps=seq2seq_training_args.warmup_steps,\n",
+     "    num_training_steps=num_training_steps,\n",
+     ")\n",
+     "\n",
+     "optimizers = (optimizer, lr_scheduler)"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "f2f477b2",
+    "metadata": {
+     "ExecuteTime": {
+      "start_time": "2021-12-09T15:34:14.944Z"
+     }
+    },
+    "outputs": [],
+    "source": [
+     "trainer = Seq2SeqTrainer(\n",
+     "    model=model,\n",
+     "    optimizers=optimizers,\n",
+     "    tokenizer=feature_extractor,\n",
+     "    args=seq2seq_training_args,\n",
+     "    train_dataset=train_dataset,\n",
+     "    eval_dataset=eval_dataset,\n",
+     "    data_collator=default_data_collator,\n",
+     ")\n",
+     "\n",
+     "trainer.train()"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "f08d2b7c",
+    "metadata": {
+     "ExecuteTime": {
+      "end_time": "2021-12-09T16:24:49.096274Z",
+      "start_time": "2021-12-09T16:24:49.096246Z"
+     }
+    },
+    "outputs": [],
+    "source": [
+     "test_img = \"../examples/tt7991608-red-notice.jpg\"\n",
+     "with Image.open(test_img) as image:\n",
+     "    preds = predict(\n",
+     "        image, max_length=data_training_args.max_target_length, num_beams=data_training_args.num_beams\n",
+     "    )\n",
+     "\n",
+     "# Uncomment to display the test image in a jupyter notebook\n",
+     "# display(image)\n",
+     "print(preds[0])"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "ecf21225",
+    "metadata": {},
+    "outputs": [],
+    "source": []
+   }
+  ],
+  "metadata": {
+   "kernelspec": {
+    "display_name": "huggingface",
+    "language": "python",
+    "name": "huggingface"
+   },
+   "language_info": {
+    "codemirror_mode": {
+     "name": "ipython",
+     "version": 3
+    },
+    "file_extension": ".py",
+    "mimetype": "text/x-python",
+    "name": "python",
+    "nbconvert_exporter": "python",
+    "pygments_lexer": "ipython3",
+    "version": "3.9.7"
+   }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 5
+ }
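`trainer.train()` only writes per-epoch checkpoints under `output_dir`; the notebook never explicitly exports the final model in the layout that `app.py` loads from the Hub. A hedged sketch of that missing persistence step (the `trained_model` directory name is illustrative, not from the repo):

```python
# Export the fine-tuned model so it can be loaded the way app.py loads
# "deepklarity/poster2plot". Assumes this runs after trainer.train() in
# train.ipynb; the "trained_model" path is illustrative.
out_dir = "trained_model"
trainer.save_model(out_dir)             # model weights + config
feature_extractor.save_pretrained(out_dir)
tokenizer.save_pretrained(out_dir)

# Reload locally instead of from the Hub id:
from transformers import VisionEncoderDecoderModel
reloaded = VisionEncoderDecoderModel.from_pretrained(out_dir)
```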