added in_memory loading to reduce disk reads and increase speed

Files changed (2) hide show

notebooks/1.0-hfk-datamodules-exploration.ipynb +70 -356
src/datamodules/focus_datamodule.py +28 -9

notebooks/1.0-hfk-datamodules-exploration.ipynb CHANGED Viewed

@@ -9,7 +9,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -18,7 +18,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -27,235 +27,18 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>Unnamed: 0</th>\n",
-       "      <th>image_path</th>\n",
-       "      <th>original_filename</th>\n",
-       "      <th>study_id</th>\n",
-       "      <th>scan_uuid</th>\n",
-       "      <th>focus_value</th>\n",
-       "      <th>stack_id</th>\n",
-       "      <th>obj_name</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>0</td>\n",
-       "      <td>31/fba56d57-656e-4b6f-ba63-e4ba3ad083f5/I01631...</td>\n",
-       "      <td>I01631_X013_Y012_Z5107.jpg</td>\n",
-       "      <td>31</td>\n",
-       "      <td>fba56d57-656e-4b6f-ba63-e4ba3ad083f5</td>\n",
-       "      <td>-2.82953</td>\n",
-       "      <td>1658220</td>\n",
-       "      <td>133</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>1</td>\n",
-       "      <td>31/fba56d57-656e-4b6f-ba63-e4ba3ad083f5/I01632...</td>\n",
-       "      <td>I01632_X013_Y012_Z5175.jpg</td>\n",
-       "      <td>31</td>\n",
-       "      <td>fba56d57-656e-4b6f-ba63-e4ba3ad083f5</td>\n",
-       "      <td>-2.70408</td>\n",
-       "      <td>1658220</td>\n",
-       "      <td>133</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>2</td>\n",
-       "      <td>31/fba56d57-656e-4b6f-ba63-e4ba3ad083f5/I01633...</td>\n",
-       "      <td>I01633_X013_Y012_Z5722.jpg</td>\n",
-       "      <td>31</td>\n",
-       "      <td>fba56d57-656e-4b6f-ba63-e4ba3ad083f5</td>\n",
-       "      <td>-2.69918</td>\n",
-       "      <td>1658220</td>\n",
-       "      <td>133</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>3</td>\n",
-       "      <td>31/fba56d57-656e-4b6f-ba63-e4ba3ad083f5/I01634...</td>\n",
-       "      <td>I01634_X013_Y012_Z5244.jpg</td>\n",
-       "      <td>31</td>\n",
-       "      <td>fba56d57-656e-4b6f-ba63-e4ba3ad083f5</td>\n",
-       "      <td>-2.50266</td>\n",
-       "      <td>1658220</td>\n",
-       "      <td>133</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>4</td>\n",
-       "      <td>31/fba56d57-656e-4b6f-ba63-e4ba3ad083f5/I01635...</td>\n",
-       "      <td>I01635_X013_Y012_Z5654.jpg</td>\n",
-       "      <td>31</td>\n",
-       "      <td>fba56d57-656e-4b6f-ba63-e4ba3ad083f5</td>\n",
-       "      <td>-2.36450</td>\n",
-       "      <td>1658220</td>\n",
-       "      <td>133</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>...</th>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>565</th>\n",
-       "      <td>565</td>\n",
-       "      <td>31/4c7e9e66-61a1-47ca-aa4e-340b0eef8db1/I01406...</td>\n",
-       "      <td>I01406_X016_Y009_Z5361.jpg</td>\n",
-       "      <td>31</td>\n",
-       "      <td>4c7e9e66-61a1-47ca-aa4e-340b0eef8db1</td>\n",
-       "      <td>-3.41147</td>\n",
-       "      <td>1674918</td>\n",
-       "      <td>217</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>566</th>\n",
-       "      <td>566</td>\n",
-       "      <td>31/4c7e9e66-61a1-47ca-aa4e-340b0eef8db1/I01407...</td>\n",
-       "      <td>I01407_X016_Y009_Z5087.jpg</td>\n",
-       "      <td>31</td>\n",
-       "      <td>4c7e9e66-61a1-47ca-aa4e-340b0eef8db1</td>\n",
-       "      <td>-3.05424</td>\n",
-       "      <td>1674918</td>\n",
-       "      <td>217</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>567</th>\n",
-       "      <td>567</td>\n",
-       "      <td>31/4c7e9e66-61a1-47ca-aa4e-340b0eef8db1/I01408...</td>\n",
-       "      <td>I01408_X016_Y009_Z5292.jpg</td>\n",
-       "      <td>31</td>\n",
-       "      <td>4c7e9e66-61a1-47ca-aa4e-340b0eef8db1</td>\n",
-       "      <td>-1.48608</td>\n",
-       "      <td>1674918</td>\n",
-       "      <td>217</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>568</th>\n",
-       "      <td>568</td>\n",
-       "      <td>31/4c7e9e66-61a1-47ca-aa4e-340b0eef8db1/I01409...</td>\n",
-       "      <td>I01409_X016_Y009_Z5156.jpg</td>\n",
-       "      <td>31</td>\n",
-       "      <td>4c7e9e66-61a1-47ca-aa4e-340b0eef8db1</td>\n",
-       "      <td>-0.52804</td>\n",
-       "      <td>1674918</td>\n",
-       "      <td>217</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>569</th>\n",
-       "      <td>569</td>\n",
-       "      <td>31/4c7e9e66-61a1-47ca-aa4e-340b0eef8db1/I01410...</td>\n",
-       "      <td>I01410_X016_Y009_Z5224.jpg</td>\n",
-       "      <td>31</td>\n",
-       "      <td>4c7e9e66-61a1-47ca-aa4e-340b0eef8db1</td>\n",
-       "      <td>0.00000</td>\n",
-       "      <td>1674918</td>\n",
-       "      <td>217</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "<p>570 rows × 8 columns</p>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "     Unnamed: 0                                         image_path  \\\n",
-       "0             0  31/fba56d57-656e-4b6f-ba63-e4ba3ad083f5/I01631...   \n",
-       "1             1  31/fba56d57-656e-4b6f-ba63-e4ba3ad083f5/I01632...   \n",
-       "2             2  31/fba56d57-656e-4b6f-ba63-e4ba3ad083f5/I01633...   \n",
-       "3             3  31/fba56d57-656e-4b6f-ba63-e4ba3ad083f5/I01634...   \n",
-       "4             4  31/fba56d57-656e-4b6f-ba63-e4ba3ad083f5/I01635...   \n",
-       "..          ...                                                ...   \n",
-       "565         565  31/4c7e9e66-61a1-47ca-aa4e-340b0eef8db1/I01406...   \n",
-       "566         566  31/4c7e9e66-61a1-47ca-aa4e-340b0eef8db1/I01407...   \n",
-       "567         567  31/4c7e9e66-61a1-47ca-aa4e-340b0eef8db1/I01408...   \n",
-       "568         568  31/4c7e9e66-61a1-47ca-aa4e-340b0eef8db1/I01409...   \n",
-       "569         569  31/4c7e9e66-61a1-47ca-aa4e-340b0eef8db1/I01410...   \n",
-       "\n",
-       "              original_filename  study_id  \\\n",
-       "0    I01631_X013_Y012_Z5107.jpg        31   \n",
-       "1    I01632_X013_Y012_Z5175.jpg        31   \n",
-       "2    I01633_X013_Y012_Z5722.jpg        31   \n",
-       "3    I01634_X013_Y012_Z5244.jpg        31   \n",
-       "4    I01635_X013_Y012_Z5654.jpg        31   \n",
-       "..                          ...       ...   \n",
-       "565  I01406_X016_Y009_Z5361.jpg        31   \n",
-       "566  I01407_X016_Y009_Z5087.jpg        31   \n",
-       "567  I01408_X016_Y009_Z5292.jpg        31   \n",
-       "568  I01409_X016_Y009_Z5156.jpg        31   \n",
-       "569  I01410_X016_Y009_Z5224.jpg        31   \n",
-       "\n",
-       "                                scan_uuid  focus_value  stack_id  obj_name  \n",
-       "0    fba56d57-656e-4b6f-ba63-e4ba3ad083f5     -2.82953   1658220       133  \n",
-       "1    fba56d57-656e-4b6f-ba63-e4ba3ad083f5     -2.70408   1658220       133  \n",
-       "2    fba56d57-656e-4b6f-ba63-e4ba3ad083f5     -2.69918   1658220       133  \n",
-       "3    fba56d57-656e-4b6f-ba63-e4ba3ad083f5     -2.50266   1658220       133  \n",
-       "4    fba56d57-656e-4b6f-ba63-e4ba3ad083f5     -2.36450   1658220       133  \n",
-       "..                                    ...          ...       ...       ...  \n",
-       "565  4c7e9e66-61a1-47ca-aa4e-340b0eef8db1     -3.41147   1674918       217  \n",
-       "566  4c7e9e66-61a1-47ca-aa4e-340b0eef8db1     -3.05424   1674918       217  \n",
-       "567  4c7e9e66-61a1-47ca-aa4e-340b0eef8db1     -1.48608   1674918       217  \n",
-       "568  4c7e9e66-61a1-47ca-aa4e-340b0eef8db1     -0.52804   1674918       217  \n",
-       "569  4c7e9e66-61a1-47ca-aa4e-340b0eef8db1      0.00000   1674918       217  \n",
-       "\n",
-       "[570 rows x 8 columns]"
-      ]
-     },
-     "execution_count": 3,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
    "source": [
     "metadata"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "'31/fba56d57-656e-4b6f-ba63-e4ba3ad083f5/I01631_X013_Y012_Z5107_600_375.jpg'"
-      ]
-     },
-     "execution_count": 4,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
    "source": [
     "idx = 0\n",
     "# File Path\n",
@@ -264,20 +47,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "-2.82953"
-      ]
-     },
-     "execution_count": 5,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
    "source": [
     "# Focus Value\n",
     "metadata.iloc[idx, 5]"
@@ -292,76 +64,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "570\n"
-     ]
-    },
-    {
-     "data": {
-      "text/plain": [
-       "{'image': array([[[172, 173, 159],\n",
-       "         [166, 167, 153],\n",
-       "         [171, 173, 160],\n",
-       "         ...,\n",
-       "         [199, 202, 173],\n",
-       "         [199, 202, 173],\n",
-       "         [200, 201, 170]],\n",
-       " \n",
-       "        [[167, 169, 155],\n",
-       "         [164, 166, 152],\n",
-       "         [171, 175, 160],\n",
-       "         ...,\n",
-       "         [194, 197, 168],\n",
-       "         [195, 198, 169],\n",
-       "         [199, 200, 169]],\n",
-       " \n",
-       "        [[146, 153, 135],\n",
-       "         [149, 156, 138],\n",
-       "         [163, 172, 153],\n",
-       "         ...,\n",
-       "         [189, 192, 163],\n",
-       "         [191, 194, 165],\n",
-       "         [197, 198, 167]],\n",
-       " \n",
-       "        ...,\n",
-       " \n",
-       "        [[ 57,  62,  68],\n",
-       "         [ 41,  46,  52],\n",
-       "         [ 24,  31,  39],\n",
-       "         ...,\n",
-       "         [198, 189, 180],\n",
-       "         [188, 179, 170],\n",
-       "         [180, 171, 164]],\n",
-       " \n",
-       "        [[ 46,  51,  57],\n",
-       "         [ 34,  39,  45],\n",
-       "         [ 21,  28,  36],\n",
-       "         ...,\n",
-       "         [208, 200, 189],\n",
-       "         [197, 190, 180],\n",
-       "         [188, 181, 173]],\n",
-       " \n",
-       "        [[ 31,  39,  42],\n",
-       "         [ 23,  31,  34],\n",
-       "         [ 18,  25,  31],\n",
-       "         ...,\n",
-       "         [215, 209, 197],\n",
-       "         [205, 199, 187],\n",
-       "         [197, 190, 180]]], dtype=uint8),\n",
-       " 'focus_value': 0.0}"
-      ]
-     },
-     "execution_count": 6,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
    "source": [
     "from importlib.machinery import SourceFileLoader\n",
     "\n",
@@ -370,18 +75,15 @@
     "\n",
     "ds = FocusDataSet(\"../data/focus/metadata.csv\", \"../data/focus/\")\n",
     "\n",
-    "counter = 0\n",
     "for d in ds:\n",
-    "    counter += 1\n",
-    "\n",
-    "print(counter)\n",
     "\n",
     "d"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -393,20 +95,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "64"
-      ]
-     },
-     "execution_count": 8,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
    "source": [
     "for data in datamodule.test_dataloader():\n",
     "    break\n",
@@ -416,40 +107,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/home/hku/.local/lib/python3.8/site-packages/torch/nn/modules/loss.py:96: UserWarning: Using a target size (torch.Size([64])) that is different to the input size (torch.Size([64, 1])). This will likely lead to incorrect results due to broadcasting. Please ensure they have the same size.\n",
-      "  return F.l1_loss(input, target, reduction=self.reduction)\n"
-     ]
-    },
-    {
-     "data": {
-      "text/plain": [
-       "(tensor(2.5787, grad_fn=<L1LossBackward0>),\n",
-       " tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
-       "         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
-       "         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),\n",
-       " tensor([-1.2805, -0.0943, -2.3645,  0.8542, -0.8047, -6.0020,  0.0000, -4.3352,\n",
-       "         -1.8066, -2.7189, -6.4697, -3.2557, -4.2778, -5.0264, -3.4891,  0.0000,\n",
-       "         -1.7181, -2.7314,  0.3324, -0.0943, -0.8991,  0.0000, -4.4178,  1.9723,\n",
-       "         -3.0026, -5.5685,  3.8374,  3.8625, -0.4125, -4.1936, -1.5781, -1.6393,\n",
-       "         -2.9583, -5.4933, -1.7807, -3.3135, -5.3423, -0.7978, -5.3971, -4.9412,\n",
-       "          0.0000, -4.4128, -5.7744, -5.2755, -1.0996, -5.7482,  0.0000, -0.1737,\n",
-       "         -3.5851, -6.1429, -6.3642, -3.9653, -0.2081, -0.9539, -0.4159, -0.5388,\n",
-       "         -1.3643, -4.4441, -1.5161,  0.6395, -5.4710, -2.6482,  0.0000, -2.6257],\n",
-       "        dtype=torch.float64))"
-      ]
-     },
-     "execution_count": 9,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
    "source": [
     "import types\n",
     "import importlib.machinery\n",
@@ -460,6 +120,60 @@
     "\n",
     "model.step(data)"
    ]
   }
  ],
  "metadata": {

   },
   {
    "cell_type": "code",
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
   },
   {
    "cell_type": "code",
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
   },
   {
    "cell_type": "code",
+   "execution_count": null,
    "metadata": {},
+   "outputs": [],
    "source": [
     "metadata"
    ]
   },
   {
    "cell_type": "code",
+   "execution_count": null,
    "metadata": {},
+   "outputs": [],
    "source": [
     "idx = 0\n",
     "# File Path\n",
   },
   {
    "cell_type": "code",
+   "execution_count": null,
    "metadata": {},
+   "outputs": [],
    "source": [
     "# Focus Value\n",
     "metadata.iloc[idx, 5]"
   },
   {
    "cell_type": "code",
+   "execution_count": null,
    "metadata": {},
+   "outputs": [],
    "source": [
     "from importlib.machinery import SourceFileLoader\n",
     "\n",
     "\n",
     "ds = FocusDataSet(\"../data/focus/metadata.csv\", \"../data/focus/\")\n",
     "\n",
     "for d in ds:\n",
+    "    break\n",
     "\n",
     "d"
    ]
   },
   {
    "cell_type": "code",
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
   },
   {
    "cell_type": "code",
+   "execution_count": null,
    "metadata": {},
+   "outputs": [],
    "source": [
     "for data in datamodule.test_dataloader():\n",
     "    break\n",
   },
   {
    "cell_type": "code",
+   "execution_count": null,
    "metadata": {},
+   "outputs": [],
    "source": [
     "import types\n",
     "import importlib.machinery\n",
     "\n",
     "model.step(data)"
    ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Benchmark in-memory and from disk"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import time\n",
+    "\n",
+    "iterations = 10"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "datamodule = FocusDataModule(data_dir=\"../data/focus150\", csv_file=\"../data/focus150/metadata.csv\")\n",
+    "datamodule.setup()\n",
+    "\n",
+    "\n",
+    "start = time.perf_counter()\n",
+    "counter = 0\n",
+    "for i in range(iterations):\n",
+    "    for data in datamodule.train_dataloader():\n",
+    "        counter += 1\n",
+    "\n",
+    "print(time.perf_counter() - start)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "datamodule = FocusDataModule(data_dir=\"../data/focus150\", csv_file=\"../data/focus150/metadata.csv\", in_memory=False)\n",
+    "datamodule.setup()\n",
+    "\n",
+    "start = time.perf_counter()\n",
+    "counter = 0\n",
+    "for i in range(iterations):\n",
+    "    for data in datamodule.train_dataloader():\n",
+    "        counter += 1\n",
+    "print(time.perf_counter() - start)"
+   ]
   }
  ],
  "metadata": {

src/datamodules/focus_datamodule.py CHANGED Viewed

@@ -14,7 +14,7 @@ from torchvision.transforms import transforms
 class FocusDataSet(Dataset):
     """Dataset for z-stacked images of neglected tropical diseases."""
-    def __init__(self, csv_file, root_dir, transform=None):
         """Initialize focus stack dataset.
         Args:
@@ -24,11 +24,23 @@ class FocusDataSet(Dataset):
                 on a sample.
         """
         self.metadata = pd.read_csv(csv_file)
         self.col_index_path = self.metadata.columns.get_loc("image_path")
         self.col_index_focus = self.metadata.columns.get_loc("focus_value")
         self.root_dir = root_dir
         self.transform = transform
     def __len__(self) -> int:
         """Get the length of the dataset.
@@ -49,17 +61,19 @@ class FocusDataSet(Dataset):
         if torch.is_tensor(idx):
             idx = idx.tolist()
-        img_name = os.path.join(
-            self.root_dir, self.metadata.iloc[idx, self.col_index_path]
-        )
-        image = io.imread(img_name)
         focus_value = torch.from_numpy(
             np.asarray(self.metadata.iloc[idx, self.col_index_focus])
         ).float()
-        sample = {"image": image, "focus_value": focus_value}
-        if self.transform:
-            sample["image"] = self.transform(sample["image"])
         return sample
@@ -77,6 +91,7 @@ class FocusDataModule(LightningDataModule):
         batch_size: int = 64,
         num_workers: int = 0,
         pin_memory: bool = False,
     ):
         super().__init__()
@@ -91,6 +106,7 @@ class FocusDataModule(LightningDataModule):
         self.data_train: Optional[Dataset] = None
         self.data_val: Optional[Dataset] = None
         self.data_test: Optional[Dataset] = None
     def prepare_data(self):
         """This method is not implemented as of yet.
@@ -108,7 +124,10 @@ class FocusDataModule(LightningDataModule):
         # load datasets only if they're not loaded already
         if not self.data_train and not self.data_val and not self.data_test:
             dataset = FocusDataSet(
-                self.hparams.csv_file, self.hparams.data_dir, transform=self.transforms
             )
             train_length = int(
                 len(dataset) * self.hparams.train_val_test_split_percentage[0]

 class FocusDataSet(Dataset):
     """Dataset for z-stacked images of neglected tropical diseases."""
+    def __init__(self, csv_file, root_dir, transform=None, in_memory=True):
         """Initialize focus stack dataset.
         Args:
                 on a sample.
         """
         self.metadata = pd.read_csv(csv_file)
+        self.in_memory = in_memory
         self.col_index_path = self.metadata.columns.get_loc("image_path")
         self.col_index_focus = self.metadata.columns.get_loc("focus_value")
         self.root_dir = root_dir
         self.transform = transform
+        self.images = []
+        if self.in_memory:
+            self.images = np.array(
+                list(map(self._load_img, self.metadata["image_path"].tolist()))
+            )
+    def _load_img(self, img_path):
+        path = os.path.join(self.root_dir, img_path)
+        img = io.imread(path)
+        return img
     def __len__(self) -> int:
         """Get the length of the dataset.
         if torch.is_tensor(idx):
             idx = idx.tolist()
+        if self.in_memory:
+            image = self.images[idx]
+        else:
+            image = self._load_img(self.metadata.iloc[idx, self.col_index_path])
+        if self.transform:
+            image = self.transform(image)
         focus_value = torch.from_numpy(
             np.asarray(self.metadata.iloc[idx, self.col_index_focus])
         ).float()
+        sample = {"image": image, "focus_value": focus_value}
         return sample
         batch_size: int = 64,
         num_workers: int = 0,
         pin_memory: bool = False,
+        in_memory: bool = True,
     ):
         super().__init__()
         self.data_train: Optional[Dataset] = None
         self.data_val: Optional[Dataset] = None
         self.data_test: Optional[Dataset] = None
+        self.in_memory = in_memory
     def prepare_data(self):
         """This method is not implemented as of yet.
         # load datasets only if they're not loaded already
         if not self.data_train and not self.data_val and not self.data_test:
             dataset = FocusDataSet(
+                self.hparams.csv_file,
+                self.hparams.data_dir,
+                transform=self.transforms,
+                in_memory=self.in_memory,
             )
             train_length = int(
                 len(dataset) * self.hparams.train_val_test_split_percentage[0]