diff --git "a/training_testing_logs.ipynb" "b/training_testing_logs.ipynb"
new file mode 100644
--- /dev/null
+++ "b/training_testing_logs.ipynb"
@@ -0,0 +1,9585 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "a534038a",
+   "metadata": {},
+   "source": [
+    "# Random 80/20 train/test split"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "9041ce76",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "import json\n",
+    "\n",
+    "def prepare_data_from_dataframe(df: pd.DataFrame, output_file: str) -> str:\n",
+    "    \"\"\"Prepare data for finetuning by reading from a DataFrame.\"\"\"\n",
+    "    data = []\n",
+    "\n",
+    "    # Process data\n",
+    "    for _, row in df.iterrows():\n",
+    "        image_path = f\"../data/{row['max_key']}/{row['path'].split('/')[-1]}\"\n",
+    "        data.append({\"image\": image_path, \"caption\": row['max_key']})\n",
+    "\n",
+    "    # Save the data in JSON format\n",
+    "    with open(output_file, \"w\") as f:\n",
+    "        for item in data:\n",
+    "            json.dump(item, f)\n",
+    "            f.write(\"\\n\")\n",
+    "\n",
+    "    return output_file\n",
+    "\n",
+    "\n",
+    "# Load the CSV file\n",
+    "df = pd.read_csv('labels.csv')\n",
+    "# Filter out specific categories\n",
+    "df = df[df['max_key'] != 'error']\n",
+    "df = df[df['max_key'] != 'a photo of other indoor space: not kitchen, not bathroom, not living room, not dining room, not foyer']\n",
+    "df = df[df['max_key'] != 'it is a artificial photo']\n",
+    "df = df[df['max_key'] != 'a photo of outdoor space']\n",
+    "# Keep only samples with max_value > 0.9\n",
+    "threshold_df = df[df['max_value'] > 0.9]\n",
+    "\n",
+    "# Split data into train and test sets\n",
+    "train_df, test_df = train_test_split(threshold_df, test_size=0.2, random_state=42)\n",
+    "\n",
+    "# Now use the train_df to prepare your training data\n",
+    "train_all_json = prepare_data_from_dataframe(train_df, 'train_random.json')\n",
+    "# And test_df to prepare your testing data\n",
+    "test_json = prepare_data_from_dataframe(test_df, 'val_random.json')\n",
+    "\n",
+    "# Both output files are JSON Lines: one image/caption record per line\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "bbbb5b9b",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration. Please open a PR/issue to update `preprocessor_config.json` to use `image_processor_type` instead of `feature_extractor_type`. This warning will be removed in v4.40.\n",
+      "\n",
+      "KeyboardInterrupt\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Evaluate the original (not fine-tuned) checkpoint on the random validation split\n",
+    "import pandas as pd\n",
+    "from collections import Counter\n",
+    "import os\n",
+    "import json\n",
+    "from transformers import pipeline, AutoTokenizer, AutoConfig, AutoModel, AutoImageProcessor\n",
+    "from PIL import Image, UnidentifiedImageError\n",
+    "import matplotlib.pyplot as plt\n",
+    "\n",
+    "def chunks(lst, n):\n",
+    "    \"\"\"Yield successive n-sized chunks from lst.\"\"\"\n",
+    "    for i in range(0, len(lst), n):\n",
+    "        yield lst[i:i + n]\n",
+    "\n",
+    "def calculate_category_accuracy(true_labels, predicted_labels):\n",
+    "    \"\"\"Calculate the accuracy for each category and return it as a dictionary.\"\"\"\n",
+    "    accuracies = {}\n",
+    "    true_labels_counter = Counter(true_labels)\n",
+    "    correct_predictions_counter = Counter([true for true, pred in zip(true_labels, predicted_labels) if true == pred])\n",
+    "    \n",
+    "    for label in true_labels_counter:\n",
+    "        accuracy = (correct_predictions_counter[label] / true_labels_counter[label]) if label in correct_predictions_counter else 0\n",
+    "        accuracies[label] = accuracy\n",
+    "    \n",
+    "    return accuracies\n",
+    "\n",
+    "# Load the JSON data\n",
+    "with open('val_random.json', 'r') as f:\n",
+    "    data = [json.loads(line) for line in f]\n",
+    "\n",
+    "# Extract image paths and labels\n",
+    "image_paths = [item['image'] for item in data]\n",
+    "labels = [item['caption'] for item in data]\n",
+    "\n",
+    "BATCH_SIZE = 128  # Adjust based on your available memory\n",
+    "repo_id = \"laion/CLIP-ViT-B-32-laion2B-s34B-b79K\"\n",
+    "\n",
+    "\n",
+    "image_processor = AutoImageProcessor.from_pretrained(repo_id)\n",
+    "tokenizer = AutoTokenizer.from_pretrained(repo_id, config=AutoConfig.from_pretrained(repo_id))\n",
+    "model = AutoModel.from_pretrained(repo_id)\n",
+    "clip_pipeline = pipeline(model=model, task=\"zero-shot-image-classification\", tokenizer=tokenizer,\n",
+    "                         device=1, image_processor=image_processor, config=AutoConfig.from_pretrained(repo_id))\n",
+    "\n",
+    "all_predictions = []\n",
+    "all_true_labels = []\n",
+    "\n",
+    "for batch_paths, batch_labels in zip(chunks(image_paths, BATCH_SIZE), chunks(labels, BATCH_SIZE)):\n",
+    "    batch_images = []\n",
+    "    valid_labels = []\n",
+    "    for path, label in zip(batch_paths, batch_labels):\n",
+    "        try:\n",
+    "            batch_images.append(Image.open(path))\n",
+    "            valid_labels.append(label)\n",
+    "        except (FileNotFoundError, UnidentifiedImageError):\n",
+    "            continue  # Skip images that cannot be opened\n",
+    "\n",
+    "    # Get predictions for the batch; the candidate labels are this batch's own true labels (duplicates included)\n",
+    "    predictions = clip_pipeline(images=batch_images, candidate_labels=valid_labels)\n",
+    "    predicted_labels = [pred[0]['label'] for pred in predictions]  # Top prediction\n",
+    "\n",
+    "    all_predictions.extend(predicted_labels)\n",
+    "    all_true_labels.extend(valid_labels)\n",
+    "\n",
+    "correct_predictions = sum([true == pred for true, pred in zip(all_true_labels, all_predictions)])\n",
+    "accuracy = correct_predictions / len(all_true_labels)\n",
+    "print(f\"Accuracy for model org: {accuracy * 100:.2f}%\\n\")\n",
+    "\n",
+    "# Calculate the accuracy for each category\n",
+    "category_accuracies = calculate_category_accuracy(all_true_labels, all_predictions)\n",
+    "\n",
+    "# Print the accuracy for each category\n",
+    "print(\"Accuracy for each category:\")\n",
+    "for category, accuracy in category_accuracies.items():\n",
+    "    print(f\"{category}: {accuracy:.2f}\")\n",
+    "\n",
+    "# Convert the dictionary to a DataFrame\n",
+    "category_accuracy_df = pd.DataFrame(list(category_accuracies.items()), columns=['Category', 'Accuracy'])\n",
+    "\n",
+    "# Save the DataFrame to a CSV file\n",
+    "category_accuracy_df.to_csv('category_accuracy.csv', index=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "13288c96",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay\n",
+    "unique_labels = list(set(all_true_labels))\n",
+    "# Compute the confusion matrix\n",
+    "cm = confusion_matrix(all_true_labels, all_predictions, labels=unique_labels)\n",
+    "\n",
+    "# Display the confusion matrix\n",
+    "fig, ax = plt.subplots(figsize=(10, 10))  # Adjust the size as needed\n",
+    "disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=unique_labels)\n",
+    "disp.plot(ax=ax)\n",
+    "\n",
+    "# Rotate the x-axis labels to display them vertically\n",
+    "plt.xticks(rotation=90)\n",
+    "\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "ca9fb408",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Train file for this fold: train_fold_0.json\n",
+      "Test file for this fold: val_fold_0.json\n",
+      "Train file for this fold: train_fold_1.json\n",
+      "Test file for this fold: val_fold_1.json\n",
+      "Train file for this fold: train_fold_2.json\n",
+      "Test file for this fold: val_fold_2.json\n",
+      "Train file for this fold: train_fold_3.json\n",
+      "Test file for this fold: val_fold_3.json\n",
+      "Train file for this fold: train_fold_4.json\n",
+      "Test file for this fold: val_fold_4.json\n"
+     ]
+    }
+   ],
+   "source": [
+    "import pandas as pd\n",
+    "from sklearn.model_selection import KFold\n",
+    "import json\n",
+    "\n",
+    "def prepare_data_from_dataframe(df, output_file):\n",
+    "    \"\"\"Prepare data for fine-tuning by reading from a DataFrame.\"\"\"\n",
+    "    data = []\n",
+    "    # Process data\n",
+    "    for _, row in df.iterrows():\n",
+    "        image_path = f\"../data/{row['max_key']}/{row['path'].split('/')[-1]}\"\n",
+    "        data.append({\"image\": image_path, \"caption\": row['max_key']})\n",
+    "    # Save the data in JSON format\n",
+    "    with open(output_file, \"w\") as f:\n",
+    "        for item in data:\n",
+    "            json.dump(item, f)\n",
+    "            f.write(\"\\n\")\n",
+    "\n",
+    "def perform_k_fold(df, n_splits):\n",
+    "    \"\"\"Perform K-fold split and data preparation, including filtering by confidence thresholds.\"\"\"\n",
+    "    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)\n",
+    "    fold_data_info = []\n",
+    "    fold_counter = 0\n",
+    "\n",
+    "    for train_index, test_index in kf.split(df):\n",
+    "        train_df, test_df = df.iloc[train_index], df.iloc[test_index]\n",
+    "        test_json = f'val_fold_{fold_counter}.json'\n",
+    "        prepare_data_from_dataframe(test_df, test_json)\n",
+    "        \n",
+    "        # Create one training JSON file per confidence threshold from 0.90 to 0.99 (rows with max_value > threshold)\n",
+    "        for threshold in range(90, 100):\n",
+    "            threshold_df = train_df[train_df['max_value'] > threshold / 100.0]\n",
+    "            threshold_train_json = f'train_fold_{fold_counter}_thr_{threshold}.json'\n",
+    "            prepare_data_from_dataframe(threshold_df, threshold_train_json)\n",
+    "            fold_data_info.append((threshold_train_json, test_json))\n",
+    "        \n",
+    "        fold_counter += 1\n",
+    "\n",
+    "    return fold_data_info\n",
+    "\n",
+    "# Load and preprocess the DataFrame\n",
+    "df = pd.read_csv('labels.csv')\n",
+    "df = df[df['max_key'] != 'error']\n",
+    "df = df[df['max_key'] != 'a photo of other indoor space: not kitchen, not bathroom, not living room, not dining room, not foyer']\n",
+    "df = df[df['max_key'] != 'it is a artificial photo']\n",
+    "df = df[df['max_key'] != 'a photo of outdoor space']\n",
+    "df = df[df['max_value'] > 0.9]\n",
+    "\n",
+    "# Perform the 5-fold split and data preparation including threshold filtering\n",
+    "fold_files = perform_k_fold(df, 5)\n",
+    "\n",
+    "# Print out the file names for each fold and threshold\n",
+    "for train_file, test_file in fold_files:\n",
+    "    print(f\"Train file for this fold and threshold: {train_file}\")\n",
+    "    print(f\"Test file for this fold: {test_file}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "ae907a13",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/transformers/pipelines/base.py:1123: UserWarning: You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset\n",
+      "  warnings.warn(\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Accuracy for model on fold 0: 83.15%\n",
+      "\n",
+      "Accuracy for model on fold 1: 84.30%\n",
+      "\n",
+      "    Fold                             Category  Accuracy\n",
+      "0      0        a photo of contemporary foyer  0.804124\n",
+      "1      0          a photo of standard kitchen  1.000000\n",
+      "2      0     a photo of contemporary bathroom  0.667910\n",
+      "3      0            a photo of standard foyer  0.868794\n",
+      "4      0         a photo of standard bathroom  0.994197\n",
+      "5      0  a photo of contemporary dining room  0.952632\n",
+      "6      0      a photo of standard living room  0.986486\n",
+      "7      0  a photo of contemporary living room  0.534050\n",
+      "8      0      a photo of contemporary kitchen  0.689655\n",
+      "9      0      a photo of standard dining room  1.000000\n",
+      "10     0                              Overall  0.831540\n",
+      "11     1         a photo of standard bathroom  0.988539\n",
+      "12     1  a photo of contemporary dining room  0.957871\n",
+      "13     1  a photo of contemporary living room  0.597701\n",
+      "14     1            a photo of standard foyer  0.845896\n",
+      "15     1      a photo of contemporary kitchen  0.712446\n",
+      "16     1          a photo of standard kitchen  0.997375\n",
+      "17     1     a photo of contemporary bathroom  0.725490\n",
+      "18     1        a photo of contemporary foyer  0.802030\n",
+      "19     1      a photo of standard living room  0.968153\n",
+      "20     1      a photo of standard dining room  1.000000\n",
+      "21     1                              Overall  0.843030\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Evaluate the original (not fine-tuned) checkpoint on the K-fold validation files\n",
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "from collections import Counter\n",
+    "import json\n",
+    "from transformers import pipeline, AutoTokenizer, AutoConfig, AutoModel, AutoImageProcessor\n",
+    "from PIL import Image, UnidentifiedImageError\n",
+    "\n",
+    "def chunks(lst, n):\n",
+    "    \"\"\"Yield successive n-sized chunks from lst.\"\"\"\n",
+    "    for i in range(0, len(lst), n):\n",
+    "        yield lst[i:i + n]\n",
+    "\n",
+    "def calculate_category_accuracy(true_labels, predicted_labels):\n",
+    "    \"\"\"Calculate the accuracy for each category and return it as a dictionary.\"\"\"\n",
+    "    accuracies = {}\n",
+    "    true_labels_counter = Counter(true_labels)\n",
+    "    correct_predictions_counter = Counter([true for true, pred in zip(true_labels, predicted_labels) if true == pred])\n",
+    "    \n",
+    "    for label in true_labels_counter:\n",
+    "        accuracy = (correct_predictions_counter[label] / true_labels_counter[label]) if label in correct_predictions_counter else 0\n",
+    "        accuracies[label] = accuracy\n",
+    "    \n",
+    "    return accuracies\n",
+    "\n",
+    "# Load model components\n",
+    "repo_id = \"laion/CLIP-ViT-B-32-laion2B-s34B-b79K\"\n",
+    "image_processor = AutoImageProcessor.from_pretrained(repo_id)\n",
+    "tokenizer = AutoTokenizer.from_pretrained(repo_id, config=AutoConfig.from_pretrained(repo_id))\n",
+    "model = AutoModel.from_pretrained(repo_id)\n",
+    "clip_pipeline = pipeline(model=model, task=\"zero-shot-image-classification\", tokenizer=tokenizer,\n",
+    "                         device=1, image_processor=image_processor, config=AutoConfig.from_pretrained(repo_id))\n",
+    "\n",
+    "# List of per-fold accuracy records (converted to a DataFrame after the loop)\n",
+    "all_accuracies = []\n",
+    "\n",
+    "# Loop over the first two folds (0 and 1) and test\n",
+    "for fold in range(2):\n",
+    "    # Load the JSON data for the current fold\n",
+    "    with open(f'val_fold_{fold}.json', 'r') as f:\n",
+    "        data = [json.loads(line) for line in f]\n",
+    "\n",
+    "    # Extract image paths and labels for the current fold\n",
+    "    image_paths = [item['image'] for item in data]\n",
+    "    labels = [item['caption'] for item in data]\n",
+    "\n",
+    "    BATCH_SIZE = 128  # Adjust based on your available memory\n",
+    "    all_predictions = []\n",
+    "    all_true_labels = []\n",
+    "\n",
+    "    for batch_paths, batch_labels in zip(chunks(image_paths, BATCH_SIZE), chunks(labels, BATCH_SIZE)):\n",
+    "        batch_images = []\n",
+    "        valid_labels = []\n",
+    "        for path, label in zip(batch_paths, batch_labels):\n",
+    "            try:\n",
+    "                batch_images.append(Image.open(path))\n",
+    "                valid_labels.append(label)\n",
+    "            except (FileNotFoundError, UnidentifiedImageError):\n",
+    "                continue  # Skip images that cannot be opened\n",
+    "\n",
+    "        # Get predictions for the batch; the candidate labels are this batch's own true labels (duplicates included)\n",
+    "        predictions = clip_pipeline(images=batch_images, candidate_labels=valid_labels)\n",
+    "        predicted_labels = [pred[0]['label'] for pred in predictions]  # Top prediction\n",
+    "\n",
+    "        all_predictions.extend(predicted_labels)\n",
+    "        all_true_labels.extend(valid_labels)\n",
+    "\n",
+    "    correct_predictions = sum([true == pred for true, pred in zip(all_true_labels, all_predictions)])\n",
+    "    accuracy = correct_predictions / len(all_true_labels)\n",
+    "    print(f\"Accuracy for model on fold {fold}: {accuracy * 100:.2f}%\\n\")\n",
+    "\n",
+    "    # Calculate the accuracy for each category for the current fold\n",
+    "    category_accuracies = calculate_category_accuracy(all_true_labels, all_predictions)\n",
+    "\n",
+    "    # Store the accuracies in a list of dictionaries\n",
+    "    for category, acc in category_accuracies.items():\n",
+    "        all_accuracies.append({'Fold': fold, 'Category': category, 'Accuracy': acc})\n",
+    "    # Add overall accuracy for the current fold to the list\n",
+    "    all_accuracies.append({'Fold': fold, 'Category': 'Overall', 'Accuracy': accuracy})\n",
+    "\n",
+    "# Convert the list of dictionaries to a DataFrame\n",
+    "all_accuracies_df = pd.DataFrame(all_accuracies)\n",
+    "\n",
+    "# Save the DataFrame to a CSV file\n",
+    "all_accuracies_df.to_csv('./result/org_results.csv', index=False)\n",
+    "\n",
+    "# Print out the final DataFrame\n",
+    "print(all_accuracies_df)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "87a5f0e5",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Finetuning laion/CLIP-ViT-B-32-laion2B-s34B-b79K for fold 0, threshold > 0.90, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_fold0_threshold1.\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "5fdf3bf8356042f0bae130c32b271de2",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "217e723629f843f98f82644deed7b7c1",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "0518e4f2308e41ab972ea6c920f86270",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating train split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "0a17075b076a4dc7a75f34f2fe599492",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating validation split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "First image: ../data/a photo of standard living room/IMG-C5471443_15.jpg, caption: 'a photo of standard living room'\n",
+      "04/16/2024 15:32:43 - WARNING - __main__ - Process rank: 0, device: cuda:0, n_gpu: 3distributed training: True, 16-bits training: False\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/datasets/load.py:2072: FutureWarning: 'use_auth_token' was deprecated in favor of 'token' in version 2.14.0 and will be removed in 3.0.0.\n",
+      "You can remove this warning by passing 'token=None' instead.\n",
+      "  warnings.warn(\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/transformers/models/clip/feature_extraction_clip.py:28: FutureWarning: The class CLIPFeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please use CLIPImageProcessor instead.\n",
+      "  warnings.warn(\n",
+      "Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7fe5b8694dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "04/16/2024 15:32:44 - WARNING - datasets.fingerprint - Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7fe5b8694dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "Filter: 100%|███████████████████| 18410/18410 [00:00<00:00, 27724.21 examples/s]\n",
+      "Running tokenizer on train dataset: 100%|█| 18185/18185 [00:00<00:00, 19921.85 e\n",
+      "  0%|                                                   | 0/470 [00:00<?, ?it/s]/home/haojin/.local/lib/python3.10/site-packages/torch/nn/modules/module.py:1501: UserWarning: operator() profile_node %611 : int = prim::profile_ivalue(%dtype)\n",
+      " does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)\n",
+      "  return forward_call(*args, **kwargs)\n",
+      "/home/haojin/.local/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
+      "  warnings.warn('Was asked to gather along dimension 0, but all '\n",
+      "{'loss': 3.1128, 'learning_rate': 9e-07, 'epoch': 1.0}                          \n",
+      "{'loss': 2.9296, 'learning_rate': 8e-07, 'epoch': 2.0}                          \n",
+      "{'loss': 2.8839, 'learning_rate': 7e-07, 'epoch': 3.0}                          \n",
+      "{'loss': 2.8626, 'learning_rate': 6e-07, 'epoch': 4.0}                          \n",
+      "{'loss': 2.8445, 'learning_rate': 5e-07, 'epoch': 5.0}                          \n",
+      "{'loss': 2.8329, 'learning_rate': 4e-07, 'epoch': 6.0}                          \n",
+      "{'loss': 2.8247, 'learning_rate': 3e-07, 'epoch': 7.0}                          \n",
+      "{'loss': 2.821, 'learning_rate': 2e-07, 'epoch': 8.0}                           \n",
+      "{'loss': 2.8156, 'learning_rate': 1e-07, 'epoch': 9.0}                          \n",
+      "{'loss': 2.818, 'learning_rate': 0.0, 'epoch': 10.0}                            \n",
+      "{'train_runtime': 5000.2055, 'train_samples_per_second': 36.369, 'train_steps_per_second': 0.094, 'train_loss': 2.8745682412005484, 'epoch': 10.0}\n",
+      "100%|███████████████████████████████████████| 470/470 [1:23:20<00:00, 10.64s/it]\n",
+      "***** train metrics *****\n",
+      "  epoch                    =       10.0\n",
+      "  train_loss               =     2.8746\n",
+      "  train_runtime            = 1:23:20.20\n",
+      "  train_samples_per_second =     36.369\n",
+      "  train_steps_per_second   =      0.094\n",
+      "--\n",
+      "DONE. If it worked, trained data should be in ./workspace/output/laion-finetuned_v5e7_epoch10_fold0_threshold1\n",
+      "\n",
+      "Finetuning laion/CLIP-ViT-B-32-laion2B-s34B-b79K for fold 0, threshold > 0.91, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_fold0_threshold2.\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "38c113eb1dfe4d9ebf7b32f3326376e8",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "7791bdf95df7493f92c8d2d6f0869365",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "142a388ac1df44a4a2fd6a498fbc93c4",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating train split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "753b156520c1465c8aa47cf9c685502c",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating validation split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "First image: ../data/a photo of standard living room/IMG-C5471443_15.jpg, caption: 'a photo of standard living room'\n",
+      "04/16/2024 16:56:11 - WARNING - __main__ - Process rank: 0, device: cuda:0, n_gpu: 3distributed training: True, 16-bits training: False\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/datasets/load.py:2072: FutureWarning: 'use_auth_token' was deprecated in favor of 'token' in version 2.14.0 and will be removed in 3.0.0.\n",
+      "You can remove this warning by passing 'token=None' instead.\n",
+      "  warnings.warn(\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/transformers/models/clip/feature_extraction_clip.py:28: FutureWarning: The class CLIPFeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please use CLIPImageProcessor instead.\n",
+      "  warnings.warn(\n",
+      "Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7f7777490dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "04/16/2024 16:56:13 - WARNING - datasets.fingerprint - Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7f7777490dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "Filter: 100%|███████████████████| 16981/16981 [00:00<00:00, 27929.24 examples/s]\n",
+      "Running tokenizer on train dataset: 100%|█| 16772/16772 [00:00<00:00, 19787.47 e\n",
+      "  0%|                                                   | 0/430 [00:00<?, ?it/s]/home/haojin/.local/lib/python3.10/site-packages/torch/nn/modules/module.py:1501: UserWarning: operator() profile_node %611 : int = prim::profile_ivalue(%dtype)\n",
+      " does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)\n",
+      "  return forward_call(*args, **kwargs)\n",
+      "/home/haojin/.local/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
+      "  warnings.warn('Was asked to gather along dimension 0, but all '\n",
+      "{'loss': 3.124, 'learning_rate': 9e-07, 'epoch': 1.0}                           \n",
+      "{'loss': 2.9336, 'learning_rate': 8e-07, 'epoch': 2.0}                          \n",
+      "{'loss': 2.8901, 'learning_rate': 7e-07, 'epoch': 3.0}                          \n",
+      "{'loss': 2.8663, 'learning_rate': 6e-07, 'epoch': 4.0}                          \n",
+      "{'loss': 2.8506, 'learning_rate': 5e-07, 'epoch': 5.0}                          \n",
+      "{'loss': 2.8384, 'learning_rate': 4e-07, 'epoch': 6.0}                          \n",
+      "{'loss': 2.8319, 'learning_rate': 3e-07, 'epoch': 7.0}                          \n",
+      "{'loss': 2.8231, 'learning_rate': 2e-07, 'epoch': 8.0}                          \n",
+      "{'loss': 2.8213, 'learning_rate': 1e-07, 'epoch': 9.0}                          \n",
+      "{'loss': 2.8202, 'learning_rate': 0.0, 'epoch': 10.0}                           \n",
+      "{'train_runtime': 4549.8459, 'train_samples_per_second': 36.863, 'train_steps_per_second': 0.095, 'train_loss': 2.8799443621968113, 'epoch': 10.0}\n",
+      "100%|███████████████████████████████████████| 430/430 [1:15:49<00:00, 10.58s/it]\n",
+      "***** train metrics *****\n",
+      "  epoch                    =       10.0\n",
+      "  train_loss               =     2.8799\n",
+      "  train_runtime            = 1:15:49.84\n",
+      "  train_samples_per_second =     36.863\n",
+      "  train_steps_per_second   =      0.095\n",
+      "--\n",
+      "DONE. If it worked, trained data should be in ./workspace/output/laion-finetuned_v5e7_epoch10_fold0_threshold2\n",
+      "\n",
+      "Finetuning laion/CLIP-ViT-B-32-laion2B-s34B-b79K for fold 0, threshold > 0.92, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_fold0_threshold3.\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "bee210fcdc8c42328c06ffefbff9fcbc",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "4f945c4a55a64634a74d80ced79d1e06",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "5ed4e883f2674f97a1a5a436ea1049bf",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating train split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "4bd3ef8f28504de18d510c16bad53a1d",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating validation split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "First image: ../data/a photo of standard living room/IMG-C5471443_15.jpg, caption: 'a photo of standard living room'\n",
+      "04/16/2024 18:12:09 - WARNING - __main__ - Process rank: 0, device: cuda:0, n_gpu: 3distributed training: True, 16-bits training: False\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/datasets/load.py:2072: FutureWarning: 'use_auth_token' was deprecated in favor of 'token' in version 2.14.0 and will be removed in 3.0.0.\n",
+      "You can remove this warning by passing 'token=None' instead.\n",
+      "  warnings.warn(\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/transformers/models/clip/feature_extraction_clip.py:28: FutureWarning: The class CLIPFeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please use CLIPImageProcessor instead.\n",
+      "  warnings.warn(\n",
+      "Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7fb1052e0dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "04/16/2024 18:12:10 - WARNING - datasets.fingerprint - Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7fb1052e0dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "Filter: 100%|███████████████████| 15451/15451 [00:00<00:00, 28962.66 examples/s]\n",
+      "Running tokenizer on train dataset: 100%|█| 15258/15258 [00:00<00:00, 21247.67 e\n",
+      "  0%|                                                   | 0/390 [00:00<?, ?it/s]/home/haojin/.local/lib/python3.10/site-packages/torch/nn/modules/module.py:1501: UserWarning: operator() profile_node %611 : int = prim::profile_ivalue(%dtype)\n",
+      " does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)\n",
+      "  return forward_call(*args, **kwargs)\n",
+      "/home/haojin/.local/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
+      "  warnings.warn('Was asked to gather along dimension 0, but all '\n",
+      "{'loss': 3.1249, 'learning_rate': 9e-07, 'epoch': 1.0}                          \n",
+      "{'loss': 2.9372, 'learning_rate': 8e-07, 'epoch': 2.0}                          \n",
+      "{'loss': 2.8912, 'learning_rate': 7e-07, 'epoch': 3.0}                          \n",
+      "{'loss': 2.8686, 'learning_rate': 6e-07, 'epoch': 4.0}                          \n",
+      "{'loss': 2.8503, 'learning_rate': 5e-07, 'epoch': 5.0}                          \n",
+      "{'loss': 2.8397, 'learning_rate': 4e-07, 'epoch': 6.0}                          \n",
+      "{'loss': 2.8334, 'learning_rate': 3e-07, 'epoch': 7.0}                          \n",
+      "{'loss': 2.8271, 'learning_rate': 2e-07, 'epoch': 8.0}                          \n",
+      "{'loss': 2.8268, 'learning_rate': 1e-07, 'epoch': 9.0}                          \n",
+      "{'loss': 2.82, 'learning_rate': 0.0, 'epoch': 10.0}                             \n",
+      "{'train_runtime': 3987.6216, 'train_samples_per_second': 38.263, 'train_steps_per_second': 0.098, 'train_loss': 2.881923577724359, 'epoch': 10.0}\n",
+      "100%|███████████████████████████████████████| 390/390 [1:06:27<00:00, 10.22s/it]\n",
+      "***** train metrics *****\n",
+      "  epoch                    =       10.0\n",
+      "  train_loss               =     2.8819\n",
+      "  train_runtime            = 1:06:27.62\n",
+      "  train_samples_per_second =     38.263\n",
+      "  train_steps_per_second   =      0.098\n",
+      "--\n",
+      "DONE. If it worked, trained data should be in ./workspace/output/laion-finetuned_v5e7_epoch10_fold0_threshold3\n",
+      "\n",
+      "Finetuning laion/CLIP-ViT-B-32-laion2B-s34B-b79K for fold 0, threshold > 0.93, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_fold0_threshold4.\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "83665d03d3e64c3abd4cfaa3caa82971",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "10b922f11d3d418dab043c475b4c69c0",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "f57f6db72d8c41569a63128774f8c3ba",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating train split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "0e8884c517214ee2af8f21b785c7b82d",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating validation split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "First image: ../data/a photo of standard living room/IMG-C5471443_15.jpg, caption: 'a photo of standard living room'\n",
+      "04/16/2024 19:18:45 - WARNING - __main__ - Process rank: 0, device: cuda:0, n_gpu: 3distributed training: True, 16-bits training: False\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/datasets/load.py:2072: FutureWarning: 'use_auth_token' was deprecated in favor of 'token' in version 2.14.0 and will be removed in 3.0.0.\n",
+      "You can remove this warning by passing 'token=None' instead.\n",
+      "  warnings.warn(\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/transformers/models/clip/feature_extraction_clip.py:28: FutureWarning: The class CLIPFeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please use CLIPImageProcessor instead.\n",
+      "  warnings.warn(\n",
+      "Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7f7ea3324ee0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "04/16/2024 19:18:46 - WARNING - datasets.fingerprint - Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7f7ea3324ee0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "Filter: 100%|███████████████████| 13850/13850 [00:00<00:00, 29143.26 examples/s]\n",
+      "Running tokenizer on train dataset: 100%|█| 13679/13679 [00:00<00:00, 21199.23 e\n",
+      "  0%|                                                   | 0/350 [00:00<?, ?it/s]/home/haojin/.local/lib/python3.10/site-packages/torch/nn/modules/module.py:1501: UserWarning: operator() profile_node %611 : int = prim::profile_ivalue(%dtype)\n",
+      " does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)\n",
+      "  return forward_call(*args, **kwargs)\n",
+      "/home/haojin/.local/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
+      "  warnings.warn('Was asked to gather along dimension 0, but all '\n",
+      "{'loss': 3.1247, 'learning_rate': 9e-07, 'epoch': 1.0}                          \n",
+      "{'loss': 2.9428, 'learning_rate': 8e-07, 'epoch': 2.0}                          \n",
+      "{'loss': 2.8945, 'learning_rate': 7e-07, 'epoch': 3.0}                          \n",
+      "{'loss': 2.8685, 'learning_rate': 6e-07, 'epoch': 4.0}                          \n",
+      "{'loss': 2.8509, 'learning_rate': 5e-07, 'epoch': 5.0}                          \n",
+      "{'loss': 2.8402, 'learning_rate': 4e-07, 'epoch': 6.0}                          \n",
+      "{'loss': 2.836, 'learning_rate': 3e-07, 'epoch': 7.0}                           \n",
+      "{'loss': 2.8298, 'learning_rate': 2e-07, 'epoch': 8.0}                          \n",
+      "{'loss': 2.8262, 'learning_rate': 1e-07, 'epoch': 9.0}                          \n",
+      "{'loss': 2.8239, 'learning_rate': 0.0, 'epoch': 10.0}                           \n",
+      "{'train_runtime': 3584.5852, 'train_samples_per_second': 38.161, 'train_steps_per_second': 0.098, 'train_loss': 2.883748452322824, 'epoch': 10.0}\n",
+      "100%|█████████████████████████████████████████| 350/350 [59:44<00:00, 10.24s/it]\n",
+      "***** train metrics *****\n",
+      "  epoch                    =       10.0\n",
+      "  train_loss               =     2.8837\n",
+      "  train_runtime            = 0:59:44.58\n",
+      "  train_samples_per_second =     38.161\n",
+      "  train_steps_per_second   =      0.098\n",
+      "--\n",
+      "DONE. If it worked, trained data should be in ./workspace/output/laion-finetuned_v5e7_epoch10_fold0_threshold4\n",
+      "\n",
+      "Finetuning laion/CLIP-ViT-B-32-laion2B-s34B-b79K for fold 0, threshold > 0.94, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_fold0_threshold5.\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "7157deff57a74b6c8b5b6503962ba4e2",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "73e2511d95614804b03c79d8fcea66a8",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "f35e50f336514c96b4206c74fb103589",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating train split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "ffcf2ed3a8ca4fef83fe3e3905fc543e",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating validation split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "First image: ../data/a photo of standard living room/IMG-C5471443_15.jpg, caption: 'a photo of standard living room'\n",
+      "04/16/2024 20:18:37 - WARNING - __main__ - Process rank: 0, device: cuda:0, n_gpu: 3distributed training: True, 16-bits training: False\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/datasets/load.py:2072: FutureWarning: 'use_auth_token' was deprecated in favor of 'token' in version 2.14.0 and will be removed in 3.0.0.\n",
+      "You can remove this warning by passing 'token=None' instead.\n",
+      "  warnings.warn(\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/transformers/models/clip/feature_extraction_clip.py:28: FutureWarning: The class CLIPFeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please use CLIPImageProcessor instead.\n",
+      "  warnings.warn(\n",
+      "Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7f1718ff8dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "04/16/2024 20:18:38 - WARNING - datasets.fingerprint - Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7f1718ff8dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "Filter: 100%|███████████████████| 12273/12273 [00:00<00:00, 29537.34 examples/s]\n",
+      "Running tokenizer on train dataset: 100%|█| 12122/12122 [00:00<00:00, 21387.33 e\n",
+      "  0%|                                                   | 0/310 [00:00<?, ?it/s]/home/haojin/.local/lib/python3.10/site-packages/torch/nn/modules/module.py:1501: UserWarning: operator() profile_node %611 : int = prim::profile_ivalue(%dtype)\n",
+      " does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)\n",
+      "  return forward_call(*args, **kwargs)\n",
+      "/home/haojin/.local/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
+      "  warnings.warn('Was asked to gather along dimension 0, but all '\n",
+      "{'loss': 3.143, 'learning_rate': 9e-07, 'epoch': 1.0}                           \n",
+      "{'loss': 2.9466, 'learning_rate': 8e-07, 'epoch': 2.0}                          \n",
+      "{'loss': 2.9004, 'learning_rate': 7e-07, 'epoch': 3.0}                          \n",
+      "{'loss': 2.8732, 'learning_rate': 6e-07, 'epoch': 4.0}                          \n",
+      "{'loss': 2.8586, 'learning_rate': 5e-07, 'epoch': 5.0}                          \n",
+      "{'loss': 2.8484, 'learning_rate': 4e-07, 'epoch': 6.0}                          \n",
+      "{'loss': 2.841, 'learning_rate': 3e-07, 'epoch': 7.0}                           \n",
+      "{'loss': 2.8363, 'learning_rate': 2e-07, 'epoch': 8.0}                          \n",
+      "{'loss': 2.8333, 'learning_rate': 1e-07, 'epoch': 9.0}                          \n",
+      "{'loss': 2.833, 'learning_rate': 0.0, 'epoch': 10.0}                            \n",
+      "{'train_runtime': 3186.3617, 'train_samples_per_second': 38.043, 'train_steps_per_second': 0.097, 'train_loss': 2.8913872011246218, 'epoch': 10.0}\n",
+      "100%|█████████████████████████████████████████| 310/310 [53:06<00:00, 10.28s/it]\n",
+      "***** train metrics *****\n",
+      "  epoch                    =       10.0\n",
+      "  train_loss               =     2.8914\n",
+      "  train_runtime            = 0:53:06.36\n",
+      "  train_samples_per_second =     38.043\n",
+      "  train_steps_per_second   =      0.097\n",
+      "--\n",
+      "DONE. If it worked, trained data should be in ./workspace/output/laion-finetuned_v5e7_epoch10_fold0_threshold5\n",
+      "\n",
+      "Finetuning laion/CLIP-ViT-B-32-laion2B-s34B-b79K for fold 0, threshold > 0.95, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_fold0_threshold6.\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "f0cc8510c99a4235b0d377829345281d",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "b245a204344a4412be58b022c560e052",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "2a706a767b534a00af00bc633dbb075d",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating train split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "08f2d2a8d7154a9c9409fccbea847d8d",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating validation split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "First image: ../data/a photo of standard living room/IMG-C5471443_15.jpg, caption: 'a photo of standard living room'\n",
+      "04/16/2024 21:11:51 - WARNING - __main__ - Process rank: 0, device: cuda:0, n_gpu: 3distributed training: True, 16-bits training: False\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/datasets/load.py:2072: FutureWarning: 'use_auth_token' was deprecated in favor of 'token' in version 2.14.0 and will be removed in 3.0.0.\n",
+      "You can remove this warning by passing 'token=None' instead.\n",
+      "  warnings.warn(\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/transformers/models/clip/feature_extraction_clip.py:28: FutureWarning: The class CLIPFeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please use CLIPImageProcessor instead.\n",
+      "  warnings.warn(\n",
+      "Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7f50600b4dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "04/16/2024 21:11:52 - WARNING - datasets.fingerprint - Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7f50600b4dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "Filter: 100%|███████████████████| 10547/10547 [00:00<00:00, 29168.34 examples/s]\n",
+      "Running tokenizer on train dataset: 100%|█| 10415/10415 [00:00<00:00, 21648.30 e\n",
+      "  0%|                                                   | 0/270 [00:00<?, ?it/s]/home/haojin/.local/lib/python3.10/site-packages/torch/nn/modules/module.py:1501: UserWarning: operator() profile_node %611 : int = prim::profile_ivalue(%dtype)\n",
+      " does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)\n",
+      "  return forward_call(*args, **kwargs)\n",
+      "/home/haojin/.local/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
+      "  warnings.warn('Was asked to gather along dimension 0, but all '\n",
+      "{'loss': 3.1485, 'learning_rate': 9e-07, 'epoch': 1.0}                          \n",
+      "{'loss': 2.9515, 'learning_rate': 8e-07, 'epoch': 2.0}                          \n",
+      "{'loss': 2.9065, 'learning_rate': 7e-07, 'epoch': 3.0}                          \n",
+      "{'loss': 2.8756, 'learning_rate': 6e-07, 'epoch': 4.0}                          \n",
+      "{'loss': 2.8588, 'learning_rate': 5e-07, 'epoch': 5.0}                          \n",
+      "{'loss': 2.8529, 'learning_rate': 4e-07, 'epoch': 6.0}                          \n",
+      "{'loss': 2.8444, 'learning_rate': 3e-07, 'epoch': 7.0}                          \n",
+      "{'loss': 2.8402, 'learning_rate': 2e-07, 'epoch': 8.0}                          \n",
+      "{'loss': 2.8343, 'learning_rate': 1e-07, 'epoch': 9.0}                          \n",
+      "{'loss': 2.8366, 'learning_rate': 0.0, 'epoch': 10.0}                           \n",
+      "{'train_runtime': 2766.8228, 'train_samples_per_second': 37.642, 'train_steps_per_second': 0.098, 'train_loss': 2.8949241638183594, 'epoch': 10.0}\n",
+      "100%|█████████████████████████████████████████| 270/270 [46:06<00:00, 10.25s/it]\n",
+      "***** train metrics *****\n",
+      "  epoch                    =       10.0\n",
+      "  train_loss               =     2.8949\n",
+      "  train_runtime            = 0:46:06.82\n",
+      "  train_samples_per_second =     37.642\n",
+      "  train_steps_per_second   =      0.098\n",
+      "--\n",
+      "DONE. If it worked, trained data should be in ./workspace/output/laion-finetuned_v5e7_epoch10_fold0_threshold6\n",
+      "\n",
+      "Finetuning laion/CLIP-ViT-B-32-laion2B-s34B-b79K for fold 0, threshold > 0.96, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_fold0_threshold7.\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "1274e9aa864a4e679121fd49f1037bce",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "ac03311baddd45faaab57c0b4f25d895",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "138ef77ade764236aac6e1655ed59368",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating train split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "af16d962c41544a997bcd69192fa1558",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating validation split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "First image: ../data/a photo of standard living room/IMG-C5471443_15.jpg, caption: 'a photo of standard living room'\n",
+      "04/16/2024 21:58:05 - WARNING - __main__ - Process rank: 0, device: cuda:0, n_gpu: 3distributed training: True, 16-bits training: False\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/datasets/load.py:2072: FutureWarning: 'use_auth_token' was deprecated in favor of 'token' in version 2.14.0 and will be removed in 3.0.0.\n",
+      "You can remove this warning by passing 'token=None' instead.\n",
+      "  warnings.warn(\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/transformers/models/clip/feature_extraction_clip.py:28: FutureWarning: The class CLIPFeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please use CLIPImageProcessor instead.\n",
+      "  warnings.warn(\n",
+      "Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7fbd12afcdc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "04/16/2024 21:58:06 - WARNING - datasets.fingerprint - Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7fbd12afcdc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "Filter: 100%|█████████████████████| 8641/8641 [00:00<00:00, 28738.00 examples/s]\n",
+      "Running tokenizer on train dataset: 100%|█| 8534/8534 [00:00<00:00, 21546.58 exa\n",
+      "  0%|                                                   | 0/220 [00:00<?, ?it/s]/home/haojin/.local/lib/python3.10/site-packages/torch/nn/modules/module.py:1501: UserWarning: operator() profile_node %611 : int = prim::profile_ivalue(%dtype)\n",
+      " does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)\n",
+      "  return forward_call(*args, **kwargs)\n",
+      "/home/haojin/.local/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
+      "  warnings.warn('Was asked to gather along dimension 0, but all '\n",
+      "{'loss': 3.1592, 'learning_rate': 9e-07, 'epoch': 1.0}                          \n",
+      "{'loss': 2.9657, 'learning_rate': 8e-07, 'epoch': 2.0}                          \n",
+      "{'loss': 2.9132, 'learning_rate': 7e-07, 'epoch': 3.0}                          \n",
+      "{'loss': 2.8893, 'learning_rate': 6e-07, 'epoch': 4.0}                          \n",
+      "{'loss': 2.8663, 'learning_rate': 5e-07, 'epoch': 5.0}                          \n",
+      "{'loss': 2.8574, 'learning_rate': 4e-07, 'epoch': 6.0}                          \n",
+      "{'loss': 2.8519, 'learning_rate': 3e-07, 'epoch': 7.0}                          \n",
+      "{'loss': 2.8434, 'learning_rate': 2e-07, 'epoch': 8.0}                          \n",
+      "{'loss': 2.8463, 'learning_rate': 1e-07, 'epoch': 9.0}                          \n",
+      "{'loss': 2.8414, 'learning_rate': 0.0, 'epoch': 10.0}                           \n",
+      "{'train_runtime': 2255.4402, 'train_samples_per_second': 37.837, 'train_steps_per_second': 0.098, 'train_loss': 2.90342003215443, 'epoch': 10.0}\n",
+      "100%|█████████████████████████████████████████| 220/220 [37:35<00:00, 10.25s/it]\n",
+      "***** train metrics *****\n",
+      "  epoch                    =       10.0\n",
+      "  train_loss               =     2.9034\n",
+      "  train_runtime            = 0:37:35.44\n",
+      "  train_samples_per_second =     37.837\n",
+      "  train_steps_per_second   =      0.098\n",
+      "--\n",
+      "DONE. If it worked, trained data should be in ./workspace/output/laion-finetuned_v5e7_epoch10_fold0_threshold7\n",
+      "\n",
+      "Finetuning laion/CLIP-ViT-B-32-laion2B-s34B-b79K for fold 0, threshold > 0.97, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_fold0_threshold8.\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "0affb46f22bb4a4c82563fd4ed0e7bd1",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "3b4c0b0730b4476ebe0ed4fd16b07c55",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "7e784ee4a67b4563b147892d0d2e8dd2",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating train split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "bc45e2f9feea4ff39ba8890cd807793e",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating validation split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "First image: ../data/a photo of standard living room/IMG-C5471443_15.jpg, caption: 'a photo of standard living room'\n",
+      "04/16/2024 22:35:48 - WARNING - __main__ - Process rank: 0, device: cuda:0, n_gpu: 3distributed training: True, 16-bits training: False\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/datasets/load.py:2072: FutureWarning: 'use_auth_token' was deprecated in favor of 'token' in version 2.14.0 and will be removed in 3.0.0.\n",
+      "You can remove this warning by passing 'token=None' instead.\n",
+      "  warnings.warn(\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/transformers/models/clip/feature_extraction_clip.py:28: FutureWarning: The class CLIPFeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please use CLIPImageProcessor instead.\n",
+      "  warnings.warn(\n",
+      "Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7facd6864dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "04/16/2024 22:35:49 - WARNING - datasets.fingerprint - Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7facd6864dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "Filter: 100%|█████████████████████| 6532/6532 [00:00<00:00, 28798.38 examples/s]\n",
+      "Running tokenizer on train dataset: 100%|█| 6445/6445 [00:00<00:00, 21433.02 exa\n",
+      "  0%|                                                   | 0/160 [00:00<?, ?it/s]/home/haojin/.local/lib/python3.10/site-packages/torch/nn/modules/module.py:1501: UserWarning: operator() profile_node %611 : int = prim::profile_ivalue(%dtype)\n",
+      " does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)\n",
+      "  return forward_call(*args, **kwargs)\n",
+      "/home/haojin/.local/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
+      "  warnings.warn('Was asked to gather along dimension 0, but all '\n",
+      "{'loss': 3.1842, 'learning_rate': 9e-07, 'epoch': 1.0}                          \n",
+      "{'loss': 2.9829, 'learning_rate': 8e-07, 'epoch': 2.0}                          \n",
+      "{'loss': 2.9208, 'learning_rate': 7e-07, 'epoch': 3.0}                          \n",
+      "{'loss': 2.9018, 'learning_rate': 6e-07, 'epoch': 4.0}                          \n",
+      "{'loss': 2.8811, 'learning_rate': 5e-07, 'epoch': 5.0}                          \n",
+      "{'loss': 2.8685, 'learning_rate': 4e-07, 'epoch': 6.0}                          \n",
+      "{'loss': 2.8624, 'learning_rate': 3e-07, 'epoch': 7.0}                          \n",
+      "{'loss': 2.8543, 'learning_rate': 2e-07, 'epoch': 8.0}                          \n",
+      "{'loss': 2.8572, 'learning_rate': 1e-07, 'epoch': 9.0}                          \n",
+      "{'loss': 2.8502, 'learning_rate': 0.0, 'epoch': 10.0}                           \n",
+      "{'train_runtime': 1648.0811, 'train_samples_per_second': 39.106, 'train_steps_per_second': 0.097, 'train_loss': 2.9163528203964235, 'epoch': 10.0}\n",
+      "100%|█████████████████████████████████████████| 160/160 [27:28<00:00, 10.30s/it]\n",
+      "***** train metrics *****\n",
+      "  epoch                    =       10.0\n",
+      "  train_loss               =     2.9164\n",
+      "  train_runtime            = 0:27:28.08\n",
+      "  train_samples_per_second =     39.106\n",
+      "  train_steps_per_second   =      0.097\n",
+      "--\n",
+      "DONE. If it worked, trained data should be in ./workspace/output/laion-finetuned_v5e7_epoch10_fold0_threshold8\n",
+      "\n",
+      "Finetuning laion/CLIP-ViT-B-32-laion2B-s34B-b79K for fold 0, threshold > 0.98, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_fold0_threshold9.\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "e23065ebd3f7413798cb3acc2afad5d2",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "9dfdf7f525924ccfb17bc61683841045",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "27472d826aa341e0a111401c4e296b66",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating train split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "f56852bb98dc4a048d4ae62ace6b9585",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating validation split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "First image: ../data/a photo of standard living room/IMG-C5471443_15.jpg, caption: 'a photo of standard living room'\n",
+      "04/16/2024 23:03:23 - WARNING - __main__ - Process rank: 0, device: cuda:0, n_gpu: 3distributed training: True, 16-bits training: False\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/datasets/load.py:2072: FutureWarning: 'use_auth_token' was deprecated in favor of 'token' in version 2.14.0 and will be removed in 3.0.0.\n",
+      "You can remove this warning by passing 'token=None' instead.\n",
+      "  warnings.warn(\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/transformers/models/clip/feature_extraction_clip.py:28: FutureWarning: The class CLIPFeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please use CLIPImageProcessor instead.\n",
+      "  warnings.warn(\n",
+      "Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7f85485ccdc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "04/16/2024 23:03:24 - WARNING - datasets.fingerprint - Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7f85485ccdc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "Filter: 100%|█████████████████████| 4144/4144 [00:00<00:00, 27507.68 examples/s]\n",
+      "Running tokenizer on train dataset: 100%|█| 4101/4101 [00:00<00:00, 21094.17 exa\n",
+      "  0%|                                                   | 0/100 [00:00<?, ?it/s]/home/haojin/.local/lib/python3.10/site-packages/torch/nn/modules/module.py:1501: UserWarning: operator() profile_node %611 : int = prim::profile_ivalue(%dtype)\n",
+      " does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)\n",
+      "  return forward_call(*args, **kwargs)\n",
+      "/home/haojin/.local/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
+      "  warnings.warn('Was asked to gather along dimension 0, but all '\n",
+      "{'loss': 3.2523, 'learning_rate': 9e-07, 'epoch': 1.0}                          \n",
+      "{'loss': 3.037, 'learning_rate': 8e-07, 'epoch': 2.0}                           \n",
+      "{'loss': 2.971, 'learning_rate': 7e-07, 'epoch': 3.0}                           \n",
+      "{'loss': 2.9345, 'learning_rate': 6e-07, 'epoch': 4.0}                          \n",
+      "{'loss': 2.9073, 'learning_rate': 5e-07, 'epoch': 5.0}                          \n",
+      "{'loss': 2.8994, 'learning_rate': 4e-07, 'epoch': 6.0}                          \n",
+      "{'loss': 2.8832, 'learning_rate': 3e-07, 'epoch': 7.0}                          \n",
+      "{'loss': 2.8784, 'learning_rate': 2e-07, 'epoch': 8.0}                          \n",
+      "{'loss': 2.8804, 'learning_rate': 1e-07, 'epoch': 9.0}                          \n",
+      "{'loss': 2.8766, 'learning_rate': 0.0, 'epoch': 10.0}                           \n",
+      "{'train_runtime': 1043.7317, 'train_samples_per_second': 39.292, 'train_steps_per_second': 0.096, 'train_loss': 2.952003574371338, 'epoch': 10.0}\n",
+      "100%|█████████████████████████████████████████| 100/100 [17:23<00:00, 10.44s/it]\n",
+      "***** train metrics *****\n",
+      "  epoch                    =       10.0\n",
+      "  train_loss               =      2.952\n",
+      "  train_runtime            = 0:17:23.73\n",
+      "  train_samples_per_second =     39.292\n",
+      "  train_steps_per_second   =      0.096\n",
+      "--\n",
+      "DONE. If it worked, trained data should be in ./workspace/output/laion-finetuned_v5e7_epoch10_fold0_threshold9\n",
+      "\n",
+      "Finetuning laion/CLIP-ViT-B-32-laion2B-s34B-b79K for fold 0, threshold > 0.99, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_fold0_threshold10.\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "3bc856c2880f42b18306c6b675e2dfed",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "435c06f431b24006b4c07eed6875b0a9",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "9fa303b81afd4a439bae48276d78c803",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating train split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "bdc723d63f8e43bf935b9cd175182cbf",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating validation split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "First image: ../data/a photo of standard living room/IMG-C5471443_15.jpg, caption: 'a photo of standard living room'\n",
+      "04/16/2024 23:20:54 - WARNING - __main__ - Process rank: 0, device: cuda:0, n_gpu: 3distributed training: True, 16-bits training: False\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/datasets/load.py:2072: FutureWarning: 'use_auth_token' was deprecated in favor of 'token' in version 2.14.0 and will be removed in 3.0.0.\n",
+      "You can remove this warning by passing 'token=None' instead.\n",
+      "  warnings.warn(\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/transformers/models/clip/feature_extraction_clip.py:28: FutureWarning: The class CLIPFeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please use CLIPImageProcessor instead.\n",
+      "  warnings.warn(\n",
+      "Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7fe4dc188dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "04/16/2024 23:20:55 - WARNING - datasets.fingerprint - Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7fe4dc188dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "Filter: 100%|█████████████████████| 1382/1382 [00:00<00:00, 24913.73 examples/s]\n",
+      "Running tokenizer on train dataset: 100%|█| 1366/1366 [00:00<00:00, 19082.86 exa\n",
+      "  0%|                                                    | 0/30 [00:00<?, ?it/s]/home/haojin/.local/lib/python3.10/site-packages/torch/nn/modules/module.py:1501: UserWarning: operator() profile_node %611 : int = prim::profile_ivalue(%dtype)\n",
+      " does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)\n",
+      "  return forward_call(*args, **kwargs)\n",
+      "/home/haojin/.local/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
+      "  warnings.warn('Was asked to gather along dimension 0, but all '\n",
+      "{'loss': 3.3162, 'learning_rate': 9e-07, 'epoch': 1.0}                          \n",
+      "{'loss': 3.1668, 'learning_rate': 8e-07, 'epoch': 2.0}                          \n",
+      "{'loss': 3.0819, 'learning_rate': 7e-07, 'epoch': 3.0}                          \n",
+      "{'loss': 3.0353, 'learning_rate': 6e-07, 'epoch': 4.0}                          \n",
+      "{'loss': 2.9945, 'learning_rate': 5e-07, 'epoch': 5.0}                          \n",
+      "{'loss': 2.9998, 'learning_rate': 4e-07, 'epoch': 6.0}                          \n",
+      "{'loss': 2.9528, 'learning_rate': 3e-07, 'epoch': 7.0}                          \n",
+      "{'loss': 2.9482, 'learning_rate': 2e-07, 'epoch': 8.0}                          \n",
+      "{'loss': 2.9491, 'learning_rate': 1e-07, 'epoch': 9.0}                          \n",
+      "{'loss': 2.9433, 'learning_rate': 0.0, 'epoch': 10.0}                           \n",
+      "{'train_runtime': 315.7361, 'train_samples_per_second': 43.264, 'train_steps_per_second': 0.095, 'train_loss': 3.0387779235839845, 'epoch': 10.0}\n",
+      "100%|███████████████████████████████████████████| 30/30 [05:15<00:00, 10.52s/it]\n",
+      "***** train metrics *****\n",
+      "  epoch                    =       10.0\n",
+      "  train_loss               =     3.0388\n",
+      "  train_runtime            = 0:05:15.73\n",
+      "  train_samples_per_second =     43.264\n",
+      "  train_steps_per_second   =      0.095\n",
+      "--\n",
+      "DONE. If it worked, trained data should be in ./workspace/output/laion-finetuned_v5e7_epoch10_fold0_threshold10\n",
+      "\n",
+      "Finetuning laion/CLIP-ViT-B-32-laion2B-s34B-b79K for fold 1, threshold > 0.90, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_fold1_threshold1.\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "8c21d6aaa83b4d5c81f627e540fd2c6e",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "9f31a56e5f63403dbc5ea5b0c4b604ea",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "e396dac2fd794257924e8b7b18ed2c95",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating train split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "1c4d76b19ac44e779e667b5da1409a40",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating validation split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "First image: ../data/a photo of contemporary bathroom/IMG-C5471425_6.jpg, caption: 'a photo of contemporary bathroom'\n",
+      "04/16/2024 23:26:16 - WARNING - __main__ - Process rank: 0, device: cuda:0, n_gpu: 3distributed training: True, 16-bits training: False\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/datasets/load.py:2072: FutureWarning: 'use_auth_token' was deprecated in favor of 'token' in version 2.14.0 and will be removed in 3.0.0.\n",
+      "You can remove this warning by passing 'token=None' instead.\n",
+      "  warnings.warn(\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/transformers/models/clip/feature_extraction_clip.py:28: FutureWarning: The class CLIPFeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please use CLIPImageProcessor instead.\n",
+      "  warnings.warn(\n",
+      "Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7f4140184dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "04/16/2024 23:26:17 - WARNING - datasets.fingerprint - Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7f4140184dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "Filter: 100%|███████████████████| 18410/18410 [00:00<00:00, 29787.21 examples/s]\n",
+      "Running tokenizer on train dataset: 100%|█| 18195/18195 [00:00<00:00, 21378.62 e\n",
+      "  0%|                                                   | 0/470 [00:00<?, ?it/s]/home/haojin/.local/lib/python3.10/site-packages/torch/nn/modules/module.py:1501: UserWarning: operator() profile_node %611 : int = prim::profile_ivalue(%dtype)\n",
+      " does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)\n",
+      "  return forward_call(*args, **kwargs)\n",
+      "/home/haojin/.local/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
+      "  warnings.warn('Was asked to gather along dimension 0, but all '\n",
+      "{'loss': 3.1111, 'learning_rate': 9e-07, 'epoch': 1.0}                          \n",
+      "{'loss': 2.9297, 'learning_rate': 8e-07, 'epoch': 2.0}                          \n",
+      "{'loss': 2.8829, 'learning_rate': 7e-07, 'epoch': 3.0}                          \n",
+      "{'loss': 2.8577, 'learning_rate': 6e-07, 'epoch': 4.0}                          \n",
+      "{'loss': 2.842, 'learning_rate': 5e-07, 'epoch': 5.0}                           \n",
+      "{'loss': 2.8301, 'learning_rate': 4e-07, 'epoch': 6.0}                          \n",
+      "{'loss': 2.8235, 'learning_rate': 3e-07, 'epoch': 7.0}                          \n",
+      "{'loss': 2.8178, 'learning_rate': 2e-07, 'epoch': 8.0}                          \n",
+      "{'loss': 2.8139, 'learning_rate': 1e-07, 'epoch': 9.0}                          \n",
+      "{'loss': 2.8137, 'learning_rate': 0.0, 'epoch': 10.0}                           \n",
+      "{'train_runtime': 4842.9455, 'train_samples_per_second': 37.57, 'train_steps_per_second': 0.097, 'train_loss': 2.8722301401990524, 'epoch': 10.0}\n",
+      "100%|███████████████████████████████████████| 470/470 [1:20:42<00:00, 10.30s/it]\n",
+      "***** train metrics *****\n",
+      "  epoch                    =       10.0\n",
+      "  train_loss               =     2.8722\n",
+      "  train_runtime            = 1:20:42.94\n",
+      "  train_samples_per_second =      37.57\n",
+      "  train_steps_per_second   =      0.097\n",
+      "--\n",
+      "DONE. If it worked, trained data should be in ./workspace/output/laion-finetuned_v5e7_epoch10_fold1_threshold1\n",
+      "\n",
+      "Finetuning laion/CLIP-ViT-B-32-laion2B-s34B-b79K for fold 1, threshold > 0.91, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_fold1_threshold2.\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "6f222218592945ec81ec10a459b8911f",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "685f937196da4f44bcac474b5feed66f",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "dcf43ed2f874415697be076c9ce6e672",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating train split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "863cafd9a11e4b9aba23f75324c8a71e",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating validation split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "First image: ../data/a photo of contemporary bathroom/IMG-C5471425_6.jpg, caption: 'a photo of contemporary bathroom'\n",
+      "04/17/2024 00:47:07 - WARNING - __main__ - Process rank: 0, device: cuda:0, n_gpu: 3distributed training: True, 16-bits training: False\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/datasets/load.py:2072: FutureWarning: 'use_auth_token' was deprecated in favor of 'token' in version 2.14.0 and will be removed in 3.0.0.\n",
+      "You can remove this warning by passing 'token=None' instead.\n",
+      "  warnings.warn(\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/transformers/models/clip/feature_extraction_clip.py:28: FutureWarning: The class CLIPFeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please use CLIPImageProcessor instead.\n",
+      "  warnings.warn(\n",
+      "Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7fd981c54dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "04/17/2024 00:47:08 - WARNING - datasets.fingerprint - Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7fd981c54dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "Filter: 100%|███████████████████| 16909/16909 [00:00<00:00, 29673.49 examples/s]\n",
+      "Running tokenizer on train dataset: 100%|█| 16715/16715 [00:00<00:00, 21582.93 e\n",
+      "  0%|                                                   | 0/430 [00:00<?, ?it/s]/home/haojin/.local/lib/python3.10/site-packages/torch/nn/modules/module.py:1501: UserWarning: operator() profile_node %611 : int = prim::profile_ivalue(%dtype)\n",
+      " does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)\n",
+      "  return forward_call(*args, **kwargs)\n",
+      "/home/haojin/.local/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
+      "  warnings.warn('Was asked to gather along dimension 0, but all '\n",
+      "{'loss': 3.1235, 'learning_rate': 9e-07, 'epoch': 1.0}                          \n",
+      "{'loss': 2.9345, 'learning_rate': 8e-07, 'epoch': 2.0}                          \n",
+      "{'loss': 2.8845, 'learning_rate': 7e-07, 'epoch': 3.0}                          \n",
+      "{'loss': 2.86, 'learning_rate': 6e-07, 'epoch': 4.0}                            \n",
+      "{'loss': 2.843, 'learning_rate': 5e-07, 'epoch': 5.0}                           \n",
+      "{'loss': 2.8351, 'learning_rate': 4e-07, 'epoch': 6.0}                          \n",
+      "{'loss': 2.8282, 'learning_rate': 3e-07, 'epoch': 7.0}                          \n",
+      "{'loss': 2.825, 'learning_rate': 2e-07, 'epoch': 8.0}                           \n",
+      "{'loss': 2.8169, 'learning_rate': 1e-07, 'epoch': 9.0}                          \n",
+      "{'loss': 2.8181, 'learning_rate': 0.0, 'epoch': 10.0}                           \n",
+      "{'train_runtime': 4414.8258, 'train_samples_per_second': 37.861, 'train_steps_per_second': 0.097, 'train_loss': 2.8768707275390626, 'epoch': 10.0}\n",
+      "100%|███████████████████████████████████████| 430/430 [1:13:34<00:00, 10.27s/it]\n",
+      "***** train metrics *****\n",
+      "  epoch                    =       10.0\n",
+      "  train_loss               =     2.8769\n",
+      "  train_runtime            = 1:13:34.82\n",
+      "  train_samples_per_second =     37.861\n",
+      "  train_steps_per_second   =      0.097\n",
+      "--\n",
+      "DONE. If it worked, trained data should be in ./workspace/output/laion-finetuned_v5e7_epoch10_fold1_threshold2\n",
+      "\n",
+      "Finetuning laion/CLIP-ViT-B-32-laion2B-s34B-b79K for fold 1, threshold > 0.92, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_fold1_threshold3.\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "17eb26e0a1a44ecb9b3baed49b93d356",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "cb69d33d65f9404495afe5ab968990e4",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "57229a01c4f24a36a40d2ad6d14c82c9",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating train split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "f2bc11ffd779455fac0e867590c32bfb",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating validation split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "First image: ../data/a photo of contemporary bathroom/IMG-C5471425_6.jpg, caption: 'a photo of contemporary bathroom'\n",
+      "04/17/2024 02:00:49 - WARNING - __main__ - Process rank: 0, device: cuda:0, n_gpu: 3distributed training: True, 16-bits training: False\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/datasets/load.py:2072: FutureWarning: 'use_auth_token' was deprecated in favor of 'token' in version 2.14.0 and will be removed in 3.0.0.\n",
+      "You can remove this warning by passing 'token=None' instead.\n",
+      "  warnings.warn(\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/transformers/models/clip/feature_extraction_clip.py:28: FutureWarning: The class CLIPFeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please use CLIPImageProcessor instead.\n",
+      "  warnings.warn(\n",
+      "Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7fb794454dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "04/17/2024 02:00:51 - WARNING - datasets.fingerprint - Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7fb794454dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "Filter: 100%|███████████████████| 15411/15411 [00:00<00:00, 29610.96 examples/s]\n",
+      "Running tokenizer on train dataset: 100%|█| 15237/15237 [00:00<00:00, 21476.83 e\n",
+      "  0%|                                                   | 0/390 [00:00<?, ?it/s]/home/haojin/.local/lib/python3.10/site-packages/torch/nn/modules/module.py:1501: UserWarning: operator() profile_node %611 : int = prim::profile_ivalue(%dtype)\n",
+      " does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)\n",
+      "  return forward_call(*args, **kwargs)\n",
+      "/home/haojin/.local/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
+      "  warnings.warn('Was asked to gather along dimension 0, but all '\n",
+      "{'loss': 3.1232, 'learning_rate': 9e-07, 'epoch': 1.0}                          \n",
+      "{'loss': 2.9345, 'learning_rate': 8e-07, 'epoch': 2.0}                          \n",
+      "{'loss': 2.8848, 'learning_rate': 7e-07, 'epoch': 3.0}                          \n",
+      "{'loss': 2.8607, 'learning_rate': 6e-07, 'epoch': 4.0}                          \n",
+      "{'loss': 2.8491, 'learning_rate': 5e-07, 'epoch': 5.0}                          \n",
+      "{'loss': 2.8364, 'learning_rate': 4e-07, 'epoch': 6.0}                          \n",
+      "{'loss': 2.828, 'learning_rate': 3e-07, 'epoch': 7.0}                           \n",
+      "{'loss': 2.8264, 'learning_rate': 2e-07, 'epoch': 8.0}                          \n",
+      "{'loss': 2.819, 'learning_rate': 1e-07, 'epoch': 9.0}                           \n",
+      "{'loss': 2.8205, 'learning_rate': 0.0, 'epoch': 10.0}                           \n",
+      "{'train_runtime': 4010.1813, 'train_samples_per_second': 37.996, 'train_steps_per_second': 0.097, 'train_loss': 2.878255149645683, 'epoch': 10.0}\n",
+      "100%|███████████████████████████████████████| 390/390 [1:06:50<00:00, 10.28s/it]\n",
+      "***** train metrics *****\n",
+      "  epoch                    =       10.0\n",
+      "  train_loss               =     2.8783\n",
+      "  train_runtime            = 1:06:50.18\n",
+      "  train_samples_per_second =     37.996\n",
+      "  train_steps_per_second   =      0.097\n",
+      "--\n",
+      "DONE. If it worked, trained data should be in ./workspace/output/laion-finetuned_v5e7_epoch10_fold1_threshold3\n",
+      "\n",
+      "Finetuning laion/CLIP-ViT-B-32-laion2B-s34B-b79K for fold 1, threshold > 0.93, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_fold1_threshold4.\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "12eef0873845493f8e1df68e0fc76a78",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "11c1d8bc23c54fa8b7a54a9ce130ae0b",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "3eb793a2a38c4c3f85b2d6e3a238620c",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating train split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "8c40a30646744ccaae67553e77d237fd",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating validation split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "First image: ../data/a photo of contemporary bathroom/IMG-C5471425_6.jpg, caption: 'a photo of contemporary bathroom'\n",
+      "04/17/2024 03:07:47 - WARNING - __main__ - Process rank: 0, device: cuda:0, n_gpu: 3distributed training: True, 16-bits training: False\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/datasets/load.py:2072: FutureWarning: 'use_auth_token' was deprecated in favor of 'token' in version 2.14.0 and will be removed in 3.0.0.\n",
+      "You can remove this warning by passing 'token=None' instead.\n",
+      "  warnings.warn(\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/transformers/models/clip/feature_extraction_clip.py:28: FutureWarning: The class CLIPFeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please use CLIPImageProcessor instead.\n",
+      "  warnings.warn(\n",
+      "Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7f75445e8dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "04/17/2024 03:07:48 - WARNING - datasets.fingerprint - Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7f75445e8dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "Filter: 100%|███████████████████| 13823/13823 [00:00<00:00, 29426.80 examples/s]\n",
+      "Running tokenizer on train dataset: 100%|█| 13667/13667 [00:00<00:00, 21484.43 e\n",
+      "  0%|                                                   | 0/350 [00:00<?, ?it/s]/home/haojin/.local/lib/python3.10/site-packages/torch/nn/modules/module.py:1501: UserWarning: operator() profile_node %611 : int = prim::profile_ivalue(%dtype)\n",
+      " does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)\n",
+      "  return forward_call(*args, **kwargs)\n",
+      "/home/haojin/.local/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
+      "  warnings.warn('Was asked to gather along dimension 0, but all '\n",
+      "{'loss': 3.1317, 'learning_rate': 9e-07, 'epoch': 1.0}                          \n",
+      "{'loss': 2.9362, 'learning_rate': 8e-07, 'epoch': 2.0}                          \n",
+      "{'loss': 2.8892, 'learning_rate': 7e-07, 'epoch': 3.0}                          \n",
+      "{'loss': 2.8612, 'learning_rate': 6e-07, 'epoch': 4.0}                          \n",
+      "{'loss': 2.8485, 'learning_rate': 5e-07, 'epoch': 5.0}                          \n",
+      "{'loss': 2.8351, 'learning_rate': 4e-07, 'epoch': 6.0}                          \n",
+      "{'loss': 2.8294, 'learning_rate': 3e-07, 'epoch': 7.0}                          \n",
+      "{'loss': 2.8283, 'learning_rate': 2e-07, 'epoch': 8.0}                          \n",
+      "{'loss': 2.8202, 'learning_rate': 1e-07, 'epoch': 9.0}                          \n",
+      "{'loss': 2.8217, 'learning_rate': 0.0, 'epoch': 10.0}                           \n",
+      "{'train_runtime': 3609.4952, 'train_samples_per_second': 37.864, 'train_steps_per_second': 0.097, 'train_loss': 2.88013669695173, 'epoch': 10.0}\n",
+      "100%|███████████████████████████████████████| 350/350 [1:00:09<00:00, 10.31s/it]\n",
+      "***** train metrics *****\n",
+      "  epoch                    =       10.0\n",
+      "  train_loss               =     2.8801\n",
+      "  train_runtime            = 1:00:09.49\n",
+      "  train_samples_per_second =     37.864\n",
+      "  train_steps_per_second   =      0.097\n",
+      "--\n",
+      "DONE. If it worked, trained data should be in ./workspace/output/laion-finetuned_v5e7_epoch10_fold1_threshold4\n",
+      "\n",
+      "Finetuning laion/CLIP-ViT-B-32-laion2B-s34B-b79K for fold 1, threshold > 0.94, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_fold1_threshold5.\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "e6af79e64d0c4ff093bbbcabc25c028a",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "ef3c79bd02294f2983015b3c266ff236",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "ebc876e5572246e497bf232b873477f1",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating train split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "5c37becec61e429b937db9602c3b96d7",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating validation split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "First image: ../data/a photo of contemporary bathroom/IMG-C5471425_6.jpg, caption: 'a photo of contemporary bathroom'\n",
+      "04/17/2024 04:08:04 - WARNING - __main__ - Process rank: 0, device: cuda:0, n_gpu: 3distributed training: True, 16-bits training: False\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/datasets/load.py:2072: FutureWarning: 'use_auth_token' was deprecated in favor of 'token' in version 2.14.0 and will be removed in 3.0.0.\n",
+      "You can remove this warning by passing 'token=None' instead.\n",
+      "  warnings.warn(\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/transformers/models/clip/feature_extraction_clip.py:28: FutureWarning: The class CLIPFeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please use CLIPImageProcessor instead.\n",
+      "  warnings.warn(\n",
+      "Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7ff0c1b6cdc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "04/17/2024 04:08:05 - WARNING - datasets.fingerprint - Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7ff0c1b6cdc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "Filter: 100%|███████████████████| 12236/12236 [00:00<00:00, 29435.37 examples/s]\n",
+      "Running tokenizer on train dataset: 100%|█| 12097/12097 [00:00<00:00, 21528.46 e\n",
+      "  0%|                                                   | 0/310 [00:00<?, ?it/s]/home/haojin/.local/lib/python3.10/site-packages/torch/nn/modules/module.py:1501: UserWarning: operator() profile_node %611 : int = prim::profile_ivalue(%dtype)\n",
+      " does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)\n",
+      "  return forward_call(*args, **kwargs)\n",
+      "/home/haojin/.local/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
+      "  warnings.warn('Was asked to gather along dimension 0, but all '\n",
+      "{'loss': 3.1422, 'learning_rate': 9e-07, 'epoch': 1.0}                          \n",
+      "{'loss': 2.9471, 'learning_rate': 8e-07, 'epoch': 2.0}                          \n",
+      "{'loss': 2.8954, 'learning_rate': 7e-07, 'epoch': 3.0}                          \n",
+      "{'loss': 2.8699, 'learning_rate': 6e-07, 'epoch': 4.0}                          \n",
+      "{'loss': 2.8552, 'learning_rate': 5e-07, 'epoch': 5.0}                          \n",
+      "{'loss': 2.8445, 'learning_rate': 4e-07, 'epoch': 6.0}                          \n",
+      "{'loss': 2.839, 'learning_rate': 3e-07, 'epoch': 7.0}                           \n",
+      "{'loss': 2.8325, 'learning_rate': 2e-07, 'epoch': 8.0}                          \n",
+      "{'loss': 2.8279, 'learning_rate': 1e-07, 'epoch': 9.0}                          \n",
+      "{'loss': 2.8249, 'learning_rate': 0.0, 'epoch': 10.0}                           \n",
+      "{'train_runtime': 3193.8142, 'train_samples_per_second': 37.876, 'train_steps_per_second': 0.097, 'train_loss': 2.8878452670189643, 'epoch': 10.0}\n",
+      "100%|█████████████████████████████████████████| 310/310 [53:13<00:00, 10.30s/it]\n",
+      "***** train metrics *****\n",
+      "  epoch                    =       10.0\n",
+      "  train_loss               =     2.8878\n",
+      "  train_runtime            = 0:53:13.81\n",
+      "  train_samples_per_second =     37.876\n",
+      "  train_steps_per_second   =      0.097\n",
+      "--\n",
+      "DONE. If it worked, trained data should be in ./workspace/output/laion-finetuned_v5e7_epoch10_fold1_threshold5\n",
+      "\n",
+      "Finetuning laion/CLIP-ViT-B-32-laion2B-s34B-b79K for fold 1, threshold > 0.95, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_fold1_threshold6.\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "b4a604a4942741968bd5eba3ba4491a6",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "1b69ddbb9df74f3096fcc3ac433ad576",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "fd261cce71324b0c9ff91466f7d62737",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating train split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "0e40a4fd616445ecb74ee0619359a489",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating validation split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "First image: ../data/a photo of contemporary bathroom/IMG-C5471425_6.jpg, caption: 'a photo of contemporary bathroom'\n",
+      "04/17/2024 05:01:25 - WARNING - __main__ - Process rank: 0, device: cuda:0, n_gpu: 3distributed training: True, 16-bits training: False\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/datasets/load.py:2072: FutureWarning: 'use_auth_token' was deprecated in favor of 'token' in version 2.14.0 and will be removed in 3.0.0.\n",
+      "You can remove this warning by passing 'token=None' instead.\n",
+      "  warnings.warn(\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/transformers/models/clip/feature_extraction_clip.py:28: FutureWarning: The class CLIPFeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please use CLIPImageProcessor instead.\n",
+      "  warnings.warn(\n",
+      "Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7fab9bd64dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "04/17/2024 05:01:26 - WARNING - datasets.fingerprint - Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7fab9bd64dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "Filter: 100%|███████████████████| 10506/10506 [00:00<00:00, 29337.22 examples/s]\n",
+      "Running tokenizer on train dataset: 100%|█| 10385/10385 [00:00<00:00, 21456.00 e\n",
+      "  0%|                                                   | 0/270 [00:00<?, ?it/s]/home/haojin/.local/lib/python3.10/site-packages/torch/nn/modules/module.py:1501: UserWarning: operator() profile_node %611 : int = prim::profile_ivalue(%dtype)\n",
+      " does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)\n",
+      "  return forward_call(*args, **kwargs)\n",
+      "/home/haojin/.local/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
+      "  warnings.warn('Was asked to gather along dimension 0, but all '\n",
+      "{'loss': 3.144, 'learning_rate': 9e-07, 'epoch': 1.0}                           \n",
+      "{'loss': 2.9529, 'learning_rate': 8e-07, 'epoch': 2.0}                          \n",
+      "{'loss': 2.9006, 'learning_rate': 7e-07, 'epoch': 3.0}                          \n",
+      "{'loss': 2.8747, 'learning_rate': 6e-07, 'epoch': 4.0}                          \n",
+      "{'loss': 2.8614, 'learning_rate': 5e-07, 'epoch': 5.0}                          \n",
+      "{'loss': 2.845, 'learning_rate': 4e-07, 'epoch': 6.0}                           \n",
+      "{'loss': 2.8385, 'learning_rate': 3e-07, 'epoch': 7.0}                          \n",
+      "{'loss': 2.835, 'learning_rate': 2e-07, 'epoch': 8.0}                           \n",
+      "{'loss': 2.833, 'learning_rate': 1e-07, 'epoch': 9.0}                           \n",
+      "{'loss': 2.8311, 'learning_rate': 0.0, 'epoch': 10.0}                           \n",
+      "{'train_runtime': 2782.2498, 'train_samples_per_second': 37.326, 'train_steps_per_second': 0.097, 'train_loss': 2.891623122603805, 'epoch': 10.0}\n",
+      "100%|█████████████████████████████████████████| 270/270 [46:22<00:00, 10.30s/it]\n",
+      "***** train metrics *****\n",
+      "  epoch                    =       10.0\n",
+      "  train_loss               =     2.8916\n",
+      "  train_runtime            = 0:46:22.24\n",
+      "  train_samples_per_second =     37.326\n",
+      "  train_steps_per_second   =      0.097\n",
+      "--\n",
+      "DONE. If it worked, trained data should be in ./workspace/output/laion-finetuned_v5e7_epoch10_fold1_threshold6\n",
+      "\n",
+      "Finetuning laion/CLIP-ViT-B-32-laion2B-s34B-b79K for fold 1, threshold > 0.96, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_fold1_threshold7.\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "0eae5263eb204eb48b0680faf043a3ad",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "da1011e0d0584c7ea9a7bfb6b42e9137",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "7726bd6f5b654e8499351918cf0e6b37",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating train split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "970d51cdc4044d8d96c1fb11424e88c9",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating validation split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "First image: ../data/a photo of contemporary bathroom/IMG-C5471425_6.jpg, caption: 'a photo of contemporary bathroom'\n",
+      "04/17/2024 05:47:55 - WARNING - __main__ - Process rank: 0, device: cuda:0, n_gpu: 3distributed training: True, 16-bits training: False\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/datasets/load.py:2072: FutureWarning: 'use_auth_token' was deprecated in favor of 'token' in version 2.14.0 and will be removed in 3.0.0.\n",
+      "You can remove this warning by passing 'token=None' instead.\n",
+      "  warnings.warn(\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/transformers/models/clip/feature_extraction_clip.py:28: FutureWarning: The class CLIPFeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please use CLIPImageProcessor instead.\n",
+      "  warnings.warn(\n",
+      "Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7f6b9a964dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "04/17/2024 05:47:56 - WARNING - datasets.fingerprint - Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7f6b9a964dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "Filter: 100%|█████████████████████| 8620/8620 [00:00<00:00, 29092.75 examples/s]\n",
+      "Running tokenizer on train dataset: 100%|█| 8522/8522 [00:00<00:00, 21526.17 exa\n",
+      "  0%|                                                   | 0/220 [00:00<?, ?it/s]/home/haojin/.local/lib/python3.10/site-packages/torch/nn/modules/module.py:1501: UserWarning: operator() profile_node %611 : int = prim::profile_ivalue(%dtype)\n",
+      " does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)\n",
+      "  return forward_call(*args, **kwargs)\n",
+      "/home/haojin/.local/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
+      "  warnings.warn('Was asked to gather along dimension 0, but all '\n",
+      "{'loss': 3.1682, 'learning_rate': 9e-07, 'epoch': 1.0}                          \n",
+      "{'loss': 2.9581, 'learning_rate': 8e-07, 'epoch': 2.0}                          \n",
+      "{'loss': 2.9064, 'learning_rate': 7e-07, 'epoch': 3.0}                          \n",
+      "{'loss': 2.88, 'learning_rate': 6e-07, 'epoch': 4.0}                            \n",
+      "{'loss': 2.8666, 'learning_rate': 5e-07, 'epoch': 5.0}                          \n",
+      "{'loss': 2.8547, 'learning_rate': 4e-07, 'epoch': 6.0}                          \n",
+      "{'loss': 2.8433, 'learning_rate': 3e-07, 'epoch': 7.0}                          \n",
+      "{'loss': 2.8399, 'learning_rate': 2e-07, 'epoch': 8.0}                          \n",
+      "{'loss': 2.8369, 'learning_rate': 1e-07, 'epoch': 9.0}                          \n",
+      "{'loss': 2.8357, 'learning_rate': 0.0, 'epoch': 10.0}                           \n",
+      "{'train_runtime': 2271.0176, 'train_samples_per_second': 37.525, 'train_steps_per_second': 0.097, 'train_loss': 2.8989675868641247, 'epoch': 10.0}\n",
+      "100%|█████████████████████████████████████████| 220/220 [37:51<00:00, 10.32s/it]\n",
+      "***** train metrics *****\n",
+      "  epoch                    =       10.0\n",
+      "  train_loss               =      2.899\n",
+      "  train_runtime            = 0:37:51.01\n",
+      "  train_samples_per_second =     37.525\n",
+      "  train_steps_per_second   =      0.097\n",
+      "--\n",
+      "DONE. If it worked, trained data should be in ./workspace/output/laion-finetuned_v5e7_epoch10_fold1_threshold7\n",
+      "\n",
+      "Finetuning laion/CLIP-ViT-B-32-laion2B-s34B-b79K for fold 1, threshold > 0.97, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_fold1_threshold8.\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "7b341f15b4ba4bd693267d935409342c",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "dcac9c18c5aa45c4b41a115422d61d1f",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "73fc03c744f74aa0b920cf319b20a306",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating train split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "eef74a82ab0b4fca8115a04aef8f98ee",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating validation split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "First image: ../data/a photo of contemporary bathroom/IMG-C5471425_6.jpg, caption: 'a photo of contemporary bathroom'\n",
+      "04/17/2024 06:25:53 - WARNING - __main__ - Process rank: 0, device: cuda:0, n_gpu: 3distributed training: True, 16-bits training: False\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/datasets/load.py:2072: FutureWarning: 'use_auth_token' was deprecated in favor of 'token' in version 2.14.0 and will be removed in 3.0.0.\n",
+      "You can remove this warning by passing 'token=None' instead.\n",
+      "  warnings.warn(\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/transformers/models/clip/feature_extraction_clip.py:28: FutureWarning: The class CLIPFeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please use CLIPImageProcessor instead.\n",
+      "  warnings.warn(\n",
+      "Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7fa42c540dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "04/17/2024 06:25:54 - WARNING - datasets.fingerprint - Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7fa42c540dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "Filter: 100%|█████████████████████| 6483/6483 [00:00<00:00, 28558.51 examples/s]\n",
+      "Running tokenizer on train dataset: 100%|█| 6407/6407 [00:00<00:00, 21230.77 exa\n",
+      "  0%|                                                   | 0/160 [00:00<?, ?it/s]/home/haojin/.local/lib/python3.10/site-packages/torch/nn/modules/module.py:1501: UserWarning: operator() profile_node %611 : int = prim::profile_ivalue(%dtype)\n",
+      " does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)\n",
+      "  return forward_call(*args, **kwargs)\n",
+      "/home/haojin/.local/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
+      "  warnings.warn('Was asked to gather along dimension 0, but all '\n",
+      "{'loss': 3.1928, 'learning_rate': 9e-07, 'epoch': 1.0}                          \n",
+      "{'loss': 2.988, 'learning_rate': 8e-07, 'epoch': 2.0}                           \n",
+      "{'loss': 2.9295, 'learning_rate': 7e-07, 'epoch': 3.0}                          \n",
+      "{'loss': 2.8963, 'learning_rate': 6e-07, 'epoch': 4.0}                          \n",
+      "{'loss': 2.8795, 'learning_rate': 5e-07, 'epoch': 5.0}                          \n",
+      "{'loss': 2.8679, 'learning_rate': 4e-07, 'epoch': 6.0}                          \n",
+      "{'loss': 2.8546, 'learning_rate': 3e-07, 'epoch': 7.0}                          \n",
+      "{'loss': 2.8506, 'learning_rate': 2e-07, 'epoch': 8.0}                          \n",
+      "{'loss': 2.8513, 'learning_rate': 1e-07, 'epoch': 9.0}                          \n",
+      "{'loss': 2.8474, 'learning_rate': 0.0, 'epoch': 10.0}                           \n",
+      "{'train_runtime': 1654.7627, 'train_samples_per_second': 38.719, 'train_steps_per_second': 0.097, 'train_loss': 2.915772485733032, 'epoch': 10.0}\n",
+      "100%|█████████████████████████████████████████| 160/160 [27:34<00:00, 10.34s/it]\n",
+      "***** train metrics *****\n",
+      "  epoch                    =       10.0\n",
+      "  train_loss               =     2.9158\n",
+      "  train_runtime            = 0:27:34.76\n",
+      "  train_samples_per_second =     38.719\n",
+      "  train_steps_per_second   =      0.097\n",
+      "--\n",
+      "DONE. If it worked, trained data should be in ./workspace/output/laion-finetuned_v5e7_epoch10_fold1_threshold8\n",
+      "\n",
+      "Finetuning laion/CLIP-ViT-B-32-laion2B-s34B-b79K for fold 1, threshold > 0.98, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_fold1_threshold9.\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "9791e95e1cac4b96a2780146c771a15b",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "900df58146984b42b6344ed203b0a7d3",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "997cfc6464ad42f4af344144b05b97b4",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating train split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "f20e12e613b9450b8bb92250ca11c9e2",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating validation split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "First image: ../data/a photo of contemporary bathroom/IMG-C5471425_6.jpg, caption: 'a photo of contemporary bathroom'\n",
+      "04/17/2024 06:53:35 - WARNING - __main__ - Process rank: 0, device: cuda:0, n_gpu: 3distributed training: True, 16-bits training: False\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/datasets/load.py:2072: FutureWarning: 'use_auth_token' was deprecated in favor of 'token' in version 2.14.0 and will be removed in 3.0.0.\n",
+      "You can remove this warning by passing 'token=None' instead.\n",
+      "  warnings.warn(\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/transformers/models/clip/feature_extraction_clip.py:28: FutureWarning: The class CLIPFeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please use CLIPImageProcessor instead.\n",
+      "  warnings.warn(\n",
+      "Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7f334c220dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "04/17/2024 06:53:36 - WARNING - datasets.fingerprint - Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7f334c220dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "Filter: 100%|█████████████████████| 4130/4130 [00:00<00:00, 27951.33 examples/s]\n",
+      "Running tokenizer on train dataset: 100%|█| 4098/4098 [00:00<00:00, 21248.01 exa\n",
+      "  0%|                                                   | 0/100 [00:00<?, ?it/s]/home/haojin/.local/lib/python3.10/site-packages/torch/nn/modules/module.py:1501: UserWarning: operator() profile_node %611 : int = prim::profile_ivalue(%dtype)\n",
+      " does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)\n",
+      "  return forward_call(*args, **kwargs)\n",
+      "/home/haojin/.local/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
+      "  warnings.warn('Was asked to gather along dimension 0, but all '\n",
+      "{'loss': 3.2369, 'learning_rate': 9e-07, 'epoch': 1.0}                          \n",
+      "{'loss': 3.0299, 'learning_rate': 8e-07, 'epoch': 2.0}                          \n",
+      "{'loss': 2.9588, 'learning_rate': 7e-07, 'epoch': 3.0}                          \n",
+      "{'loss': 2.9176, 'learning_rate': 6e-07, 'epoch': 4.0}                          \n",
+      "{'loss': 2.9002, 'learning_rate': 5e-07, 'epoch': 5.0}                          \n",
+      "{'loss': 2.8889, 'learning_rate': 4e-07, 'epoch': 6.0}                          \n",
+      "{'loss': 2.8884, 'learning_rate': 3e-07, 'epoch': 7.0}                          \n",
+      "{'loss': 2.8721, 'learning_rate': 2e-07, 'epoch': 8.0}                          \n",
+      "{'loss': 2.8729, 'learning_rate': 1e-07, 'epoch': 9.0}                          \n",
+      "{'loss': 2.8746, 'learning_rate': 0.0, 'epoch': 10.0}                           \n",
+      "{'train_runtime': 1044.996, 'train_samples_per_second': 39.215, 'train_steps_per_second': 0.096, 'train_loss': 2.9440200233459475, 'epoch': 10.0}\n",
+      "100%|█████████████████████████████████████████| 100/100 [17:24<00:00, 10.45s/it]\n",
+      "***** train metrics *****\n",
+      "  epoch                    =       10.0\n",
+      "  train_loss               =      2.944\n",
+      "  train_runtime            = 0:17:24.99\n",
+      "  train_samples_per_second =     39.215\n",
+      "  train_steps_per_second   =      0.096\n",
+      "--\n",
+      "DONE. If it worked, trained data should be in ./workspace/output/laion-finetuned_v5e7_epoch10_fold1_threshold9\n",
+      "\n",
+      "Finetuning laion/CLIP-ViT-B-32-laion2B-s34B-b79K for fold 1, threshold > 0.99, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_fold1_threshold10.\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "800bcb4797c6476ab149f9a6d7b0ac9a",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "28a184811871465da5427caf4f372785",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "76cb54639edc4517a752aeebe54e2af1",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating train split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "8de36265f6b04626b1ad9b7d37e4f167",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating validation split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "First image: ../data/a photo of contemporary bathroom/IMG-C5471425_6.jpg, caption: 'a photo of contemporary bathroom'\n",
+      "04/17/2024 07:11:07 - WARNING - __main__ - Process rank: 0, device: cuda:0, n_gpu: 3distributed training: True, 16-bits training: False\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/datasets/load.py:2072: FutureWarning: 'use_auth_token' was deprecated in favor of 'token' in version 2.14.0 and will be removed in 3.0.0.\n",
+      "You can remove this warning by passing 'token=None' instead.\n",
+      "  warnings.warn(\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/transformers/models/clip/feature_extraction_clip.py:28: FutureWarning: The class CLIPFeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please use CLIPImageProcessor instead.\n",
+      "  warnings.warn(\n",
+      "Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7fc16ff28dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "04/17/2024 07:11:08 - WARNING - datasets.fingerprint - Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7fc16ff28dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "Filter: 100%|█████████████████████| 1366/1366 [00:00<00:00, 24988.85 examples/s]\n",
+      "Running tokenizer on train dataset: 100%|█| 1357/1357 [00:00<00:00, 20474.15 exa\n",
+      "  0%|                                                    | 0/30 [00:00<?, ?it/s]/home/haojin/.local/lib/python3.10/site-packages/torch/nn/modules/module.py:1501: UserWarning: operator() profile_node %611 : int = prim::profile_ivalue(%dtype)\n",
+      " does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)\n",
+      "  return forward_call(*args, **kwargs)\n",
+      "/home/haojin/.local/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
+      "  warnings.warn('Was asked to gather along dimension 0, but all '\n",
+      "{'loss': 3.2915, 'learning_rate': 9e-07, 'epoch': 1.0}                          \n",
+      "{'loss': 3.1362, 'learning_rate': 8e-07, 'epoch': 2.0}                          \n",
+      "{'loss': 3.0763, 'learning_rate': 7e-07, 'epoch': 3.0}                          \n",
+      "{'loss': 2.9976, 'learning_rate': 6e-07, 'epoch': 4.0}                          \n",
+      "{'loss': 2.973, 'learning_rate': 5e-07, 'epoch': 5.0}                           \n",
+      "{'loss': 2.9411, 'learning_rate': 4e-07, 'epoch': 6.0}                          \n",
+      "{'loss': 2.9475, 'learning_rate': 3e-07, 'epoch': 7.0}                          \n",
+      "{'loss': 2.9303, 'learning_rate': 2e-07, 'epoch': 8.0}                          \n",
+      "{'loss': 2.9012, 'learning_rate': 1e-07, 'epoch': 9.0}                          \n",
+      "{'loss': 2.9236, 'learning_rate': 0.0, 'epoch': 10.0}                           \n",
+      "{'train_runtime': 317.9658, 'train_samples_per_second': 42.678, 'train_steps_per_second': 0.094, 'train_loss': 3.011846987406413, 'epoch': 10.0}\n",
+      "100%|███████████████████████████████████████████| 30/30 [05:17<00:00, 10.60s/it]\n",
+      "***** train metrics *****\n",
+      "  epoch                    =       10.0\n",
+      "  train_loss               =     3.0118\n",
+      "  train_runtime            = 0:05:17.96\n",
+      "  train_samples_per_second =     42.678\n",
+      "  train_steps_per_second   =      0.094\n",
+      "--\n",
+      "DONE. If it worked, trained data should be in ./workspace/output/laion-finetuned_v5e7_epoch10_fold1_threshold10\n",
+      "\n",
+      "Finetuning laion/CLIP-ViT-B-32-laion2B-s34B-b79K for fold 2, threshold > 0.90, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_fold2_threshold1.\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "90681f289d6d4fed89378db98f216ecc",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "09692754086146dcad890b2d5a1c6266",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "ee661ed4ec1d497fb5c455cbc84c2d59",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating train split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "563502f7ae1242dca67b8cb06d819293",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating validation split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "First image: ../data/a photo of standard bathroom/IMG-C5471456_18.jpg, caption: 'a photo of standard bathroom'\n",
+      "04/17/2024 07:16:31 - WARNING - __main__ - Process rank: 0, device: cuda:0, n_gpu: 3distributed training: True, 16-bits training: False\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/datasets/load.py:2072: FutureWarning: 'use_auth_token' was deprecated in favor of 'token' in version 2.14.0 and will be removed in 3.0.0.\n",
+      "You can remove this warning by passing 'token=None' instead.\n",
+      "  warnings.warn(\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/transformers/models/clip/feature_extraction_clip.py:28: FutureWarning: The class CLIPFeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please use CLIPImageProcessor instead.\n",
+      "  warnings.warn(\n",
+      "Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7fdbb9f04dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "04/17/2024 07:16:32 - WARNING - datasets.fingerprint - Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7fdbb9f04dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "Filter: 100%|███████████████████| 18410/18410 [00:00<00:00, 29189.51 examples/s]\n",
+      "Running tokenizer on train dataset: 100%|█| 18200/18200 [00:00<00:00, 21406.86 e\n",
+      "  0%|                                                   | 0/470 [00:00<?, ?it/s]/home/haojin/.local/lib/python3.10/site-packages/torch/nn/modules/module.py:1501: UserWarning: operator() profile_node %611 : int = prim::profile_ivalue(%dtype)\n",
+      " does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)\n",
+      "  return forward_call(*args, **kwargs)\n",
+      "/home/haojin/.local/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
+      "  warnings.warn('Was asked to gather along dimension 0, but all '\n",
+      "{'loss': 3.1179, 'learning_rate': 9e-07, 'epoch': 1.0}                          \n",
+      "{'loss': 2.9326, 'learning_rate': 8e-07, 'epoch': 2.0}                          \n",
+      "{'loss': 2.8848, 'learning_rate': 7e-07, 'epoch': 3.0}                          \n",
+      "{'loss': 2.8607, 'learning_rate': 6e-07, 'epoch': 4.0}                          \n",
+      "{'loss': 2.8437, 'learning_rate': 5e-07, 'epoch': 5.0}                          \n",
+      "{'loss': 2.8292, 'learning_rate': 4e-07, 'epoch': 6.0}                          \n",
+      "{'loss': 2.8239, 'learning_rate': 3e-07, 'epoch': 7.0}                          \n",
+      "{'loss': 2.8205, 'learning_rate': 2e-07, 'epoch': 8.0}                          \n",
+      "{'loss': 2.8148, 'learning_rate': 1e-07, 'epoch': 9.0}                          \n",
+      "{'loss': 2.8138, 'learning_rate': 0.0, 'epoch': 10.0}                           \n",
+      "{'train_runtime': 4829.773, 'train_samples_per_second': 37.683, 'train_steps_per_second': 0.097, 'train_loss': 2.8741772590799535, 'epoch': 10.0}\n",
+      "100%|███████████████████████████████████████| 470/470 [1:20:29<00:00, 10.28s/it]\n",
+      "***** train metrics *****\n",
+      "  epoch                    =       10.0\n",
+      "  train_loss               =     2.8742\n",
+      "  train_runtime            = 1:20:29.77\n",
+      "  train_samples_per_second =     37.683\n",
+      "  train_steps_per_second   =      0.097\n",
+      "--\n",
+      "DONE. If it worked, trained data should be in ./workspace/output/laion-finetuned_v5e7_epoch10_fold2_threshold1\n",
+      "\n",
+      "Finetuning laion/CLIP-ViT-B-32-laion2B-s34B-b79K for fold 2, threshold > 0.91, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_fold2_threshold2.\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "aaf9a0edbf214eeb86c4542ace684120",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "6f6f55c9f78241e9b8a5fa65054b6d6a",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "c9ba55f5e7a24d8683d4d8d91f5f42f6",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating train split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "d9831f9ffc6a45298604e2c24bacc497",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating validation split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "First image: ../data/a photo of standard bathroom/IMG-C5471456_18.jpg, caption: 'a photo of standard bathroom'\n",
+      "04/17/2024 08:37:09 - WARNING - __main__ - Process rank: 0, device: cuda:0, n_gpu: 3distributed training: True, 16-bits training: False\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/datasets/load.py:2072: FutureWarning: 'use_auth_token' was deprecated in favor of 'token' in version 2.14.0 and will be removed in 3.0.0.\n",
+      "You can remove this warning by passing 'token=None' instead.\n",
+      "  warnings.warn(\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/transformers/models/clip/feature_extraction_clip.py:28: FutureWarning: The class CLIPFeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please use CLIPImageProcessor instead.\n",
+      "  warnings.warn(\n",
+      "Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7f6608794dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "04/17/2024 08:37:10 - WARNING - datasets.fingerprint - Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7f6608794dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "Filter: 100%|███████████████████| 16939/16939 [00:00<00:00, 29678.72 examples/s]\n",
+      "Running tokenizer on train dataset: 100%|█| 16749/16749 [00:00<00:00, 21520.55 e\n",
+      "  0%|                                                   | 0/430 [00:00<?, ?it/s]/home/haojin/.local/lib/python3.10/site-packages/torch/nn/modules/module.py:1501: UserWarning: operator() profile_node %611 : int = prim::profile_ivalue(%dtype)\n",
+      " does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)\n",
+      "  return forward_call(*args, **kwargs)\n",
+      "/home/haojin/.local/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
+      "  warnings.warn('Was asked to gather along dimension 0, but all '\n",
+      "{'loss': 3.1216, 'learning_rate': 9e-07, 'epoch': 1.0}                          \n",
+      "{'loss': 2.9348, 'learning_rate': 8e-07, 'epoch': 2.0}                          \n",
+      "{'loss': 2.889, 'learning_rate': 7e-07, 'epoch': 3.0}                           \n",
+      "{'loss': 2.8623, 'learning_rate': 6e-07, 'epoch': 4.0}                          \n",
+      "{'loss': 2.8479, 'learning_rate': 5e-07, 'epoch': 5.0}                          \n",
+      "{'loss': 2.8334, 'learning_rate': 4e-07, 'epoch': 6.0}                          \n",
+      "{'loss': 2.8251, 'learning_rate': 3e-07, 'epoch': 7.0}                          \n",
+      "{'loss': 2.8219, 'learning_rate': 2e-07, 'epoch': 8.0}                          \n",
+      "{'loss': 2.8178, 'learning_rate': 1e-07, 'epoch': 9.0}                          \n",
+      "{'loss': 2.8157, 'learning_rate': 0.0, 'epoch': 10.0}                           \n",
+      "{'train_runtime': 4404.4474, 'train_samples_per_second': 38.027, 'train_steps_per_second': 0.098, 'train_loss': 2.876948902218841, 'epoch': 10.0}\n",
+      "100%|███████████████████████████████████████| 430/430 [1:13:24<00:00, 10.24s/it]\n",
+      "***** train metrics *****\n",
+      "  epoch                    =       10.0\n",
+      "  train_loss               =     2.8769\n",
+      "  train_runtime            = 1:13:24.44\n",
+      "  train_samples_per_second =     38.027\n",
+      "  train_steps_per_second   =      0.098\n",
+      "--\n",
+      "DONE. If it worked, trained data should be in ./workspace/output/laion-finetuned_v5e7_epoch10_fold2_threshold2\n",
+      "\n",
+      "Finetuning laion/CLIP-ViT-B-32-laion2B-s34B-b79K for fold 2, threshold > 0.92, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_fold2_threshold3.\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "b5fb3e24d752404ab6285054e9599642",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "79fad27001a147bd85cf93f8ddecd47d",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "ffb50add10da455d920101ad334afc57",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating train split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "a95477a9d8904088a7170be7cb24083f",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating validation split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "First image: ../data/a photo of standard bathroom/IMG-C5471456_18.jpg, caption: 'a photo of standard bathroom'\n",
+      "04/17/2024 09:50:41 - WARNING - __main__ - Process rank: 0, device: cuda:0, n_gpu: 3distributed training: True, 16-bits training: False\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/datasets/load.py:2072: FutureWarning: 'use_auth_token' was deprecated in favor of 'token' in version 2.14.0 and will be removed in 3.0.0.\n",
+      "You can remove this warning by passing 'token=None' instead.\n",
+      "  warnings.warn(\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/transformers/models/clip/feature_extraction_clip.py:28: FutureWarning: The class CLIPFeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please use CLIPImageProcessor instead.\n",
+      "  warnings.warn(\n",
+      "Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7f5408ec8dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "04/17/2024 09:50:42 - WARNING - datasets.fingerprint - Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7f5408ec8dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "Filter: 100%|███████████████████| 15439/15439 [00:00<00:00, 28950.30 examples/s]\n",
+      "Running tokenizer on train dataset: 100%|█| 15263/15263 [00:00<00:00, 21450.38 e\n",
+      "  0%|                                                   | 0/390 [00:00<?, ?it/s]/home/haojin/.local/lib/python3.10/site-packages/torch/nn/modules/module.py:1501: UserWarning: operator() profile_node %611 : int = prim::profile_ivalue(%dtype)\n",
+      " does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)\n",
+      "  return forward_call(*args, **kwargs)\n",
+      "/home/haojin/.local/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
+      "  warnings.warn('Was asked to gather along dimension 0, but all '\n",
+      "{'loss': 3.1272, 'learning_rate': 9e-07, 'epoch': 1.0}                          \n",
+      "{'loss': 2.936, 'learning_rate': 8e-07, 'epoch': 2.0}                           \n",
+      "{'loss': 2.8881, 'learning_rate': 7e-07, 'epoch': 3.0}                          \n",
+      "{'loss': 2.8618, 'learning_rate': 6e-07, 'epoch': 4.0}                          \n",
+      "{'loss': 2.8478, 'learning_rate': 5e-07, 'epoch': 5.0}                          \n",
+      "{'loss': 2.8358, 'learning_rate': 4e-07, 'epoch': 6.0}                          \n",
+      "{'loss': 2.8293, 'learning_rate': 3e-07, 'epoch': 7.0}                          \n",
+      "{'loss': 2.8238, 'learning_rate': 2e-07, 'epoch': 8.0}                          \n",
+      "{'loss': 2.8191, 'learning_rate': 1e-07, 'epoch': 9.0}                          \n",
+      "{'loss': 2.8195, 'learning_rate': 0.0, 'epoch': 10.0}                           \n",
+      "{'train_runtime': 4036.7591, 'train_samples_per_second': 37.81, 'train_steps_per_second': 0.097, 'train_loss': 2.878826649983724, 'epoch': 10.0}\n",
+      "100%|███████████████████████████████████████| 390/390 [1:07:16<00:00, 10.35s/it]\n",
+      "***** train metrics *****\n",
+      "  epoch                    =       10.0\n",
+      "  train_loss               =     2.8788\n",
+      "  train_runtime            = 1:07:16.75\n",
+      "  train_samples_per_second =      37.81\n",
+      "  train_steps_per_second   =      0.097\n",
+      "--\n",
+      "DONE. If it worked, trained data should be in ./workspace/output/laion-finetuned_v5e7_epoch10_fold2_threshold3\n",
+      "\n",
+      "Finetuning laion/CLIP-ViT-B-32-laion2B-s34B-b79K for fold 2, threshold > 0.93, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_fold2_threshold4.\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "84d9f68808ee4443b5276d9dd47eadd0",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "4255ce9e0a90438082d29fd1af9e4c70",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "6495d81660a8442dac7b75d03f61cd21",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating train split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "0859f5ff8cb34342be55fd64269b101d",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating validation split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "First image: ../data/a photo of standard bathroom/IMG-C5471456_18.jpg, caption: 'a photo of standard bathroom'\n",
+      "04/17/2024 10:58:06 - WARNING - __main__ - Process rank: 0, device: cuda:0, n_gpu: 3distributed training: True, 16-bits training: False\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/datasets/load.py:2072: FutureWarning: 'use_auth_token' was deprecated in favor of 'token' in version 2.14.0 and will be removed in 3.0.0.\n",
+      "You can remove this warning by passing 'token=None' instead.\n",
+      "  warnings.warn(\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/transformers/models/clip/feature_extraction_clip.py:28: FutureWarning: The class CLIPFeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please use CLIPImageProcessor instead.\n",
+      "  warnings.warn(\n",
+      "Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7fb739f48dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "04/17/2024 10:58:07 - WARNING - datasets.fingerprint - Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7fb739f48dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "Filter: 100%|███████████████████| 13827/13827 [00:00<00:00, 28941.77 examples/s]\n",
+      "Running tokenizer on train dataset: 100%|█| 13671/13671 [00:00<00:00, 21331.84 e\n",
+      "  0%|                                                   | 0/350 [00:00<?, ?it/s]/home/haojin/.local/lib/python3.10/site-packages/torch/nn/modules/module.py:1501: UserWarning: operator() profile_node %611 : int = prim::profile_ivalue(%dtype)\n",
+      " does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)\n",
+      "  return forward_call(*args, **kwargs)\n",
+      "/home/haojin/.local/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
+      "  warnings.warn('Was asked to gather along dimension 0, but all '\n",
+      "{'loss': 3.1357, 'learning_rate': 9e-07, 'epoch': 1.0}                          \n",
+      "{'loss': 2.9388, 'learning_rate': 8e-07, 'epoch': 2.0}                          \n",
+      "{'loss': 2.8912, 'learning_rate': 7e-07, 'epoch': 3.0}                          \n",
+      "{'loss': 2.864, 'learning_rate': 6e-07, 'epoch': 4.0}                           \n",
+      "{'loss': 2.8456, 'learning_rate': 5e-07, 'epoch': 5.0}                          \n",
+      "{'loss': 2.8359, 'learning_rate': 4e-07, 'epoch': 6.0}                          \n",
+      "{'loss': 2.8279, 'learning_rate': 3e-07, 'epoch': 7.0}                          \n",
+      "{'loss': 2.8248, 'learning_rate': 2e-07, 'epoch': 8.0}                          \n",
+      "{'loss': 2.8202, 'learning_rate': 1e-07, 'epoch': 9.0}                          \n",
+      "{'loss': 2.8206, 'learning_rate': 0.0, 'epoch': 10.0}                           \n",
+      "{'train_runtime': 3863.2472, 'train_samples_per_second': 35.387, 'train_steps_per_second': 0.091, 'train_loss': 2.8804617745535714, 'epoch': 10.0}\n",
+      "100%|███████████████████████████████████████| 350/350 [1:04:23<00:00, 11.04s/it]\n",
+      "***** train metrics *****\n",
+      "  epoch                    =       10.0\n",
+      "  train_loss               =     2.8805\n",
+      "  train_runtime            = 1:04:23.24\n",
+      "  train_samples_per_second =     35.387\n",
+      "  train_steps_per_second   =      0.091\n",
+      "--\n",
+      "DONE. If it worked, trained data should be in ./workspace/output/laion-finetuned_v5e7_epoch10_fold2_threshold4\n",
+      "\n",
+      "Finetuning laion/CLIP-ViT-B-32-laion2B-s34B-b79K for fold 2, threshold > 0.94, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_fold2_threshold5.\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "84349bd58f7d4d7fa601966cf7d860bb",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "54d9e5127148480199ae1bb46efcc394",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "9bab39de25694b578a0f1ab2a1f470d9",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating train split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "ab99c7f5dd1945738cd32eac65da0dbc",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating validation split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "First image: ../data/a photo of standard bathroom/IMG-C5471456_18.jpg, caption: 'a photo of standard bathroom'\n",
+      "04/17/2024 12:02:37 - WARNING - __main__ - Process rank: 0, device: cuda:0, n_gpu: 3distributed training: True, 16-bits training: False\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/datasets/load.py:2072: FutureWarning: 'use_auth_token' was deprecated in favor of 'token' in version 2.14.0 and will be removed in 3.0.0.\n",
+      "You can remove this warning by passing 'token=None' instead.\n",
+      "  warnings.warn(\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/transformers/models/clip/feature_extraction_clip.py:28: FutureWarning: The class CLIPFeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please use CLIPImageProcessor instead.\n",
+      "  warnings.warn(\n",
+      "Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7f30bc114dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "04/17/2024 12:02:38 - WARNING - datasets.fingerprint - Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7f30bc114dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "Filter: 100%|███████████████████| 12224/12224 [00:00<00:00, 28843.71 examples/s]\n",
+      "Running tokenizer on train dataset: 100%|█| 12087/12087 [00:00<00:00, 20386.48 e\n",
+      "  0%|                                                   | 0/310 [00:00<?, ?it/s]/home/haojin/.local/lib/python3.10/site-packages/torch/nn/modules/module.py:1501: UserWarning: operator() profile_node %611 : int = prim::profile_ivalue(%dtype)\n",
+      " does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)\n",
+      "  return forward_call(*args, **kwargs)\n",
+      "/home/haojin/.local/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
+      "  warnings.warn('Was asked to gather along dimension 0, but all '\n",
+      "{'loss': 3.1495, 'learning_rate': 9e-07, 'epoch': 1.0}                          \n",
+      "{'loss': 2.9512, 'learning_rate': 8e-07, 'epoch': 2.0}                          \n",
+      "{'loss': 2.8971, 'learning_rate': 7e-07, 'epoch': 3.0}                          \n",
+      "{'loss': 2.872, 'learning_rate': 6e-07, 'epoch': 4.0}                           \n",
+      "{'loss': 2.8551, 'learning_rate': 5e-07, 'epoch': 5.0}                          \n",
+      "{'loss': 2.8439, 'learning_rate': 4e-07, 'epoch': 6.0}                          \n",
+      "{'loss': 2.8354, 'learning_rate': 3e-07, 'epoch': 7.0}                          \n",
+      "{'loss': 2.835, 'learning_rate': 2e-07, 'epoch': 8.0}                           \n",
+      "{'loss': 2.8292, 'learning_rate': 1e-07, 'epoch': 9.0}                          \n",
+      "{'loss': 2.8272, 'learning_rate': 0.0, 'epoch': 10.0}                           \n",
+      "{'train_runtime': 3259.5725, 'train_samples_per_second': 37.082, 'train_steps_per_second': 0.095, 'train_loss': 2.8895658185405115, 'epoch': 10.0}\n",
+      "100%|█████████████████████████████████████████| 310/310 [54:19<00:00, 10.51s/it]\n",
+      "***** train metrics *****\n",
+      "  epoch                    =       10.0\n",
+      "  train_loss               =     2.8896\n",
+      "  train_runtime            = 0:54:19.57\n",
+      "  train_samples_per_second =     37.082\n",
+      "  train_steps_per_second   =      0.095\n",
+      "--\n",
+      "DONE. If it worked, trained data should be in ./workspace/output/laion-finetuned_v5e7_epoch10_fold2_threshold5\n",
+      "\n",
+      "Finetuning laion/CLIP-ViT-B-32-laion2B-s34B-b79K for fold 2, threshold > 0.95, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_fold2_threshold6.\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "a16022848e524c01bbd1745df4ea9ffb",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "ed906ad7fa144e35b6bbf6928f82a939",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "8238b39707fe4f94af3ce0c079712296",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating train split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "45382d89a1d949b79edd6b352a0b729b",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating validation split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "First image: ../data/a photo of standard bathroom/IMG-C5471456_18.jpg, caption: 'a photo of standard bathroom'\n",
+      "04/17/2024 12:57:04 - WARNING - __main__ - Process rank: 0, device: cuda:0, n_gpu: 3distributed training: True, 16-bits training: False\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/datasets/load.py:2072: FutureWarning: 'use_auth_token' was deprecated in favor of 'token' in version 2.14.0 and will be removed in 3.0.0.\n",
+      "You can remove this warning by passing 'token=None' instead.\n",
+      "  warnings.warn(\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/transformers/models/clip/feature_extraction_clip.py:28: FutureWarning: The class CLIPFeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please use CLIPImageProcessor instead.\n",
+      "  warnings.warn(\n",
+      "Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7f0a80500dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "04/17/2024 12:57:05 - WARNING - datasets.fingerprint - Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7f0a80500dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "Filter: 100%|███████████████████| 10488/10488 [00:00<00:00, 28834.60 examples/s]\n",
+      "Running tokenizer on train dataset: 100%|█| 10369/10369 [00:00<00:00, 21240.58 e\n",
+      "  0%|                                                   | 0/270 [00:00<?, ?it/s]/home/haojin/.local/lib/python3.10/site-packages/torch/nn/modules/module.py:1501: UserWarning: operator() profile_node %611 : int = prim::profile_ivalue(%dtype)\n",
+      " does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)\n",
+      "  return forward_call(*args, **kwargs)\n",
+      "/home/haojin/.local/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
+      "  warnings.warn('Was asked to gather along dimension 0, but all '\n",
+      "{'loss': 3.155, 'learning_rate': 9e-07, 'epoch': 1.0}                           \n",
+      "{'loss': 2.9565, 'learning_rate': 8e-07, 'epoch': 2.0}                          \n",
+      "{'loss': 2.9022, 'learning_rate': 7e-07, 'epoch': 3.0}                          \n",
+      "{'loss': 2.877, 'learning_rate': 6e-07, 'epoch': 4.0}                           \n",
+      "{'loss': 2.8574, 'learning_rate': 5e-07, 'epoch': 5.0}                          \n",
+      "{'loss': 2.8497, 'learning_rate': 4e-07, 'epoch': 6.0}                          \n",
+      "{'loss': 2.8423, 'learning_rate': 3e-07, 'epoch': 7.0}                          \n",
+      "{'loss': 2.8416, 'learning_rate': 2e-07, 'epoch': 8.0}                          \n",
+      "{'loss': 2.835, 'learning_rate': 1e-07, 'epoch': 9.0}                           \n",
+      "{'loss': 2.8374, 'learning_rate': 0.0, 'epoch': 10.0}                           \n",
+      "{'train_runtime': 2926.6708, 'train_samples_per_second': 35.429, 'train_steps_per_second': 0.092, 'train_loss': 2.89542728000217, 'epoch': 10.0}\n",
+      "100%|█████████████████████████████████████████| 270/270 [48:46<00:00, 10.84s/it]\n",
+      "***** train metrics *****\n",
+      "  epoch                    =       10.0\n",
+      "  train_loss               =     2.8954\n",
+      "  train_runtime            = 0:48:46.67\n",
+      "  train_samples_per_second =     35.429\n",
+      "  train_steps_per_second   =      0.092\n",
+      "--\n",
+      "DONE. If it worked, trained data should be in ./workspace/output/laion-finetuned_v5e7_epoch10_fold2_threshold6\n",
+      "\n",
+      "Finetuning laion/CLIP-ViT-B-32-laion2B-s34B-b79K for fold 2, threshold > 0.96, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_fold2_threshold7.\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "a3ad1ffc64204f17af1b1161696b5f06",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "2898e76269b94c9ba99c6982584192ad",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "cbe3e55e43af4292a1c981a6af301483",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating train split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "55b7244b4e84449d8208dab3519b5e92",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating validation split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "First image: ../data/a photo of standard bathroom/IMG-C5471456_18.jpg, caption: 'a photo of standard bathroom'\n",
+      "04/17/2024 13:45:58 - WARNING - __main__ - Process rank: 0, device: cuda:0, n_gpu: 3distributed training: True, 16-bits training: False\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/datasets/load.py:2072: FutureWarning: 'use_auth_token' was deprecated in favor of 'token' in version 2.14.0 and will be removed in 3.0.0.\n",
+      "You can remove this warning by passing 'token=None' instead.\n",
+      "  warnings.warn(\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/transformers/models/clip/feature_extraction_clip.py:28: FutureWarning: The class CLIPFeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please use CLIPImageProcessor instead.\n",
+      "  warnings.warn(\n",
+      "Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7f0ca4de8dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "04/17/2024 13:45:59 - WARNING - datasets.fingerprint - Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7f0ca4de8dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "Filter: 100%|█████████████████████| 8586/8586 [00:00<00:00, 27862.04 examples/s]\n",
+      "Running tokenizer on train dataset: 100%|█| 8492/8492 [00:00<00:00, 20458.35 exa\n",
+      "  0%|                                                   | 0/220 [00:00<?, ?it/s]/home/haojin/.local/lib/python3.10/site-packages/torch/nn/modules/module.py:1501: UserWarning: operator() profile_node %611 : int = prim::profile_ivalue(%dtype)\n",
+      " does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)\n",
+      "  return forward_call(*args, **kwargs)\n",
+      "/home/haojin/.local/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
+      "  warnings.warn('Was asked to gather along dimension 0, but all '\n",
+      "{'loss': 3.1604, 'learning_rate': 9e-07, 'epoch': 1.0}                          \n",
+      "{'loss': 2.9659, 'learning_rate': 8e-07, 'epoch': 2.0}                          \n",
+      "{'loss': 2.9069, 'learning_rate': 7e-07, 'epoch': 3.0}                          \n",
+      "{'loss': 2.8841, 'learning_rate': 6e-07, 'epoch': 4.0}                          \n",
+      "{'loss': 2.8648, 'learning_rate': 5e-07, 'epoch': 5.0}                          \n",
+      "{'loss': 2.8528, 'learning_rate': 4e-07, 'epoch': 6.0}                          \n",
+      "{'loss': 2.843, 'learning_rate': 3e-07, 'epoch': 7.0}                           \n",
+      "{'loss': 2.8404, 'learning_rate': 2e-07, 'epoch': 8.0}                          \n",
+      "{'loss': 2.8415, 'learning_rate': 1e-07, 'epoch': 9.0}                          \n",
+      "{'loss': 2.8358, 'learning_rate': 0.0, 'epoch': 10.0}                           \n",
+      "{'train_runtime': 2395.7614, 'train_samples_per_second': 35.446, 'train_steps_per_second': 0.092, 'train_loss': 2.899563303860751, 'epoch': 10.0}\n",
+      "100%|█████████████████████████████████████████| 220/220 [39:55<00:00, 10.89s/it]\n",
+      "***** train metrics *****\n",
+      "  epoch                    =       10.0\n",
+      "  train_loss               =     2.8996\n",
+      "  train_runtime            = 0:39:55.76\n",
+      "  train_samples_per_second =     35.446\n",
+      "  train_steps_per_second   =      0.092\n",
+      "--\n",
+      "DONE. If it worked, trained data should be in ./workspace/output/laion-finetuned_v5e7_epoch10_fold2_threshold7\n",
+      "\n",
+      "Finetuning laion/CLIP-ViT-B-32-laion2B-s34B-b79K for fold 2, threshold > 0.97, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_fold2_threshold8.\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "674cde2bd4314d26b4de3866f7a9e791",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "17152a7c2477404cbf078097f8b9d2bf",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "29da6d1f04bc4b5d817eb994c639615f",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating train split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "b9f25cbe390a4aacaecfe2641da4e75f",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating validation split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "First image: ../data/a photo of standard bathroom/IMG-C5471456_18.jpg, caption: 'a photo of standard bathroom'\n",
+      "04/17/2024 14:26:01 - WARNING - __main__ - Process rank: 0, device: cuda:0, n_gpu: 3distributed training: True, 16-bits training: False\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/datasets/load.py:2072: FutureWarning: 'use_auth_token' was deprecated in favor of 'token' in version 2.14.0 and will be removed in 3.0.0.\n",
+      "You can remove this warning by passing 'token=None' instead.\n",
+      "  warnings.warn(\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/transformers/models/clip/feature_extraction_clip.py:28: FutureWarning: The class CLIPFeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please use CLIPImageProcessor instead.\n",
+      "  warnings.warn(\n",
+      "Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7f7763730dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "04/17/2024 14:26:02 - WARNING - datasets.fingerprint - Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7f7763730dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "Filter: 100%|█████████████████████| 6497/6497 [00:00<00:00, 27474.72 examples/s]\n",
+      "Running tokenizer on train dataset: 100%|█| 6425/6425 [00:00<00:00, 19865.38 exa\n",
+      "  0%|                                                   | 0/160 [00:00<?, ?it/s]/home/haojin/.local/lib/python3.10/site-packages/torch/nn/modules/module.py:1501: UserWarning: operator() profile_node %611 : int = prim::profile_ivalue(%dtype)\n",
+      " does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)\n",
+      "  return forward_call(*args, **kwargs)\n",
+      "/home/haojin/.local/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
+      "  warnings.warn('Was asked to gather along dimension 0, but all '\n",
+      "{'loss': 3.186, 'learning_rate': 9e-07, 'epoch': 1.0}                           \n",
+      "{'loss': 2.981, 'learning_rate': 8e-07, 'epoch': 2.0}                           \n",
+      "{'loss': 2.9262, 'learning_rate': 7e-07, 'epoch': 3.0}                          \n",
+      "{'loss': 2.8907, 'learning_rate': 6e-07, 'epoch': 4.0}                          \n",
+      "{'loss': 2.8815, 'learning_rate': 5e-07, 'epoch': 5.0}                          \n",
+      "{'loss': 2.8675, 'learning_rate': 4e-07, 'epoch': 6.0}                          \n",
+      "{'loss': 2.8573, 'learning_rate': 3e-07, 'epoch': 7.0}                          \n",
+      "{'loss': 2.852, 'learning_rate': 2e-07, 'epoch': 8.0}                           \n",
+      "{'loss': 2.8569, 'learning_rate': 1e-07, 'epoch': 9.0}                          \n",
+      "{'loss': 2.8478, 'learning_rate': 0.0, 'epoch': 10.0}                           \n",
+      "{'train_runtime': 1763.6576, 'train_samples_per_second': 36.43, 'train_steps_per_second': 0.091, 'train_loss': 2.9146875619888304, 'epoch': 10.0}\n",
+      "100%|█████████████████████████████████████████| 160/160 [29:23<00:00, 11.02s/it]\n",
+      "***** train metrics *****\n",
+      "  epoch                    =       10.0\n",
+      "  train_loss               =     2.9147\n",
+      "  train_runtime            = 0:29:23.65\n",
+      "  train_samples_per_second =      36.43\n",
+      "  train_steps_per_second   =      0.091\n",
+      "--\n",
+      "DONE. If it worked, trained data should be in ./workspace/output/laion-finetuned_v5e7_epoch10_fold2_threshold8\n",
+      "\n",
+      "Finetuning laion/CLIP-ViT-B-32-laion2B-s34B-b79K for fold 2, threshold > 0.98, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_fold2_threshold9.\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "217c9582f07a430b996e7c682c0c2427",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "0d303a3552674e9594f84e72f94fe901",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "3deabe6ab76d4aacb021b8c7792d1ed0",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating train split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "88f20aec28ac4bca8f3bb7d519f46322",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating validation split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "First image: ../data/a photo of standard bathroom/IMG-C5471456_18.jpg, caption: 'a photo of standard bathroom'\n",
+      "04/17/2024 14:55:33 - WARNING - __main__ - Process rank: 0, device: cuda:0, n_gpu: 3distributed training: True, 16-bits training: False\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/datasets/load.py:2072: FutureWarning: 'use_auth_token' was deprecated in favor of 'token' in version 2.14.0 and will be removed in 3.0.0.\n",
+      "You can remove this warning by passing 'token=None' instead.\n",
+      "  warnings.warn(\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/transformers/models/clip/feature_extraction_clip.py:28: FutureWarning: The class CLIPFeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please use CLIPImageProcessor instead.\n",
+      "  warnings.warn(\n",
+      "Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7f8c8f214dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "04/17/2024 14:55:34 - WARNING - datasets.fingerprint - Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7f8c8f214dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "Filter: 100%|█████████████████████| 4151/4151 [00:00<00:00, 25735.84 examples/s]\n",
+      "Running tokenizer on train dataset: 100%|█| 4118/4118 [00:00<00:00, 18134.09 exa\n",
+      "  0%|                                                   | 0/100 [00:00<?, ?it/s]/home/haojin/.local/lib/python3.10/site-packages/torch/nn/modules/module.py:1501: UserWarning: operator() profile_node %611 : int = prim::profile_ivalue(%dtype)\n",
+      " does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)\n",
+      "  return forward_call(*args, **kwargs)\n",
+      "/home/haojin/.local/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
+      "  warnings.warn('Was asked to gather along dimension 0, but all '\n",
+      "{'loss': 3.2318, 'learning_rate': 9e-07, 'epoch': 1.0}                          \n",
+      "{'loss': 3.0349, 'learning_rate': 8e-07, 'epoch': 2.0}                          \n",
+      "{'loss': 2.9728, 'learning_rate': 7e-07, 'epoch': 3.0}                          \n",
+      "{'loss': 2.9344, 'learning_rate': 6e-07, 'epoch': 4.0}                          \n",
+      "{'loss': 2.9138, 'learning_rate': 5e-07, 'epoch': 5.0}                          \n",
+      "{'loss': 2.8937, 'learning_rate': 4e-07, 'epoch': 6.0}                          \n",
+      "{'loss': 2.8847, 'learning_rate': 3e-07, 'epoch': 7.0}                          \n",
+      "{'loss': 2.8779, 'learning_rate': 2e-07, 'epoch': 8.0}                          \n",
+      "{'loss': 2.8777, 'learning_rate': 1e-07, 'epoch': 9.0}                          \n",
+      "{'loss': 2.8768, 'learning_rate': 0.0, 'epoch': 10.0}                           \n",
+      "{'train_runtime': 1098.5101, 'train_samples_per_second': 37.487, 'train_steps_per_second': 0.091, 'train_loss': 2.9498478889465334, 'epoch': 10.0}\n",
+      "100%|█████████████████████████████████████████| 100/100 [18:18<00:00, 10.99s/it]\n",
+      "***** train metrics *****\n",
+      "  epoch                    =       10.0\n",
+      "  train_loss               =     2.9498\n",
+      "  train_runtime            = 0:18:18.51\n",
+      "  train_samples_per_second =     37.487\n",
+      "  train_steps_per_second   =      0.091\n",
+      "--\n",
+      "DONE. If it worked, trained data should be in ./workspace/output/laion-finetuned_v5e7_epoch10_fold2_threshold9\n",
+      "\n",
+      "Finetuning laion/CLIP-ViT-B-32-laion2B-s34B-b79K for fold 2, threshold > 0.99, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_fold2_threshold10.\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "df243dbdb303470cbbf92dc21b89a011",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "6d96384cd9494439bbbea9f656549b89",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "a6a5919715d048ee93eabb6880c19846",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating train split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "4608d66bcdaa4d4c9112c54f872733d4",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating validation split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "First image: ../data/a photo of standard bathroom/IMG-C5471456_18.jpg, caption: 'a photo of standard bathroom'\n",
+      "04/17/2024 15:13:59 - WARNING - __main__ - Process rank: 0, device: cuda:0, n_gpu: 3distributed training: True, 16-bits training: False\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/datasets/load.py:2072: FutureWarning: 'use_auth_token' was deprecated in favor of 'token' in version 2.14.0 and will be removed in 3.0.0.\n",
+      "You can remove this warning by passing 'token=None' instead.\n",
+      "  warnings.warn(\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/transformers/models/clip/feature_extraction_clip.py:28: FutureWarning: The class CLIPFeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please use CLIPImageProcessor instead.\n",
+      "  warnings.warn(\n",
+      "Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7fc02cd34dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "04/17/2024 15:14:00 - WARNING - datasets.fingerprint - Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7fc02cd34dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "Filter: 100%|█████████████████████| 1357/1357 [00:00<00:00, 25011.41 examples/s]\n",
+      "Running tokenizer on train dataset: 100%|█| 1346/1346 [00:00<00:00, 19520.80 exa\n",
+      "  0%|                                                    | 0/30 [00:00<?, ?it/s]/home/haojin/.local/lib/python3.10/site-packages/torch/nn/modules/module.py:1501: UserWarning: operator() profile_node %611 : int = prim::profile_ivalue(%dtype)\n",
+      " does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)\n",
+      "  return forward_call(*args, **kwargs)\n",
+      "/home/haojin/.local/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
+      "  warnings.warn('Was asked to gather along dimension 0, but all '\n",
+      "{'loss': 3.3296, 'learning_rate': 9e-07, 'epoch': 1.0}                          \n",
+      "{'loss': 3.1602, 'learning_rate': 8e-07, 'epoch': 2.0}                          \n",
+      "{'loss': 3.0947, 'learning_rate': 7e-07, 'epoch': 3.0}                          \n",
+      "{'loss': 3.0176, 'learning_rate': 6e-07, 'epoch': 4.0}                          \n",
+      "{'loss': 2.9972, 'learning_rate': 5e-07, 'epoch': 5.0}                          \n",
+      "{'loss': 2.9861, 'learning_rate': 4e-07, 'epoch': 6.0}                          \n",
+      "{'loss': 2.9694, 'learning_rate': 3e-07, 'epoch': 7.0}                          \n",
+      "{'loss': 2.9601, 'learning_rate': 2e-07, 'epoch': 8.0}                          \n",
+      "{'loss': 2.9434, 'learning_rate': 1e-07, 'epoch': 9.0}                          \n",
+      "{'loss': 2.9609, 'learning_rate': 0.0, 'epoch': 10.0}                           \n",
+      "{'train_runtime': 324.6853, 'train_samples_per_second': 41.456, 'train_steps_per_second': 0.092, 'train_loss': 3.0419299443562826, 'epoch': 10.0}\n",
+      "100%|███████████████████████████████████████████| 30/30 [05:24<00:00, 10.82s/it]\n",
+      "***** train metrics *****\n",
+      "  epoch                    =       10.0\n",
+      "  train_loss               =     3.0419\n",
+      "  train_runtime            = 0:05:24.68\n",
+      "  train_samples_per_second =     41.456\n",
+      "  train_steps_per_second   =      0.092\n",
+      "--\n",
+      "DONE. If it worked, trained data should be in ./workspace/output/laion-finetuned_v5e7_epoch10_fold2_threshold10\n",
+      "\n",
+      "Finetuning laion/CLIP-ViT-B-32-laion2B-s34B-b79K for fold 3, threshold > 0.90, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_fold3_threshold1.\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "a92feebeb8984168a4d7bfd1308c7a1c",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "82115be16c2c4ad0a04af562b49ebfd7",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "0a9df1db653d4dcbb7e4a298f5f6d275",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating train split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "5241cba78f074a9bab1691978e9b346a",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating validation split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "First image: ../data/a photo of contemporary foyer/IMG-C5471443_13.jpg, caption: 'a photo of contemporary foyer'\n",
+      "04/17/2024 15:19:30 - WARNING - __main__ - Process rank: 0, device: cuda:0, n_gpu: 3distributed training: True, 16-bits training: False\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/datasets/load.py:2072: FutureWarning: 'use_auth_token' was deprecated in favor of 'token' in version 2.14.0 and will be removed in 3.0.0.\n",
+      "You can remove this warning by passing 'token=None' instead.\n",
+      "  warnings.warn(\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/transformers/models/clip/feature_extraction_clip.py:28: FutureWarning: The class CLIPFeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please use CLIPImageProcessor instead.\n",
+      "  warnings.warn(\n",
+      "Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7f2db0ff8dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "04/17/2024 15:19:31 - WARNING - datasets.fingerprint - Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7f2db0ff8dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "Filter: 100%|███████████████████| 18411/18411 [00:00<00:00, 27826.86 examples/s]\n",
+      "Running tokenizer on train dataset: 100%|█| 18189/18189 [00:00<00:00, 20605.62 e\n",
+      "  0%|                                                   | 0/470 [00:00<?, ?it/s]/home/haojin/.local/lib/python3.10/site-packages/torch/nn/modules/module.py:1501: UserWarning: operator() profile_node %611 : int = prim::profile_ivalue(%dtype)\n",
+      " does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)\n",
+      "  return forward_call(*args, **kwargs)\n",
+      "/home/haojin/.local/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
+      "  warnings.warn('Was asked to gather along dimension 0, but all '\n",
+      "{'loss': 3.1125, 'learning_rate': 9e-07, 'epoch': 1.0}                          \n",
+      "{'loss': 2.9313, 'learning_rate': 8e-07, 'epoch': 2.0}                          \n",
+      "{'loss': 2.8839, 'learning_rate': 7e-07, 'epoch': 3.0}                          \n",
+      "{'loss': 2.8626, 'learning_rate': 6e-07, 'epoch': 4.0}                          \n",
+      "{'loss': 2.8437, 'learning_rate': 5e-07, 'epoch': 5.0}                          \n",
+      "{'loss': 2.8323, 'learning_rate': 4e-07, 'epoch': 6.0}                          \n",
+      "{'loss': 2.8251, 'learning_rate': 3e-07, 'epoch': 7.0}                          \n",
+      "{'loss': 2.8194, 'learning_rate': 2e-07, 'epoch': 8.0}                          \n",
+      "{'loss': 2.8166, 'learning_rate': 1e-07, 'epoch': 9.0}                          \n",
+      "{'loss': 2.8135, 'learning_rate': 0.0, 'epoch': 10.0}                           \n",
+      "{'train_runtime': 5069.8297, 'train_samples_per_second': 35.877, 'train_steps_per_second': 0.093, 'train_loss': 2.874101841703374, 'epoch': 10.0}\n",
+      "100%|███████████████████████████████████████| 470/470 [1:24:29<00:00, 10.79s/it]\n",
+      "***** train metrics *****\n",
+      "  epoch                    =       10.0\n",
+      "  train_loss               =     2.8741\n",
+      "  train_runtime            = 1:24:29.82\n",
+      "  train_samples_per_second =     35.877\n",
+      "  train_steps_per_second   =      0.093\n",
+      "--\n",
+      "DONE. If it worked, trained data should be in ./workspace/output/laion-finetuned_v5e7_epoch10_fold3_threshold1\n",
+      "\n",
+      "Finetuning laion/CLIP-ViT-B-32-laion2B-s34B-b79K for fold 3, threshold > 0.91, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_fold3_threshold2.\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "54089abc4acd42159cc4fd0ff6a2e7da",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "f7b89c597ba549bbaa503a89ad74cc45",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "5b956b9a6c144d4daaf6a35f49ca95e9",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating train split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "e55327a4514a4cf58912d011aab19f2d",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating validation split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "First image: ../data/a photo of contemporary foyer/IMG-C5471443_13.jpg, caption: 'a photo of contemporary foyer'\n",
+      "04/17/2024 16:44:08 - WARNING - __main__ - Process rank: 0, device: cuda:0, n_gpu: 3distributed training: True, 16-bits training: False\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/datasets/load.py:2072: FutureWarning: 'use_auth_token' was deprecated in favor of 'token' in version 2.14.0 and will be removed in 3.0.0.\n",
+      "You can remove this warning by passing 'token=None' instead.\n",
+      "  warnings.warn(\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/transformers/models/clip/feature_extraction_clip.py:28: FutureWarning: The class CLIPFeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please use CLIPImageProcessor instead.\n",
+      "  warnings.warn(\n",
+      "Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7fe3000d4dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "04/17/2024 16:44:10 - WARNING - datasets.fingerprint - Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7fe3000d4dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "Filter: 100%|███████████████████| 16936/16936 [00:00<00:00, 28639.94 examples/s]\n",
+      "Running tokenizer on train dataset: 100%|█| 16736/16736 [00:00<00:00, 20420.36 e\n",
+      "  0%|                                                   | 0/430 [00:00<?, ?it/s]/home/haojin/.local/lib/python3.10/site-packages/torch/nn/modules/module.py:1501: UserWarning: operator() profile_node %611 : int = prim::profile_ivalue(%dtype)\n",
+      " does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)\n",
+      "  return forward_call(*args, **kwargs)\n",
+      "/home/haojin/.local/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
+      "  warnings.warn('Was asked to gather along dimension 0, but all '\n",
+      "{'loss': 3.1217, 'learning_rate': 9e-07, 'epoch': 1.0}                          \n",
+      "{'loss': 2.935, 'learning_rate': 8e-07, 'epoch': 2.0}                           \n",
+      "{'loss': 2.8864, 'learning_rate': 7e-07, 'epoch': 3.0}                          \n",
+      "{'loss': 2.8654, 'learning_rate': 6e-07, 'epoch': 4.0}                          \n",
+      "{'loss': 2.8527, 'learning_rate': 5e-07, 'epoch': 5.0}                          \n",
+      "{'loss': 2.8378, 'learning_rate': 4e-07, 'epoch': 6.0}                          \n",
+      "{'loss': 2.8302, 'learning_rate': 3e-07, 'epoch': 7.0}                          \n",
+      "{'loss': 2.8266, 'learning_rate': 2e-07, 'epoch': 8.0}                          \n",
+      "{'loss': 2.8206, 'learning_rate': 1e-07, 'epoch': 9.0}                          \n",
+      "{'loss': 2.8192, 'learning_rate': 0.0, 'epoch': 10.0}                           \n",
+      "{'train_runtime': 4637.4844, 'train_samples_per_second': 36.089, 'train_steps_per_second': 0.093, 'train_loss': 2.8795754366142807, 'epoch': 10.0}\n",
+      "100%|███████████████████████████████████████| 430/430 [1:17:17<00:00, 10.78s/it]\n",
+      "***** train metrics *****\n",
+      "  epoch                    =       10.0\n",
+      "  train_loss               =     2.8796\n",
+      "  train_runtime            = 1:17:17.48\n",
+      "  train_samples_per_second =     36.089\n",
+      "  train_steps_per_second   =      0.093\n",
+      "--\n",
+      "DONE. If it worked, trained data should be in ./workspace/output/laion-finetuned_v5e7_epoch10_fold3_threshold2\n",
+      "\n",
+      "Finetuning laion/CLIP-ViT-B-32-laion2B-s34B-b79K for fold 3, threshold > 0.92, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_fold3_threshold3.\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "a0b05d1fb6a54fd3ae19087a28b4570c",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "8f30b3cf35dc469b99a15203a0bed396",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "b5a939df367d405aacf5ce540a833595",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating train split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "742c637a0c644c7ea44a48aa09b96c78",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating validation split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "First image: ../data/a photo of contemporary foyer/IMG-C5471443_13.jpg, caption: 'a photo of contemporary foyer'\n",
+      "04/17/2024 18:01:34 - WARNING - __main__ - Process rank: 0, device: cuda:0, n_gpu: 3distributed training: True, 16-bits training: False\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/datasets/load.py:2072: FutureWarning: 'use_auth_token' was deprecated in favor of 'token' in version 2.14.0 and will be removed in 3.0.0.\n",
+      "You can remove this warning by passing 'token=None' instead.\n",
+      "  warnings.warn(\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/transformers/models/clip/feature_extraction_clip.py:28: FutureWarning: The class CLIPFeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please use CLIPImageProcessor instead.\n",
+      "  warnings.warn(\n",
+      "Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7fc20a414dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "04/17/2024 18:01:35 - WARNING - datasets.fingerprint - Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7fc20a414dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "Filter: 100%|███████████████████| 15422/15422 [00:00<00:00, 29274.55 examples/s]\n",
+      "Running tokenizer on train dataset: 100%|█| 15240/15240 [00:00<00:00, 20987.70 e\n",
+      "  0%|                                                   | 0/390 [00:00<?, ?it/s]/home/haojin/.local/lib/python3.10/site-packages/torch/nn/modules/module.py:1501: UserWarning: operator() profile_node %611 : int = prim::profile_ivalue(%dtype)\n",
+      " does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)\n",
+      "  return forward_call(*args, **kwargs)\n",
+      "/home/haojin/.local/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
+      "  warnings.warn('Was asked to gather along dimension 0, but all '\n",
+      "{'loss': 3.1276, 'learning_rate': 9e-07, 'epoch': 1.0}                          \n",
+      "{'loss': 2.9378, 'learning_rate': 8e-07, 'epoch': 2.0}                          \n",
+      "{'loss': 2.8892, 'learning_rate': 7e-07, 'epoch': 3.0}                          \n",
+      "{'loss': 2.8646, 'learning_rate': 6e-07, 'epoch': 4.0}                          \n",
+      "{'loss': 2.8497, 'learning_rate': 5e-07, 'epoch': 5.0}                          \n",
+      "{'loss': 2.8379, 'learning_rate': 4e-07, 'epoch': 6.0}                          \n",
+      "{'loss': 2.8334, 'learning_rate': 3e-07, 'epoch': 7.0}                          \n",
+      "{'loss': 2.8293, 'learning_rate': 2e-07, 'epoch': 8.0}                          \n",
+      "{'loss': 2.8213, 'learning_rate': 1e-07, 'epoch': 9.0}                          \n",
+      "{'loss': 2.8202, 'learning_rate': 0.0, 'epoch': 10.0}                           \n",
+      "{'train_runtime': 4071.4868, 'train_samples_per_second': 37.431, 'train_steps_per_second': 0.096, 'train_loss': 2.8811082301995694, 'epoch': 10.0}\n",
+      "100%|███████████████████████████████████████| 390/390 [1:07:51<00:00, 10.44s/it]\n",
+      "***** train metrics *****\n",
+      "  epoch                    =       10.0\n",
+      "  train_loss               =     2.8811\n",
+      "  train_runtime            = 1:07:51.48\n",
+      "  train_samples_per_second =     37.431\n",
+      "  train_steps_per_second   =      0.096\n",
+      "--\n",
+      "DONE. If it worked, trained data should be in ./workspace/output/laion-finetuned_v5e7_epoch10_fold3_threshold3\n",
+      "\n",
+      "Finetuning laion/CLIP-ViT-B-32-laion2B-s34B-b79K for fold 3, threshold > 0.93, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_fold3_threshold4.\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "96b0744bcdb94f0c95b2dd38720b8a2e",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "8ba1ae491fb94f41ba6ebd38237edf51",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "2bee0d3d0781480db4083bb3a8f353ba",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating train split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "e93216cbce3d46eab002c59b4d9cd59d",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating validation split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "First image: ../data/a photo of contemporary foyer/IMG-C5471443_13.jpg, caption: 'a photo of contemporary foyer'\n",
+      "04/17/2024 19:09:34 - WARNING - __main__ - Process rank: 0, device: cuda:0, n_gpu: 3distributed training: True, 16-bits training: False\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/datasets/load.py:2072: FutureWarning: 'use_auth_token' was deprecated in favor of 'token' in version 2.14.0 and will be removed in 3.0.0.\n",
+      "You can remove this warning by passing 'token=None' instead.\n",
+      "  warnings.warn(\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/transformers/models/clip/feature_extraction_clip.py:28: FutureWarning: The class CLIPFeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please use CLIPImageProcessor instead.\n",
+      "  warnings.warn(\n",
+      "Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7f04b3e38dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "04/17/2024 19:09:35 - WARNING - datasets.fingerprint - Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7f04b3e38dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "Filter: 100%|███████████████████| 13849/13849 [00:00<00:00, 28925.33 examples/s]\n",
+      "Running tokenizer on train dataset: 100%|█| 13688/13688 [00:00<00:00, 21104.87 e\n",
+      "  0%|                                                   | 0/350 [00:00<?, ?it/s]/home/haojin/.local/lib/python3.10/site-packages/torch/nn/modules/module.py:1501: UserWarning: operator() profile_node %611 : int = prim::profile_ivalue(%dtype)\n",
+      " does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)\n",
+      "  return forward_call(*args, **kwargs)\n",
+      "/home/haojin/.local/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
+      "  warnings.warn('Was asked to gather along dimension 0, but all '\n",
+      "{'loss': 3.1344, 'learning_rate': 9e-07, 'epoch': 1.0}                          \n",
+      "{'loss': 2.9381, 'learning_rate': 8e-07, 'epoch': 2.0}                          \n",
+      "{'loss': 2.8884, 'learning_rate': 7e-07, 'epoch': 3.0}                          \n",
+      "{'loss': 2.8674, 'learning_rate': 6e-07, 'epoch': 4.0}                          \n",
+      "{'loss': 2.8484, 'learning_rate': 5e-07, 'epoch': 5.0}                          \n",
+      "{'loss': 2.8362, 'learning_rate': 4e-07, 'epoch': 6.0}                          \n",
+      "{'loss': 2.8305, 'learning_rate': 3e-07, 'epoch': 7.0}                          \n",
+      "{'loss': 2.8262, 'learning_rate': 2e-07, 'epoch': 8.0}                          \n",
+      "{'loss': 2.8212, 'learning_rate': 1e-07, 'epoch': 9.0}                          \n",
+      "{'loss': 2.819, 'learning_rate': 0.0, 'epoch': 10.0}                            \n",
+      "{'train_runtime': 3651.4322, 'train_samples_per_second': 37.487, 'train_steps_per_second': 0.096, 'train_loss': 2.8809766714913505, 'epoch': 10.0}\n",
+      "100%|███████████████████████████████████████| 350/350 [1:00:51<00:00, 10.43s/it]\n",
+      "***** train metrics *****\n",
+      "  epoch                    =       10.0\n",
+      "  train_loss               =      2.881\n",
+      "  train_runtime            = 1:00:51.43\n",
+      "  train_samples_per_second =     37.487\n",
+      "  train_steps_per_second   =      0.096\n",
+      "--\n",
+      "DONE. If it worked, trained data should be in ./workspace/output/laion-finetuned_v5e7_epoch10_fold3_threshold4\n",
+      "\n",
+      "Finetuning laion/CLIP-ViT-B-32-laion2B-s34B-b79K for fold 3, threshold > 0.94, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_fold3_threshold5.\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "b8edfee587dc4346a6172beb21b42507",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "8a306c5afb7b473ab8f0cfa4729a6f2a",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "2a6c8c4aa91043a884d97eecb47421e3",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating train split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "730c4a58f93d43fc9aabe58236b47c2f",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating validation split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "First image: ../data/a photo of contemporary foyer/IMG-C5471443_13.jpg, caption: 'a photo of contemporary foyer'\n",
+      "04/17/2024 20:10:33 - WARNING - __main__ - Process rank: 0, device: cuda:0, n_gpu: 3distributed training: True, 16-bits training: False\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/datasets/load.py:2072: FutureWarning: 'use_auth_token' was deprecated in favor of 'token' in version 2.14.0 and will be removed in 3.0.0.\n",
+      "You can remove this warning by passing 'token=None' instead.\n",
+      "  warnings.warn(\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/transformers/models/clip/feature_extraction_clip.py:28: FutureWarning: The class CLIPFeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please use CLIPImageProcessor instead.\n",
+      "  warnings.warn(\n",
+      "Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7f48437e0dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "04/17/2024 20:10:34 - WARNING - datasets.fingerprint - Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7f48437e0dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "Filter: 100%|███████████████████| 12276/12276 [00:00<00:00, 29087.75 examples/s]\n",
+      "Running tokenizer on train dataset: 100%|█| 12136/12136 [00:00<00:00, 21124.55 e\n",
+      "  0%|                                                   | 0/310 [00:00<?, ?it/s]/home/haojin/.local/lib/python3.10/site-packages/torch/nn/modules/module.py:1501: UserWarning: operator() profile_node %611 : int = prim::profile_ivalue(%dtype)\n",
+      " does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)\n",
+      "  return forward_call(*args, **kwargs)\n",
+      "/home/haojin/.local/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
+      "  warnings.warn('Was asked to gather along dimension 0, but all '\n",
+      "{'loss': 3.1427, 'learning_rate': 9e-07, 'epoch': 1.0}                          \n",
+      "{'loss': 2.954, 'learning_rate': 8e-07, 'epoch': 2.0}                           \n",
+      "{'loss': 2.906, 'learning_rate': 7e-07, 'epoch': 3.0}                           \n",
+      "{'loss': 2.8737, 'learning_rate': 6e-07, 'epoch': 4.0}                          \n",
+      "{'loss': 2.8611, 'learning_rate': 5e-07, 'epoch': 5.0}                          \n",
+      "{'loss': 2.8473, 'learning_rate': 4e-07, 'epoch': 6.0}                          \n",
+      "{'loss': 2.845, 'learning_rate': 3e-07, 'epoch': 7.0}                           \n",
+      "{'loss': 2.8374, 'learning_rate': 2e-07, 'epoch': 8.0}                          \n",
+      "{'loss': 2.8345, 'learning_rate': 1e-07, 'epoch': 9.0}                          \n",
+      "{'loss': 2.8338, 'learning_rate': 0.0, 'epoch': 10.0}                           \n",
+      "{'train_runtime': 3246.1202, 'train_samples_per_second': 37.386, 'train_steps_per_second': 0.095, 'train_loss': 2.8935652948194934, 'epoch': 10.0}\n",
+      "100%|█████████████████████████████████████████| 310/310 [54:06<00:00, 10.47s/it]\n",
+      "***** train metrics *****\n",
+      "  epoch                    =       10.0\n",
+      "  train_loss               =     2.8936\n",
+      "  train_runtime            = 0:54:06.12\n",
+      "  train_samples_per_second =     37.386\n",
+      "  train_steps_per_second   =      0.095\n",
+      "--\n",
+      "DONE. If it worked, trained data should be in ./workspace/output/laion-finetuned_v5e7_epoch10_fold3_threshold5\n",
+      "\n",
+      "Finetuning laion/CLIP-ViT-B-32-laion2B-s34B-b79K for fold 3, threshold > 0.95, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_fold3_threshold6.\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "b554835e6cf04eb9b0ba600032a97241",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "95ed3d1e636f4acd820a4cc5ce43a0fe",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "5d65cf9afe9a482b90de4d69699adeb0",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating train split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "b682e7840b2a49adb978884fe7548f62",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating validation split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "First image: ../data/a photo of contemporary foyer/IMG-C5471443_13.jpg, caption: 'a photo of contemporary foyer'\n",
+      "04/17/2024 21:04:47 - WARNING - __main__ - Process rank: 0, device: cuda:0, n_gpu: 3distributed training: True, 16-bits training: False\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/datasets/load.py:2072: FutureWarning: 'use_auth_token' was deprecated in favor of 'token' in version 2.14.0 and will be removed in 3.0.0.\n",
+      "You can remove this warning by passing 'token=None' instead.\n",
+      "  warnings.warn(\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/transformers/models/clip/feature_extraction_clip.py:28: FutureWarning: The class CLIPFeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please use CLIPImageProcessor instead.\n",
+      "  warnings.warn(\n",
+      "Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7fe8ad57cdc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "04/17/2024 21:04:48 - WARNING - datasets.fingerprint - Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7fe8ad57cdc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "Filter: 100%|█████████████��█████| 10542/10542 [00:00<00:00, 28562.65 examples/s]\n",
+      "Running tokenizer on train dataset: 100%|█| 10420/10420 [00:00<00:00, 21241.45 e\n",
+      "  0%|                                                   | 0/270 [00:00<?, ?it/s]/home/haojin/.local/lib/python3.10/site-packages/torch/nn/modules/module.py:1501: UserWarning: operator() profile_node %611 : int = prim::profile_ivalue(%dtype)\n",
+      " does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)\n",
+      "  return forward_call(*args, **kwargs)\n",
+      "/home/haojin/.local/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
+      "  warnings.warn('Was asked to gather along dimension 0, but all '\n",
+      "{'loss': 3.1515, 'learning_rate': 9e-07, 'epoch': 1.0}                          \n",
+      "{'loss': 2.958, 'learning_rate': 8e-07, 'epoch': 2.0}                           \n",
+      "{'loss': 2.9053, 'learning_rate': 7e-07, 'epoch': 3.0}                          \n",
+      "{'loss': 2.8771, 'learning_rate': 6e-07, 'epoch': 4.0}                          \n",
+      "{'loss': 2.8623, 'learning_rate': 5e-07, 'epoch': 5.0}                          \n",
+      "{'loss': 2.8556, 'learning_rate': 4e-07, 'epoch': 6.0}                          \n",
+      "{'loss': 2.8463, 'learning_rate': 3e-07, 'epoch': 7.0}                          \n",
+      "{'loss': 2.8387, 'learning_rate': 2e-07, 'epoch': 8.0}                          \n",
+      "{'loss': 2.8377, 'learning_rate': 1e-07, 'epoch': 9.0}                          \n",
+      "{'loss': 2.8351, 'learning_rate': 0.0, 'epoch': 10.0}                           \n",
+      "{'train_runtime': 2813.799, 'train_samples_per_second': 37.032, 'train_steps_per_second': 0.096, 'train_loss': 2.8967636673538775, 'epoch': 10.0}\n",
+      "100%|█████████████████████████████████████████| 270/270 [46:53<00:00, 10.42s/it]\n",
+      "***** train metrics *****\n",
+      "  epoch                    =       10.0\n",
+      "  train_loss               =     2.8968\n",
+      "  train_runtime            = 0:46:53.79\n",
+      "  train_samples_per_second =     37.032\n",
+      "  train_steps_per_second   =      0.096\n",
+      "--\n",
+      "DONE. If it worked, trained data should be in ./workspace/output/laion-finetuned_v5e7_epoch10_fold3_threshold6\n",
+      "\n",
+      "Finetuning laion/CLIP-ViT-B-32-laion2B-s34B-b79K for fold 3, threshold > 0.96, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_fold3_threshold7.\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "0d313401328441408ea1138c9c20a9fe",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "eda808aae6a5456f947aa154166a55ef",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "b385161d69a346308b8ca32c7d24c19e",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating train split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "2873716f8d2843f49de41f7fdf3924ef",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating validation split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "First image: ../data/a photo of contemporary foyer/IMG-C5471443_13.jpg, caption: 'a photo of contemporary foyer'\n",
+      "04/17/2024 21:51:48 - WARNING - __main__ - Process rank: 0, device: cuda:0, n_gpu: 3distributed training: True, 16-bits training: False\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/datasets/load.py:2072: FutureWarning: 'use_auth_token' was deprecated in favor of 'token' in version 2.14.0 and will be removed in 3.0.0.\n",
+      "You can remove this warning by passing 'token=None' instead.\n",
+      "  warnings.warn(\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/transformers/models/clip/feature_extraction_clip.py:28: FutureWarning: The class CLIPFeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please use CLIPImageProcessor instead.\n",
+      "  warnings.warn(\n",
+      "Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7f4e1b01cdc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "04/17/2024 21:51:49 - WARNING - datasets.fingerprint - Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7f4e1b01cdc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "Filter: 100%|█████████████████████| 8644/8644 [00:00<00:00, 28711.30 examples/s]\n",
+      "Running tokenizer on train dataset: 100%|█| 8546/8546 [00:00<00:00, 21136.89 exa\n",
+      "  0%|                                                   | 0/220 [00:00<?, ?it/s]/home/haojin/.local/lib/python3.10/site-packages/torch/nn/modules/module.py:1501: UserWarning: operator() profile_node %611 : int = prim::profile_ivalue(%dtype)\n",
+      " does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)\n",
+      "  return forward_call(*args, **kwargs)\n",
+      "/home/haojin/.local/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
+      "  warnings.warn('Was asked to gather along dimension 0, but all '\n",
+      "{'loss': 3.1591, 'learning_rate': 9e-07, 'epoch': 1.0}                          \n",
+      "{'loss': 2.9654, 'learning_rate': 8e-07, 'epoch': 2.0}                          \n",
+      "{'loss': 2.9102, 'learning_rate': 7e-07, 'epoch': 3.0}                          \n",
+      "{'loss': 2.8858, 'learning_rate': 6e-07, 'epoch': 4.0}                          \n",
+      "{'loss': 2.8679, 'learning_rate': 5e-07, 'epoch': 5.0}                          \n",
+      "{'loss': 2.8606, 'learning_rate': 4e-07, 'epoch': 6.0}                          \n",
+      "{'loss': 2.8502, 'learning_rate': 3e-07, 'epoch': 7.0}                          \n",
+      "{'loss': 2.8458, 'learning_rate': 2e-07, 'epoch': 8.0}                          \n",
+      "{'loss': 2.8444, 'learning_rate': 1e-07, 'epoch': 9.0}                          \n",
+      "{'loss': 2.8426, 'learning_rate': 0.0, 'epoch': 10.0}                           \n",
+      "{'train_runtime': 2299.8291, 'train_samples_per_second': 37.159, 'train_steps_per_second': 0.096, 'train_loss': 2.903182203119451, 'epoch': 10.0}\n",
+      "100%|█████████████████████████████████████████| 220/220 [38:19<00:00, 10.45s/it]\n",
+      "***** train metrics *****\n",
+      "  epoch                    =       10.0\n",
+      "  train_loss               =     2.9032\n",
+      "  train_runtime            = 0:38:19.82\n",
+      "  train_samples_per_second =     37.159\n",
+      "  train_steps_per_second   =      0.096\n",
+      "--\n",
+      "DONE. If it worked, trained data should be in ./workspace/output/laion-finetuned_v5e7_epoch10_fold3_threshold7\n",
+      "\n",
+      "Finetuning laion/CLIP-ViT-B-32-laion2B-s34B-b79K for fold 3, threshold > 0.97, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_fold3_threshold8.\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "07e4b9e62ab74cd9840b19cc4366f7f5",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "d811a7a0b0034cf6aa10c66eada5f0ba",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "700945164a6c498d92871f4e8e5b0a37",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating train split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "2ba0851b176f44139ca072f594cd26f9",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating validation split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "First image: ../data/a photo of contemporary foyer/IMG-C5471443_13.jpg, caption: 'a photo of contemporary foyer'\n",
+      "04/17/2024 22:30:15 - WARNING - __main__ - Process rank: 0, device: cuda:0, n_gpu: 3distributed training: True, 16-bits training: False\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/datasets/load.py:2072: FutureWarning: 'use_auth_token' was deprecated in favor of 'token' in version 2.14.0 and will be removed in 3.0.0.\n",
+      "You can remove this warning by passing 'token=None' instead.\n",
+      "  warnings.warn(\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/transformers/models/clip/feature_extraction_clip.py:28: FutureWarning: The class CLIPFeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please use CLIPImageProcessor instead.\n",
+      "  warnings.warn(\n",
+      "Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7f539c220dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "04/17/2024 22:30:16 - WARNING - datasets.fingerprint - Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7f539c220dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "Filter: 100%|█████████████████████| 6509/6509 [00:00<00:00, 28510.06 examples/s]\n",
+      "Running tokenizer on train dataset: 100%|█| 6434/6434 [00:00<00:00, 21116.37 exa\n",
+      "  0%|                                                   | 0/160 [00:00<?, ?it/s]/home/haojin/.local/lib/python3.10/site-packages/torch/nn/modules/module.py:1501: UserWarning: operator() profile_node %611 : int = prim::profile_ivalue(%dtype)\n",
+      " does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)\n",
+      "  return forward_call(*args, **kwargs)\n",
+      "/home/haojin/.local/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
+      "  warnings.warn('Was asked to gather along dimension 0, but all '\n",
+      "{'loss': 3.1949, 'learning_rate': 9e-07, 'epoch': 1.0}                          \n",
+      "{'loss': 2.9889, 'learning_rate': 8e-07, 'epoch': 2.0}                          \n",
+      "{'loss': 2.9326, 'learning_rate': 7e-07, 'epoch': 3.0}                          \n",
+      "{'loss': 2.9016, 'learning_rate': 6e-07, 'epoch': 4.0}                          \n",
+      "{'loss': 2.8828, 'learning_rate': 5e-07, 'epoch': 5.0}                          \n",
+      "{'loss': 2.8768, 'learning_rate': 4e-07, 'epoch': 6.0}                          \n",
+      "{'loss': 2.8647, 'learning_rate': 3e-07, 'epoch': 7.0}                          \n",
+      "{'loss': 2.8559, 'learning_rate': 2e-07, 'epoch': 8.0}                          \n",
+      "{'loss': 2.8554, 'learning_rate': 1e-07, 'epoch': 9.0}                          \n",
+      "{'loss': 2.8529, 'learning_rate': 0.0, 'epoch': 10.0}                           \n",
+      "{'train_runtime': 1675.4817, 'train_samples_per_second': 38.401, 'train_steps_per_second': 0.095, 'train_loss': 2.920655083656311, 'epoch': 10.0}\n",
+      "100%|█████████████████████████████████████████| 160/160 [27:55<00:00, 10.47s/it]\n",
+      "***** train metrics *****\n",
+      "  epoch                    =       10.0\n",
+      "  train_loss               =     2.9207\n",
+      "  train_runtime            = 0:27:55.48\n",
+      "  train_samples_per_second =     38.401\n",
+      "  train_steps_per_second   =      0.095\n",
+      "--\n",
+      "DONE. If it worked, trained data should be in ./workspace/output/laion-finetuned_v5e7_epoch10_fold3_threshold8\n",
+      "\n",
+      "Finetuning laion/CLIP-ViT-B-32-laion2B-s34B-b79K for fold 3, threshold > 0.98, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_fold3_threshold9.\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "b00d47a587bb4172ae9c6a136227ebfd",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "961a1153a90149e2834bb6d2736820d4",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "9f9f9c42a67545e4b021a52f6283d97a",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating train split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "52441c3309414a23a3bd49869efa8b42",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating validation split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "First image: ../data/a photo of contemporary foyer/IMG-C5471443_13.jpg, caption: 'a photo of contemporary foyer'\n",
+      "04/17/2024 22:58:18 - WARNING - __main__ - Process rank: 0, device: cuda:0, n_gpu: 3distributed training: True, 16-bits training: False\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/datasets/load.py:2072: FutureWarning: 'use_auth_token' was deprecated in favor of 'token' in version 2.14.0 and will be removed in 3.0.0.\n",
+      "You can remove this warning by passing 'token=None' instead.\n",
+      "  warnings.warn(\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/transformers/models/clip/feature_extraction_clip.py:28: FutureWarning: The class CLIPFeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please use CLIPImageProcessor instead.\n",
+      "  warnings.warn(\n",
+      "Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7f1af9e20dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "04/17/2024 22:58:19 - WARNING - datasets.fingerprint - Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7f1af9e20dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "Filter: 100%|█████████████████████| 4145/4145 [00:00<00:00, 27460.08 examples/s]\n",
+      "Running tokenizer on train dataset: 100%|█| 4104/4104 [00:00<00:00, 21182.78 exa\n",
+      "  0%|                                                   | 0/100 [00:00<?, ?it/s]/home/haojin/.local/lib/python3.10/site-packages/torch/nn/modules/module.py:1501: UserWarning: operator() profile_node %611 : int = prim::profile_ivalue(%dtype)\n",
+      " does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)\n",
+      "  return forward_call(*args, **kwargs)\n",
+      "/home/haojin/.local/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
+      "  warnings.warn('Was asked to gather along dimension 0, but all '\n",
+      "{'loss': 3.2377, 'learning_rate': 9e-07, 'epoch': 1.0}                          \n",
+      "{'loss': 3.03, 'learning_rate': 8e-07, 'epoch': 2.0}                            \n",
+      "{'loss': 2.9677, 'learning_rate': 7e-07, 'epoch': 3.0}                          \n",
+      "{'loss': 2.923, 'learning_rate': 6e-07, 'epoch': 4.0}                           \n",
+      "{'loss': 2.8979, 'learning_rate': 5e-07, 'epoch': 5.0}                          \n",
+      "{'loss': 2.8889, 'learning_rate': 4e-07, 'epoch': 6.0}                          \n",
+      "{'loss': 2.8798, 'learning_rate': 3e-07, 'epoch': 7.0}                          \n",
+      "{'loss': 2.8773, 'learning_rate': 2e-07, 'epoch': 8.0}                          \n",
+      "{'loss': 2.876, 'learning_rate': 1e-07, 'epoch': 9.0}                           \n",
+      "{'loss': 2.8707, 'learning_rate': 0.0, 'epoch': 10.0}                           \n",
+      "{'train_runtime': 1063.4285, 'train_samples_per_second': 38.592, 'train_steps_per_second': 0.094, 'train_loss': 2.9449002075195314, 'epoch': 10.0}\n",
+      "100%|█████████████████████████████████████████| 100/100 [17:43<00:00, 10.63s/it]\n",
+      "***** train metrics *****\n",
+      "  epoch                    =       10.0\n",
+      "  train_loss               =     2.9449\n",
+      "  train_runtime            = 0:17:43.42\n",
+      "  train_samples_per_second =     38.592\n",
+      "  train_steps_per_second   =      0.094\n",
+      "--\n",
+      "DONE. If it worked, trained data should be in ./workspace/output/laion-finetuned_v5e7_epoch10_fold3_threshold9\n",
+      "\n",
+      "Finetuning laion/CLIP-ViT-B-32-laion2B-s34B-b79K for fold 3, threshold > 0.99, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_fold3_threshold10.\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "39d2705fe798440b8a5ef596a2fa15b2",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "c0581e81110044adbc4bbacda74ddf99",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "46557c811d884ad6b2da077269e2ff7d",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating train split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "1c36315e496e4766a9f2b2d3bcaf88de",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating validation split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "First image: ../data/a photo of contemporary foyer/IMG-C5471443_13.jpg, caption: 'a photo of contemporary foyer'\n",
+      "04/17/2024 23:16:08 - WARNING - __main__ - Process rank: 0, device: cuda:0, n_gpu: 3distributed training: True, 16-bits training: False\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/datasets/load.py:2072: FutureWarning: 'use_auth_token' was deprecated in favor of 'token' in version 2.14.0 and will be removed in 3.0.0.\n",
+      "You can remove this warning by passing 'token=None' instead.\n",
+      "  warnings.warn(\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/transformers/models/clip/feature_extraction_clip.py:28: FutureWarning: The class CLIPFeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please use CLIPImageProcessor instead.\n",
+      "  warnings.warn(\n",
+      "Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7fe0d14bcdc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "04/17/2024 23:16:10 - WARNING - datasets.fingerprint - Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7fe0d14bcdc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "Filter: 100%|█████████████████████| 1384/1384 [00:00<00:00, 25025.29 examples/s]\n",
+      "Running tokenizer on train dataset: 100%|█| 1369/1369 [00:00<00:00, 20654.61 exa\n",
+      "  0%|                                                    | 0/30 [00:00<?, ?it/s]/home/haojin/.local/lib/python3.10/site-packages/torch/nn/modules/module.py:1501: UserWarning: operator() profile_node %611 : int = prim::profile_ivalue(%dtype)\n",
+      " does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)\n",
+      "  return forward_call(*args, **kwargs)\n",
+      "/home/haojin/.local/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
+      "  warnings.warn('Was asked to gather along dimension 0, but all '\n",
+      "{'loss': 3.3216, 'learning_rate': 9e-07, 'epoch': 1.0}                          \n",
+      "{'loss': 3.164, 'learning_rate': 8e-07, 'epoch': 2.0}                           \n",
+      "{'loss': 3.0684, 'learning_rate': 7e-07, 'epoch': 3.0}                          \n",
+      "{'loss': 3.0369, 'learning_rate': 6e-07, 'epoch': 4.0}                          \n",
+      "{'loss': 2.9836, 'learning_rate': 5e-07, 'epoch': 5.0}                          \n",
+      "{'loss': 2.975, 'learning_rate': 4e-07, 'epoch': 6.0}                           \n",
+      "{'loss': 2.9718, 'learning_rate': 3e-07, 'epoch': 7.0}                          \n",
+      "{'loss': 2.9354, 'learning_rate': 2e-07, 'epoch': 8.0}                          \n",
+      "{'loss': 2.9501, 'learning_rate': 1e-07, 'epoch': 9.0}                          \n",
+      "{'loss': 2.9226, 'learning_rate': 0.0, 'epoch': 10.0}                           \n",
+      "{'train_runtime': 321.4382, 'train_samples_per_second': 42.59, 'train_steps_per_second': 0.093, 'train_loss': 3.0329485575358075, 'epoch': 10.0}\n",
+      "100%|███████████████████████████████████████████| 30/30 [05:21<00:00, 10.71s/it]\n",
+      "***** train metrics *****\n",
+      "  epoch                    =       10.0\n",
+      "  train_loss               =     3.0329\n",
+      "  train_runtime            = 0:05:21.43\n",
+      "  train_samples_per_second =      42.59\n",
+      "  train_steps_per_second   =      0.093\n",
+      "--\n",
+      "DONE. If it worked, trained data should be in ./workspace/output/laion-finetuned_v5e7_epoch10_fold3_threshold10\n",
+      "\n",
+      "Finetuning laion/CLIP-ViT-B-32-laion2B-s34B-b79K for fold 4, threshold > 0.90, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_fold4_threshold1.\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "6f4a151362d146e08df5fd3919da8118",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "0f8f82805ae34476ab8cf1a4603f72c0",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "fc82a54e23ee4ffa8fb0a4978678e437",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating train split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "6c531147128d485c93e695ffc4d1d689",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating validation split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "First image: ../data/a photo of standard bathroom/IMG-C5471456_22.jpg, caption: 'a photo of standard bathroom'\n",
+      "04/17/2024 23:21:37 - WARNING - __main__ - Process rank: 0, device: cuda:0, n_gpu: 3distributed training: True, 16-bits training: False\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/datasets/load.py:2072: FutureWarning: 'use_auth_token' was deprecated in favor of 'token' in version 2.14.0 and will be removed in 3.0.0.\n",
+      "You can remove this warning by passing 'token=None' instead.\n",
+      "  warnings.warn(\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/transformers/models/clip/feature_extraction_clip.py:28: FutureWarning: The class CLIPFeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please use CLIPImageProcessor instead.\n",
+      "  warnings.warn(\n",
+      "Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7fb4d2cf8dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "04/17/2024 23:21:38 - WARNING - datasets.fingerprint - Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7fb4d2cf8dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "Filter: 100%|███████████████████| 18411/18411 [00:00<00:00, 28710.80 examples/s]\n",
+      "Running tokenizer on train dataset: 100%|█| 18191/18191 [00:00<00:00, 21283.84 e\n",
+      "  0%|                                                   | 0/470 [00:00<?, ?it/s]/home/haojin/.local/lib/python3.10/site-packages/torch/nn/modules/module.py:1501: UserWarning: operator() profile_node %611 : int = prim::profile_ivalue(%dtype)\n",
+      " does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)\n",
+      "  return forward_call(*args, **kwargs)\n",
+      "/home/haojin/.local/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
+      "  warnings.warn('Was asked to gather along dimension 0, but all '\n",
+      "{'loss': 3.1142, 'learning_rate': 9e-07, 'epoch': 1.0}                          \n",
+      "{'loss': 2.9305, 'learning_rate': 8e-07, 'epoch': 2.0}                          \n",
+      "{'loss': 2.8865, 'learning_rate': 7e-07, 'epoch': 3.0}                          \n",
+      "{'loss': 2.8604, 'learning_rate': 6e-07, 'epoch': 4.0}                          \n",
+      "{'loss': 2.8448, 'learning_rate': 5e-07, 'epoch': 5.0}                          \n",
+      "{'loss': 2.834, 'learning_rate': 4e-07, 'epoch': 6.0}                           \n",
+      "{'loss': 2.8264, 'learning_rate': 3e-07, 'epoch': 7.0}                          \n",
+      "{'loss': 2.8198, 'learning_rate': 2e-07, 'epoch': 8.0}                          \n",
+      "{'loss': 2.815, 'learning_rate': 1e-07, 'epoch': 9.0}                           \n",
+      "{'loss': 2.8151, 'learning_rate': 0.0, 'epoch': 10.0}                           \n",
+      "{'train_runtime': 4925.7397, 'train_samples_per_second': 36.93, 'train_steps_per_second': 0.095, 'train_loss': 2.874660280917553, 'epoch': 10.0}\n",
+      "100%|███████████████████████████████████████| 470/470 [1:22:05<00:00, 10.48s/it]\n",
+      "***** train metrics *****\n",
+      "  epoch                    =       10.0\n",
+      "  train_loss               =     2.8747\n",
+      "  train_runtime            = 1:22:05.73\n",
+      "  train_samples_per_second =      36.93\n",
+      "  train_steps_per_second   =      0.095\n",
+      "--\n",
+      "DONE. If it worked, trained data should be in ./workspace/output/laion-finetuned_v5e7_epoch10_fold4_threshold1\n",
+      "\n",
+      "Finetuning laion/CLIP-ViT-B-32-laion2B-s34B-b79K for fold 4, threshold > 0.91, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_fold4_threshold2.\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "f92e4eb759394314bd8620b78191f7cf",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "da7932cfabeb4ea2ba79e2a375ab8297",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "676a8276b7e04acf8cd473b786ab4abe",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating train split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "5f17f58759234877af8bff871542261d",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating validation split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "First image: ../data/a photo of standard bathroom/IMG-C5471456_22.jpg, caption: 'a photo of standard bathroom'\n",
+      "04/18/2024 00:43:51 - WARNING - __main__ - Process rank: 0, device: cuda:0, n_gpu: 3distributed training: True, 16-bits training: False\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/datasets/load.py:2072: FutureWarning: 'use_auth_token' was deprecated in favor of 'token' in version 2.14.0 and will be removed in 3.0.0.\n",
+      "You can remove this warning by passing 'token=None' instead.\n",
+      "  warnings.warn(\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/transformers/models/clip/feature_extraction_clip.py:28: FutureWarning: The class CLIPFeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please use CLIPImageProcessor instead.\n",
+      "  warnings.warn(\n",
+      "Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7faac3c70dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "04/18/2024 00:43:53 - WARNING - datasets.fingerprint - Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7faac3c70dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "Filter: 100%|███████████████████| 16919/16919 [00:00<00:00, 29294.57 examples/s]\n",
+      "Running tokenizer on train dataset: 100%|█| 16720/16720 [00:00<00:00, 21354.41 e\n",
+      "  0%|                                                   | 0/430 [00:00<?, ?it/s]/home/haojin/.local/lib/python3.10/site-packages/torch/nn/modules/module.py:1501: UserWarning: operator() profile_node %611 : int = prim::profile_ivalue(%dtype)\n",
+      " does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)\n",
+      "  return forward_call(*args, **kwargs)\n",
+      "/home/haojin/.local/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
+      "  warnings.warn('Was asked to gather along dimension 0, but all '\n",
+      "{'loss': 3.1168, 'learning_rate': 9e-07, 'epoch': 1.0}                          \n",
+      "{'loss': 2.935, 'learning_rate': 8e-07, 'epoch': 2.0}                           \n",
+      "{'loss': 2.8881, 'learning_rate': 7e-07, 'epoch': 3.0}                          \n",
+      "{'loss': 2.8599, 'learning_rate': 6e-07, 'epoch': 4.0}                          \n",
+      "{'loss': 2.8448, 'learning_rate': 5e-07, 'epoch': 5.0}                          \n",
+      "{'loss': 2.8351, 'learning_rate': 4e-07, 'epoch': 6.0}                          \n",
+      "{'loss': 2.825, 'learning_rate': 3e-07, 'epoch': 7.0}                           \n",
+      "{'loss': 2.8212, 'learning_rate': 2e-07, 'epoch': 8.0}                          \n",
+      "{'loss': 2.8166, 'learning_rate': 1e-07, 'epoch': 9.0}                          \n",
+      "{'loss': 2.8178, 'learning_rate': 0.0, 'epoch': 10.0}                           \n",
+      "{'train_runtime': 4503.5748, 'train_samples_per_second': 37.126, 'train_steps_per_second': 0.095, 'train_loss': 2.8760146739871004, 'epoch': 10.0}\n",
+      "100%|███████████████████████████████████████| 430/430 [1:15:03<00:00, 10.47s/it]\n",
+      "***** train metrics *****\n",
+      "  epoch                    =       10.0\n",
+      "  train_loss               =      2.876\n",
+      "  train_runtime            = 1:15:03.57\n",
+      "  train_samples_per_second =     37.126\n",
+      "  train_steps_per_second   =      0.095\n",
+      "--\n",
+      "DONE. If it worked, trained data should be in ./workspace/output/laion-finetuned_v5e7_epoch10_fold4_threshold2\n",
+      "\n",
+      "Finetuning laion/CLIP-ViT-B-32-laion2B-s34B-b79K for fold 4, threshold > 0.92, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_fold4_threshold3.\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "a7cd324457d4430da0fa7caf432058ff",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "8f4842e7ad67418992d44a3deda8702c",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "e4908df0f4de400282094d5d20433679",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating train split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "cf0dcea8092d42fc94f46a2640c4f511",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating validation split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "First image: ../data/a photo of standard bathroom/IMG-C5471456_22.jpg, caption: 'a photo of standard bathroom'\n",
+      "04/18/2024 01:59:03 - WARNING - __main__ - Process rank: 0, device: cuda:0, n_gpu: 3distributed training: True, 16-bits training: False\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/datasets/load.py:2072: FutureWarning: 'use_auth_token' was deprecated in favor of 'token' in version 2.14.0 and will be removed in 3.0.0.\n",
+      "You can remove this warning by passing 'token=None' instead.\n",
+      "  warnings.warn(\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/transformers/models/clip/feature_extraction_clip.py:28: FutureWarning: The class CLIPFeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please use CLIPImageProcessor instead.\n",
+      "  warnings.warn(\n",
+      "Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7f9801398dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "04/18/2024 01:59:04 - WARNING - datasets.fingerprint - Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7f9801398dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "Filter: 100%|███████████████████| 15445/15445 [00:00<00:00, 29273.29 examples/s]\n",
+      "Running tokenizer on train dataset: 100%|█| 15266/15266 [00:00<00:00, 21215.94 e\n",
+      "  0%|                                                   | 0/390 [00:00<?, ?it/s]/home/haojin/.local/lib/python3.10/site-packages/torch/nn/modules/module.py:1501: UserWarning: operator() profile_node %611 : int = prim::profile_ivalue(%dtype)\n",
+      " does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)\n",
+      "  return forward_call(*args, **kwargs)\n",
+      "/home/haojin/.local/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
+      "  warnings.warn('Was asked to gather along dimension 0, but all '\n",
+      "{'loss': 3.1221, 'learning_rate': 9e-07, 'epoch': 1.0}                          \n",
+      "{'loss': 2.932, 'learning_rate': 8e-07, 'epoch': 2.0}                           \n",
+      "{'loss': 2.8869, 'learning_rate': 7e-07, 'epoch': 3.0}                          \n",
+      "{'loss': 2.8632, 'learning_rate': 6e-07, 'epoch': 4.0}                          \n",
+      "{'loss': 2.8437, 'learning_rate': 5e-07, 'epoch': 5.0}                          \n",
+      "{'loss': 2.8344, 'learning_rate': 4e-07, 'epoch': 6.0}                          \n",
+      "{'loss': 2.8296, 'learning_rate': 3e-07, 'epoch': 7.0}                          \n",
+      "{'loss': 2.8189, 'learning_rate': 2e-07, 'epoch': 8.0}                          \n",
+      "{'loss': 2.82, 'learning_rate': 1e-07, 'epoch': 9.0}                            \n",
+      "{'loss': 2.8161, 'learning_rate': 0.0, 'epoch': 10.0}                           \n",
+      "{'train_runtime': 4082.98, 'train_samples_per_second': 37.389, 'train_steps_per_second': 0.096, 'train_loss': 2.876693099584335, 'epoch': 10.0}\n",
+      "100%|███████████████████████████████████████| 390/390 [1:08:02<00:00, 10.47s/it]\n",
+      "***** train metrics *****\n",
+      "  epoch                    =       10.0\n",
+      "  train_loss               =     2.8767\n",
+      "  train_runtime            = 1:08:02.98\n",
+      "  train_samples_per_second =     37.389\n",
+      "  train_steps_per_second   =      0.096\n",
+      "--\n",
+      "DONE. If it worked, trained data should be in ./workspace/output/laion-finetuned_v5e7_epoch10_fold4_threshold3\n",
+      "\n",
+      "Finetuning laion/CLIP-ViT-B-32-laion2B-s34B-b79K for fold 4, threshold > 0.93, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_fold4_threshold4.\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "c2ca7462c8b84596af5273905bf5ec4a",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "7e98e073a5684183888edcad9f5683b2",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "ff3e5133f6ec43daba5ab39e5d8e7f45",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating train split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "17ac32120fec434789941bb57668338f",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating validation split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "First image: ../data/a photo of standard bathroom/IMG-C5471456_22.jpg, caption: 'a photo of standard bathroom'\n",
+      "04/18/2024 03:07:14 - WARNING - __main__ - Process rank: 0, device: cuda:0, n_gpu: 3distributed training: True, 16-bits training: False\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/datasets/load.py:2072: FutureWarning: 'use_auth_token' was deprecated in favor of 'token' in version 2.14.0 and will be removed in 3.0.0.\n",
+      "You can remove this warning by passing 'token=None' instead.\n",
+      "  warnings.warn(\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/transformers/models/clip/feature_extraction_clip.py:28: FutureWarning: The class CLIPFeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please use CLIPImageProcessor instead.\n",
+      "  warnings.warn(\n",
+      "Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7fcd6064cee0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "04/18/2024 03:07:15 - WARNING - datasets.fingerprint - Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7fcd6064cee0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "Filter: 100%|███████████████████| 13827/13827 [00:00<00:00, 29146.08 examples/s]\n",
+      "Running tokenizer on train dataset: 100%|█| 13667/13667 [00:00<00:00, 21224.77 e\n",
+      "  0%|                                                   | 0/350 [00:00<?, ?it/s]/home/haojin/.local/lib/python3.10/site-packages/torch/nn/modules/module.py:1501: UserWarning: operator() profile_node %611 : int = prim::profile_ivalue(%dtype)\n",
+      " does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)\n",
+      "  return forward_call(*args, **kwargs)\n",
+      "/home/haojin/.local/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
+      "  warnings.warn('Was asked to gather along dimension 0, but all '\n",
+      "{'loss': 3.1325, 'learning_rate': 9e-07, 'epoch': 1.0}                          \n",
+      "{'loss': 2.9345, 'learning_rate': 8e-07, 'epoch': 2.0}                          \n",
+      "{'loss': 2.8899, 'learning_rate': 7e-07, 'epoch': 3.0}                          \n",
+      "{'loss': 2.8631, 'learning_rate': 6e-07, 'epoch': 4.0}                          \n",
+      "{'loss': 2.8451, 'learning_rate': 5e-07, 'epoch': 5.0}                          \n",
+      "{'loss': 2.8378, 'learning_rate': 4e-07, 'epoch': 6.0}                          \n",
+      "{'loss': 2.8306, 'learning_rate': 3e-07, 'epoch': 7.0}                          \n",
+      "{'loss': 2.8225, 'learning_rate': 2e-07, 'epoch': 8.0}                          \n",
+      "{'loss': 2.8218, 'learning_rate': 1e-07, 'epoch': 9.0}                          \n",
+      "{'loss': 2.8222, 'learning_rate': 0.0, 'epoch': 10.0}                           \n",
+      "{'train_runtime': 3697.4589, 'train_samples_per_second': 36.963, 'train_steps_per_second': 0.095, 'train_loss': 2.879986855643136, 'epoch': 10.0}\n",
+      "100%|███████████████████████████████████████| 350/350 [1:01:37<00:00, 10.56s/it]\n",
+      "***** train metrics *****\n",
+      "  epoch                    =       10.0\n",
+      "  train_loss               =       2.88\n",
+      "  train_runtime            = 1:01:37.45\n",
+      "  train_samples_per_second =     36.963\n",
+      "  train_steps_per_second   =      0.095\n",
+      "--\n",
+      "DONE. If it worked, trained data should be in ./workspace/output/laion-finetuned_v5e7_epoch10_fold4_threshold4\n",
+      "\n",
+      "Finetuning laion/CLIP-ViT-B-32-laion2B-s34B-b79K for fold 4, threshold > 0.94, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_fold4_threshold5.\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "7b03dabaa81e4ca89266b36908751a4c",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "675fcab7d3c74f2aa694af3ef7e8dc1d",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "8ba2c940e2ca45b880842aba57ce459c",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating train split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "fb8468eb5fbd4bb490b54d6a0a49e63d",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating validation split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "First image: ../data/a photo of standard bathroom/IMG-C5471456_22.jpg, caption: 'a photo of standard bathroom'\n",
+      "04/18/2024 04:08:59 - WARNING - __main__ - Process rank: 0, device: cuda:0, n_gpu: 3distributed training: True, 16-bits training: False\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/datasets/load.py:2072: FutureWarning: 'use_auth_token' was deprecated in favor of 'token' in version 2.14.0 and will be removed in 3.0.0.\n",
+      "You can remove this warning by passing 'token=None' instead.\n",
+      "  warnings.warn(\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/transformers/models/clip/feature_extraction_clip.py:28: FutureWarning: The class CLIPFeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please use CLIPImageProcessor instead.\n",
+      "  warnings.warn(\n",
+      "Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7f5372268dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "04/18/2024 04:09:02 - WARNING - datasets.fingerprint - Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7f5372268dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "Filter: 100%|███████████████████| 12231/12231 [00:00<00:00, 28084.34 examples/s]\n",
+      "Running tokenizer on train dataset: 100%|█| 12086/12086 [00:00<00:00, 20686.81 e\n",
+      "  0%|                                                   | 0/310 [00:00<?, ?it/s]/home/haojin/.local/lib/python3.10/site-packages/torch/nn/modules/module.py:1501: UserWarning: operator() profile_node %611 : int = prim::profile_ivalue(%dtype)\n",
+      " does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)\n",
+      "  return forward_call(*args, **kwargs)\n",
+      "/home/haojin/.local/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
+      "  warnings.warn('Was asked to gather along dimension 0, but all '\n",
+      "{'loss': 3.1374, 'learning_rate': 9e-07, 'epoch': 1.0}                          \n",
+      "{'loss': 2.9417, 'learning_rate': 8e-07, 'epoch': 2.0}                          \n",
+      "{'loss': 2.895, 'learning_rate': 7e-07, 'epoch': 3.0}                           \n",
+      "{'loss': 2.8706, 'learning_rate': 6e-07, 'epoch': 4.0}                          \n",
+      "{'loss': 2.8572, 'learning_rate': 5e-07, 'epoch': 5.0}                          \n",
+      "{'loss': 2.8445, 'learning_rate': 4e-07, 'epoch': 6.0}                          \n",
+      "{'loss': 2.8376, 'learning_rate': 3e-07, 'epoch': 7.0}                          \n",
+      "{'loss': 2.832, 'learning_rate': 2e-07, 'epoch': 8.0}                           \n",
+      "{'loss': 2.8282, 'learning_rate': 1e-07, 'epoch': 9.0}                          \n",
+      "{'loss': 2.8265, 'learning_rate': 0.0, 'epoch': 10.0}                           \n",
+      "{'train_runtime': 3282.538, 'train_samples_per_second': 36.819, 'train_steps_per_second': 0.094, 'train_loss': 2.8870725570186493, 'epoch': 10.0}\n",
+      "100%|█████████████████████████████████████████| 310/310 [54:42<00:00, 10.59s/it]\n",
+      "***** train metrics *****\n",
+      "  epoch                    =       10.0\n",
+      "  train_loss               =     2.8871\n",
+      "  train_runtime            = 0:54:42.53\n",
+      "  train_samples_per_second =     36.819\n",
+      "  train_steps_per_second   =      0.094\n",
+      "--\n",
+      "DONE. If it worked, trained data should be in ./workspace/output/laion-finetuned_v5e7_epoch10_fold4_threshold5\n",
+      "\n",
+      "Finetuning laion/CLIP-ViT-B-32-laion2B-s34B-b79K for fold 4, threshold > 0.95, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_fold4_threshold6.\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "8f100fadedbf414e90eefbe02355fa8a",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "2f839600ad9c43fdbe53f4a0d594229d",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "16d116d33a784b38a028e490992686b8",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating train split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "6d911fbc5b844906815109e9049678f3",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating validation split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "First image: ../data/a photo of standard bathroom/IMG-C5471456_22.jpg, caption: 'a photo of standard bathroom'\n",
+      "04/18/2024 05:03:52 - WARNING - __main__ - Process rank: 0, device: cuda:0, n_gpu: 3distributed training: True, 16-bits training: False\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/datasets/load.py:2072: FutureWarning: 'use_auth_token' was deprecated in favor of 'token' in version 2.14.0 and will be removed in 3.0.0.\n",
+      "You can remove this warning by passing 'token=None' instead.\n",
+      "  warnings.warn(\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/transformers/models/clip/feature_extraction_clip.py:28: FutureWarning: The class CLIPFeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please use CLIPImageProcessor instead.\n",
+      "  warnings.warn(\n",
+      "Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7f1fb9b9cdc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "04/18/2024 05:03:53 - WARNING - datasets.fingerprint - Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7f1fb9b9cdc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "Filter: 100%|███████████████████| 10517/10517 [00:00<00:00, 27957.56 examples/s]\n",
+      "Running tokenizer on train dataset: 100%|█| 10391/10391 [00:00<00:00, 20666.83 e\n",
+      "  0%|                                                   | 0/270 [00:00<?, ?it/s]/home/haojin/.local/lib/python3.10/site-packages/torch/nn/modules/module.py:1501: UserWarning: operator() profile_node %611 : int = prim::profile_ivalue(%dtype)\n",
+      " does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)\n",
+      "  return forward_call(*args, **kwargs)\n",
+      "/home/haojin/.local/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
+      "  warnings.warn('Was asked to gather along dimension 0, but all '\n",
+      "{'loss': 3.1452, 'learning_rate': 9e-07, 'epoch': 1.0}                          \n",
+      "{'loss': 2.9516, 'learning_rate': 8e-07, 'epoch': 2.0}                          \n",
+      "{'loss': 2.9043, 'learning_rate': 7e-07, 'epoch': 3.0}                          \n",
+      "{'loss': 2.8771, 'learning_rate': 6e-07, 'epoch': 4.0}                          \n",
+      "{'loss': 2.858, 'learning_rate': 5e-07, 'epoch': 5.0}                           \n",
+      "{'loss': 2.8475, 'learning_rate': 4e-07, 'epoch': 6.0}                          \n",
+      "{'loss': 2.8414, 'learning_rate': 3e-07, 'epoch': 7.0}                          \n",
+      "{'loss': 2.8391, 'learning_rate': 2e-07, 'epoch': 8.0}                          \n",
+      "{'loss': 2.8288, 'learning_rate': 1e-07, 'epoch': 9.0}                          \n",
+      "{'loss': 2.8312, 'learning_rate': 0.0, 'epoch': 10.0}                           \n",
+      "{'train_runtime': 2859.8695, 'train_samples_per_second': 36.334, 'train_steps_per_second': 0.094, 'train_loss': 2.8924275433575666, 'epoch': 10.0}\n",
+      "100%|█████████████████████████████████████████| 270/270 [47:39<00:00, 10.59s/it]\n",
+      "***** train metrics *****\n",
+      "  epoch                    =       10.0\n",
+      "  train_loss               =     2.8924\n",
+      "  train_runtime            = 0:47:39.86\n",
+      "  train_samples_per_second =     36.334\n",
+      "  train_steps_per_second   =      0.094\n",
+      "--\n",
+      "DONE. If it worked, trained data should be in ./workspace/output/laion-finetuned_v5e7_epoch10_fold4_threshold6\n",
+      "\n",
+      "Finetuning laion/CLIP-ViT-B-32-laion2B-s34B-b79K for fold 4, threshold > 0.96, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_fold4_threshold7.\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "d1f172fb13304819985fc1fbf1d89878",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "ca9510287ed94e9a9906545b8c2259ad",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "8df2433f0d5c4746a23f26ac9eb0cdaa",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating train split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "dc6a8fda341c42e698e357431d9bbda0",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating validation split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "First image: ../data/a photo of standard bathroom/IMG-C5471456_22.jpg, caption: 'a photo of standard bathroom'\n",
+      "04/18/2024 05:51:40 - WARNING - __main__ - Process rank: 0, device: cuda:0, n_gpu: 3distributed training: True, 16-bits training: False\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/datasets/load.py:2072: FutureWarning: 'use_auth_token' was deprecated in favor of 'token' in version 2.14.0 and will be removed in 3.0.0.\n",
+      "You can remove this warning by passing 'token=None' instead.\n",
+      "  warnings.warn(\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/transformers/models/clip/feature_extraction_clip.py:28: FutureWarning: The class CLIPFeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please use CLIPImageProcessor instead.\n",
+      "  warnings.warn(\n",
+      "Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7f9227460dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "04/18/2024 05:51:41 - WARNING - datasets.fingerprint - Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7f9227460dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "Filter: 100%|█████████████████████| 8637/8637 [00:00<00:00, 28058.27 examples/s]\n",
+      "Running tokenizer on train dataset: 100%|█| 8538/8538 [00:00<00:00, 20734.63 exa\n",
+      "  0%|                                                   | 0/220 [00:00<?, ?it/s]/home/haojin/.local/lib/python3.10/site-packages/torch/nn/modules/module.py:1501: UserWarning: operator() profile_node %611 : int = prim::profile_ivalue(%dtype)\n",
+      " does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)\n",
+      "  return forward_call(*args, **kwargs)\n",
+      "/home/haojin/.local/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
+      "  warnings.warn('Was asked to gather along dimension 0, but all '\n",
+      "{'loss': 3.1632, 'learning_rate': 9e-07, 'epoch': 1.0}                          \n",
+      "{'loss': 2.9621, 'learning_rate': 8e-07, 'epoch': 2.0}                          \n",
+      "{'loss': 2.9084, 'learning_rate': 7e-07, 'epoch': 3.0}                          \n",
+      "{'loss': 2.8802, 'learning_rate': 6e-07, 'epoch': 4.0}                          \n",
+      "{'loss': 2.8643, 'learning_rate': 5e-07, 'epoch': 5.0}                          \n",
+      "{'loss': 2.8537, 'learning_rate': 4e-07, 'epoch': 6.0}                          \n",
+      "{'loss': 2.8499, 'learning_rate': 3e-07, 'epoch': 7.0}                          \n",
+      "{'loss': 2.84, 'learning_rate': 2e-07, 'epoch': 8.0}                            \n",
+      "{'loss': 2.8424, 'learning_rate': 1e-07, 'epoch': 9.0}                          \n",
+      "{'loss': 2.8346, 'learning_rate': 0.0, 'epoch': 10.0}                           \n",
+      "{'train_runtime': 2343.3897, 'train_samples_per_second': 36.434, 'train_steps_per_second': 0.094, 'train_loss': 2.8998760743574663, 'epoch': 10.0}\n",
+      "100%|█████████████████████████████████████████| 220/220 [39:03<00:00, 10.65s/it]\n",
+      "***** train metrics *****\n",
+      "  epoch                    =       10.0\n",
+      "  train_loss               =     2.8999\n",
+      "  train_runtime            = 0:39:03.38\n",
+      "  train_samples_per_second =     36.434\n",
+      "  train_steps_per_second   =      0.094\n",
+      "--\n",
+      "DONE. If it worked, trained data should be in ./workspace/output/laion-finetuned_v5e7_epoch10_fold4_threshold7\n",
+      "\n",
+      "Finetuning laion/CLIP-ViT-B-32-laion2B-s34B-b79K for fold 4, threshold > 0.97, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_fold4_threshold8.\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "f2b6cef4dcc84103ba104d3af2cb5c0b",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "750fbabe7c3e4385ba615e62c0e55445",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "ab03d7b5e32f4b43852d62d8b7fe0f07",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating train split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "bad6c54488634ff5971a16747f1b54d7",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating validation split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "First image: ../data/a photo of standard bathroom/IMG-C5471456_22.jpg, caption: 'a photo of standard bathroom'\n",
+      "04/18/2024 06:30:51 - WARNING - __main__ - Process rank: 0, device: cuda:0, n_gpu: 3distributed training: True, 16-bits training: False\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/datasets/load.py:2072: FutureWarning: 'use_auth_token' was deprecated in favor of 'token' in version 2.14.0 and will be removed in 3.0.0.\n",
+      "You can remove this warning by passing 'token=None' instead.\n",
+      "  warnings.warn(\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/transformers/models/clip/feature_extraction_clip.py:28: FutureWarning: The class CLIPFeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please use CLIPImageProcessor instead.\n",
+      "  warnings.warn(\n",
+      "Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7f2a408b8dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "04/18/2024 06:30:52 - WARNING - datasets.fingerprint - Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7f2a408b8dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "Filter: 100%|█████████████████████| 6555/6555 [00:00<00:00, 28287.52 examples/s]\n",
+      "Running tokenizer on train dataset: 100%|█| 6477/6477 [00:00<00:00, 20603.65 exa\n",
+      "  0%|                                                   | 0/160 [00:00<?, ?it/s]/home/haojin/.local/lib/python3.10/site-packages/torch/nn/modules/module.py:1501: UserWarning: operator() profile_node %611 : int = prim::profile_ivalue(%dtype)\n",
+      " does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)\n",
+      "  return forward_call(*args, **kwargs)\n",
+      "/home/haojin/.local/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
+      "  warnings.warn('Was asked to gather along dimension 0, but all '\n",
+      "{'loss': 3.1816, 'learning_rate': 9e-07, 'epoch': 1.0}                          \n",
+      "{'loss': 2.9807, 'learning_rate': 8e-07, 'epoch': 2.0}                          \n",
+      "{'loss': 2.9273, 'learning_rate': 7e-07, 'epoch': 3.0}                          \n",
+      "{'loss': 2.9004, 'learning_rate': 6e-07, 'epoch': 4.0}                          \n",
+      "{'loss': 2.8779, 'learning_rate': 5e-07, 'epoch': 5.0}                          \n",
+      "{'loss': 2.8659, 'learning_rate': 4e-07, 'epoch': 6.0}                          \n",
+      "{'loss': 2.8583, 'learning_rate': 3e-07, 'epoch': 7.0}                          \n",
+      "{'loss': 2.8563, 'learning_rate': 2e-07, 'epoch': 8.0}                          \n",
+      "{'loss': 2.85, 'learning_rate': 1e-07, 'epoch': 9.0}                            \n",
+      "{'loss': 2.8515, 'learning_rate': 0.0, 'epoch': 10.0}                           \n",
+      "{'train_runtime': 1705.9834, 'train_samples_per_second': 37.966, 'train_steps_per_second': 0.094, 'train_loss': 2.914992904663086, 'epoch': 10.0}\n",
+      "100%|█████████████████████████████████████████| 160/160 [28:25<00:00, 10.66s/it]\n",
+      "***** train metrics *****\n",
+      "  epoch                    =       10.0\n",
+      "  train_loss               =      2.915\n",
+      "  train_runtime            = 0:28:25.98\n",
+      "  train_samples_per_second =     37.966\n",
+      "  train_steps_per_second   =      0.094\n",
+      "--\n",
+      "DONE. If it worked, trained data should be in ./workspace/output/laion-finetuned_v5e7_epoch10_fold4_threshold8\n",
+      "\n",
+      "Finetuning laion/CLIP-ViT-B-32-laion2B-s34B-b79K for fold 4, threshold > 0.98, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_fold4_threshold9.\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "2fbcb2ea946c4d1999c8864a215233de",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "9e7966fda8ef413397f4306eb59d3ac0",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "506515c9aaa3419b9a4dc98499678dbb",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating train split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "b7263e120bc74f418c2b04c2cdf2142e",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating validation split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "First image: ../data/a photo of standard bathroom/IMG-C5471456_22.jpg, caption: 'a photo of standard bathroom'\n",
+      "04/18/2024 06:59:24 - WARNING - __main__ - Process rank: 0, device: cuda:0, n_gpu: 3distributed training: True, 16-bits training: False\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/datasets/load.py:2072: FutureWarning: 'use_auth_token' was deprecated in favor of 'token' in version 2.14.0 and will be removed in 3.0.0.\n",
+      "You can remove this warning by passing 'token=None' instead.\n",
+      "  warnings.warn(\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/transformers/models/clip/feature_extraction_clip.py:28: FutureWarning: The class CLIPFeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please use CLIPImageProcessor instead.\n",
+      "  warnings.warn(\n",
+      "Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7f1ebcbacdc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "04/18/2024 06:59:25 - WARNING - datasets.fingerprint - Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7f1ebcbacdc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "Filter: 100%|█████████████████████| 4218/4218 [00:00<00:00, 27364.73 examples/s]\n",
+      "Running tokenizer on train dataset: 100%|█| 4179/4179 [00:00<00:00, 20736.71 exa\n",
+      "  0%|                                                   | 0/100 [00:00<?, ?it/s]/home/haojin/.local/lib/python3.10/site-packages/torch/nn/modules/module.py:1501: UserWarning: operator() profile_node %611 : int = prim::profile_ivalue(%dtype)\n",
+      " does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)\n",
+      "  return forward_call(*args, **kwargs)\n",
+      "/home/haojin/.local/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
+      "  warnings.warn('Was asked to gather along dimension 0, but all '\n",
+      "{'loss': 3.2364, 'learning_rate': 9e-07, 'epoch': 1.0}                          \n",
+      "{'loss': 3.0313, 'learning_rate': 8e-07, 'epoch': 2.0}                          \n",
+      "{'loss': 2.9637, 'learning_rate': 7e-07, 'epoch': 3.0}                          \n",
+      "{'loss': 2.9227, 'learning_rate': 6e-07, 'epoch': 4.0}                          \n",
+      "{'loss': 2.8959, 'learning_rate': 5e-07, 'epoch': 5.0}                          \n",
+      "{'loss': 2.8852, 'learning_rate': 4e-07, 'epoch': 6.0}                          \n",
+      "{'loss': 2.8723, 'learning_rate': 3e-07, 'epoch': 7.0}                          \n",
+      "{'loss': 2.8654, 'learning_rate': 2e-07, 'epoch': 8.0}                          \n",
+      "{'loss': 2.866, 'learning_rate': 1e-07, 'epoch': 9.0}                           \n",
+      "{'loss': 2.8638, 'learning_rate': 0.0, 'epoch': 10.0}                           \n",
+      "{'train_runtime': 1074.2424, 'train_samples_per_second': 38.902, 'train_steps_per_second': 0.093, 'train_loss': 2.9402659225463865, 'epoch': 10.0}\n",
+      "100%|█████████████████████████████████████████| 100/100 [17:54<00:00, 10.74s/it]\n",
+      "***** train metrics *****\n",
+      "  epoch                    =       10.0\n",
+      "  train_loss               =     2.9403\n",
+      "  train_runtime            = 0:17:54.24\n",
+      "  train_samples_per_second =     38.902\n",
+      "  train_steps_per_second   =      0.093\n",
+      "--\n",
+      "DONE. If it worked, trained data should be in ./workspace/output/laion-finetuned_v5e7_epoch10_fold4_threshold9\n",
+      "\n",
+      "Finetuning laion/CLIP-ViT-B-32-laion2B-s34B-b79K for fold 4, threshold > 0.99, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_fold4_threshold10.\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "65b69f5895424364916dc0b17d34bebf",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "f388313b272f41e2aa239d79600ca5a1",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "ea0cd2077a5945ca90963698e3d2d5b0",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating train split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "68295ca078e44ea798df62cbf4f646b0",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating validation split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "First image: ../data/a photo of standard bathroom/IMG-C5471456_22.jpg, caption: 'a photo of standard bathroom'\n",
+      "04/18/2024 07:17:25 - WARNING - __main__ - Process rank: 0, device: cuda:0, n_gpu: 3distributed training: True, 16-bits training: False\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/datasets/load.py:2072: FutureWarning: 'use_auth_token' was deprecated in favor of 'token' in version 2.14.0 and will be removed in 3.0.0.\n",
+      "You can remove this warning by passing 'token=None' instead.\n",
+      "  warnings.warn(\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/transformers/models/clip/feature_extraction_clip.py:28: FutureWarning: The class CLIPFeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please use CLIPImageProcessor instead.\n",
+      "  warnings.warn(\n",
+      "Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7fc7c1f04dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "04/18/2024 07:17:26 - WARNING - datasets.fingerprint - Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7fc7c1f04dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "Filter: 100%|█████████████████████| 1435/1435 [00:00<00:00, 25075.73 examples/s]\n",
+      "Running tokenizer on train dataset: 100%|█| 1422/1422 [00:00<00:00, 20741.93 exa\n",
+      "  0%|                                                    | 0/30 [00:00<?, ?it/s]/home/haojin/.local/lib/python3.10/site-packages/torch/nn/modules/module.py:1501: UserWarning: operator() profile_node %611 : int = prim::profile_ivalue(%dtype)\n",
+      " does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)\n",
+      "  return forward_call(*args, **kwargs)\n",
+      "/home/haojin/.local/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
+      "  warnings.warn('Was asked to gather along dimension 0, but all '\n",
+      "{'loss': 3.3239, 'learning_rate': 9e-07, 'epoch': 1.0}                          \n",
+      "{'loss': 3.1556, 'learning_rate': 8e-07, 'epoch': 2.0}                          \n",
+      "{'loss': 3.0773, 'learning_rate': 7e-07, 'epoch': 3.0}                          \n",
+      "{'loss': 3.0223, 'learning_rate': 6e-07, 'epoch': 4.0}                          \n",
+      "{'loss': 3.0051, 'learning_rate': 5e-07, 'epoch': 5.0}                          \n",
+      "{'loss': 2.9641, 'learning_rate': 4e-07, 'epoch': 6.0}                          \n",
+      "{'loss': 2.9518, 'learning_rate': 3e-07, 'epoch': 7.0}                          \n",
+      "{'loss': 2.9332, 'learning_rate': 2e-07, 'epoch': 8.0}                          \n",
+      "{'loss': 2.9557, 'learning_rate': 1e-07, 'epoch': 9.0}                          \n",
+      "{'loss': 2.9272, 'learning_rate': 0.0, 'epoch': 10.0}                           \n",
+      "{'train_runtime': 323.8126, 'train_samples_per_second': 43.914, 'train_steps_per_second': 0.093, 'train_loss': 3.0316165606180827, 'epoch': 10.0}\n",
+      "100%|███████████████████████████████████████████| 30/30 [05:23<00:00, 10.79s/it]\n",
+      "***** train metrics *****\n",
+      "  epoch                    =       10.0\n",
+      "  train_loss               =     3.0316\n",
+      "  train_runtime            = 0:05:23.81\n",
+      "  train_samples_per_second =     43.914\n",
+      "  train_steps_per_second   =      0.093\n",
+      "--\n",
+      "DONE. If it worked, trained data should be in ./workspace/output/laion-finetuned_v5e7_epoch10_fold4_threshold10\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "import numpy as np\n",
+    "import transformers\n",
+    "from datasets import load_dataset\n",
+    "import pathlib\n",
+    "from typing import Generator\n",
+    "from collections import defaultdict\n",
+    "import sys\n",
+    "import json\n",
+    "import os\n",
+    "\n",
+    "# Assuming prepare_data_from_dataframe is defined elsewhere in your project\n",
+    "\n",
+    "# Fine-tune base model setup\n",
+    "repo_id = \"laion/CLIP-ViT-B-32-laion2B-s34B-b79K\"\n",
+    "transformers.utils.logging.set_verbosity_error()\n",
+    "\n",
+    "# Main training loop\n",
+    "for fold in range(5):  # Five folds\n",
+    "    for i, threshold in enumerate(np.arange(0.90, 1.00, 0.01), start=1):\n",
+    "        # Filename setup for training data at current threshold and fold\n",
+    "        train_json = f'train_fold_{fold}_thr_{int(threshold*100)}.json'\n",
+    "        test_json = f'val_fold_{fold}.json'  # Validation data for current fold\n",
+    "\n",
+    "        # Output directory for the trained model\n",
+    "        output_folder = f\"./workspace/output/laion-finetuned_v5e7_epoch10_fold{fold}_threshold{i}\"\n",
+    "        print(f\"Finetuning {repo_id} for fold {fold}, threshold > {threshold:.2f}, saving output to {output_folder}.\")\n",
+    "\n",
+    "        # Load dataset\n",
+    "        data_files = {'train': train_json, 'validation': test_json}\n",
+    "        dataset = load_dataset(\"json\", data_files=data_files)\n",
+    "        print(f\"First image: {dataset['validation'][0]['image']}, caption: '{dataset['validation'][0]['caption']}'\")\n",
+    "\n",
+    "        !python huggingface_finetune_clip.py \\\n",
+    "            --output_dir {output_folder} --model_name_or_path {repo_id} \\\n",
+    "            --train_file {train_json} \\\n",
+    "            --validation_file {test_json} \\\n",
+    "            --image_column image \\\n",
+    "            --overwrite_output_dir=True \\\n",
+    "            --max_seq_length=77 \\\n",
+    "            --num_train_epochs=10 \\\n",
+    "            --save_total_limit=5 \\\n",
+    "            --caption_column caption \\\n",
+    "            --remove_unused_columns=False \\\n",
+    "            --do_train \\\n",
+    "            --logging_strategy=\"epoch\"\\\n",
+    "            --per_device_train_batch_size=128 \\\n",
+    "            --dataloader_drop_last=True\\\n",
+    "            --learning_rate=\"1e-6\" --warmup_steps=\"0\" --weight_decay 0.1 \n",
+    "        print(f\"--\\nDONE. If it worked, trained data should be in {output_folder}\\n\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "511f7306",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Evaluating with model from ./workspace/output/backup_ckpts/laion-finetuned_v5e7_epoch10_fold0_threshold1...\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/transformers/pipelines/base.py:1123: UserWarning: You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset\n",
+      "  warnings.warn(\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Accuracy for model on fold 0 with threshold 0.91: 96.96%\n",
+      "\n",
+      "--\n",
+      "DONE\n",
+      "\n",
+      "Evaluating with model from ./workspace/output/backup_ckpts/laion-finetuned_v5e7_epoch10_fold0_threshold2...\n",
+      "Accuracy for model on fold 0 with threshold 0.92: 97.10%\n",
+      "\n",
+      "--\n",
+      "DONE\n",
+      "\n",
+      "Evaluating with model from ./workspace/output/backup_ckpts/laion-finetuned_v5e7_epoch10_fold0_threshold3...\n",
+      "Accuracy for model on fold 0 with threshold 0.93: 96.72%\n",
+      "\n",
+      "--\n",
+      "DONE\n",
+      "\n",
+      "Evaluating with model from ./workspace/output/backup_ckpts/laion-finetuned_v5e7_epoch10_fold0_threshold4...\n",
+      "Accuracy for model on fold 0 with threshold 0.94: 96.40%\n",
+      "\n",
+      "--\n",
+      "DONE\n",
+      "\n",
+      "Evaluating with model from ./workspace/output/backup_ckpts/laion-finetuned_v5e7_epoch10_fold0_threshold5...\n",
+      "Accuracy for model on fold 0 with threshold 0.95: 96.40%\n",
+      "\n",
+      "--\n",
+      "DONE\n",
+      "\n",
+      "Evaluating with model from ./workspace/output/backup_ckpts/laion-finetuned_v5e7_epoch10_fold0_threshold6...\n",
+      "Accuracy for model on fold 0 with threshold 0.96: 95.84%\n",
+      "\n",
+      "--\n",
+      "DONE\n",
+      "\n",
+      "Evaluating with model from ./workspace/output/backup_ckpts/laion-finetuned_v5e7_epoch10_fold0_threshold7...\n",
+      "Accuracy for model on fold 0 with threshold 0.97: 95.79%\n",
+      "\n",
+      "--\n",
+      "DONE\n",
+      "\n",
+      "Evaluating with model from ./workspace/output/backup_ckpts/laion-finetuned_v5e7_epoch10_fold0_threshold8...\n",
+      "Accuracy for model on fold 0 with threshold 0.98: 94.62%\n",
+      "\n",
+      "--\n",
+      "DONE\n",
+      "\n",
+      "Evaluating with model from ./workspace/output/backup_ckpts/laion-finetuned_v5e7_epoch10_fold0_threshold9...\n",
+      "Accuracy for model on fold 0 with threshold 0.99: 93.12%\n",
+      "\n",
+      "--\n",
+      "DONE\n",
+      "\n",
+      "Evaluating with model from ./workspace/output/backup_ckpts/laion-finetuned_v5e7_epoch10_fold1_threshold1...\n",
+      "Accuracy for model on fold 1 with threshold 0.91: 97.08%\n",
+      "\n",
+      "--\n",
+      "DONE\n",
+      "\n",
+      "Evaluating with model from ./workspace/output/backup_ckpts/laion-finetuned_v5e7_epoch10_fold1_threshold2...\n",
+      "Accuracy for model on fold 1 with threshold 0.92: 97.17%\n",
+      "\n",
+      "--\n",
+      "DONE\n",
+      "\n",
+      "Evaluating with model from ./workspace/output/backup_ckpts/laion-finetuned_v5e7_epoch10_fold1_threshold3...\n",
+      "Accuracy for model on fold 1 with threshold 0.93: 98.20%\n",
+      "\n",
+      "--\n",
+      "DONE\n",
+      "\n",
+      "Evaluating with model from ./workspace/output/backup_ckpts/laion-finetuned_v5e7_epoch10_fold1_threshold4...\n",
+      "Accuracy for model on fold 1 with threshold 0.94: 96.79%\n",
+      "\n",
+      "--\n",
+      "DONE\n",
+      "\n",
+      "Evaluating with model from ./workspace/output/backup_ckpts/laion-finetuned_v5e7_epoch10_fold1_threshold5...\n",
+      "Accuracy for model on fold 1 with threshold 0.95: 96.75%\n",
+      "\n",
+      "--\n",
+      "DONE\n",
+      "\n",
+      "Evaluating with model from ./workspace/output/backup_ckpts/laion-finetuned_v5e7_epoch10_fold1_threshold6...\n",
+      "Accuracy for model on fold 1 with threshold 0.96: 96.33%\n",
+      "\n",
+      "--\n",
+      "DONE\n",
+      "\n",
+      "Evaluating with model from ./workspace/output/backup_ckpts/laion-finetuned_v5e7_epoch10_fold1_threshold7...\n",
+      "Accuracy for model on fold 1 with threshold 0.97: 95.81%\n",
+      "\n",
+      "--\n",
+      "DONE\n",
+      "\n",
+      "Evaluating with model from ./workspace/output/backup_ckpts/laion-finetuned_v5e7_epoch10_fold1_threshold8...\n",
+      "Accuracy for model on fold 1 with threshold 0.98: 94.56%\n",
+      "\n",
+      "--\n",
+      "DONE\n",
+      "\n",
+      "Evaluating with model from ./workspace/output/backup_ckpts/laion-finetuned_v5e7_epoch10_fold1_threshold9...\n",
+      "Accuracy for model on fold 1 with threshold 0.99: 92.93%\n",
+      "\n",
+      "--\n",
+      "DONE\n",
+      "\n",
+      "All results saved to model_evaluation_results.csv\n",
+      "All detailed prediction results saved to detailed_predictions.csv\n"
+     ]
+    }
+   ],
+   "source": [
+    "import os\n",
+    "import json\n",
+    "from transformers import pipeline, AutoTokenizer, AutoConfig, AutoModel, AutoImageProcessor\n",
+    "from PIL import Image, UnidentifiedImageError\n",
+    "import matplotlib.pyplot as plt\n",
+    "from collections import Counter\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "\n",
+    "def chunks(lst, n):\n",
+    "    \"\"\"Yield successive n-sized chunks from lst.\"\"\"\n",
+    "    for i in range(0, len(lst), n):\n",
+    "        yield lst[i:i + n]\n",
+    "\n",
+    "def calculate_category_accuracy(true_labels, predicted_labels):\n",
+    "    \"\"\"Calculate the accuracy for each category and return it as a dictionary.\"\"\"\n",
+    "    accuracies = {}\n",
+    "    true_labels_counter = Counter(true_labels)\n",
+    "    correct_predictions_counter = Counter([true for true, pred in zip(true_labels, predicted_labels) if true == pred])\n",
+    "    \n",
+    "    for label in true_labels_counter:\n",
+    "        accuracy = (correct_predictions_counter[label] / true_labels_counter[label]) if label in correct_predictions_counter else 0\n",
+    "        accuracies[label] = accuracy\n",
+    "    \n",
+    "    return accuracies\n",
+    "repo_id = \"laion/CLIP-ViT-B-32-laion2B-s34B-b79K\"\n",
+    "BATCH_SIZE = 128  # Adjust based on your available memory\n",
+    "results = []  # List to hold accuracy results\n",
+    "\n",
+    "# Loop for each fold and each threshold\n",
+    "for fold in range(2):\n",
+    "    for idx, threshold in enumerate(np.arange(0.91, 1.00, 0.01)):\n",
+    "        model_dir = f\"./workspace/output/backup_ckpts/laion-finetuned_v5e7_epoch10_fold{fold}_threshold{idx+1}\"\n",
+    "        test_json = f'val_fold_{fold}.json'  # Test JSON file for the current fold\n",
+    "        print(f\"Evaluating with model from {model_dir}...\")\n",
+    "\n",
+    "        # Load the JSON data for testing\n",
+    "        with open(test_json, 'r') as f:\n",
+    "            data = [json.loads(line) for line in f]\n",
+    "\n",
+    "        # Extract image paths and labels\n",
+    "        image_paths = [item['image'] for item in data]\n",
+    "        labels = [item['caption'] for item in data]\n",
+    "\n",
+    "        # Initialize model components\n",
+    "        image_processor = AutoImageProcessor.from_pretrained(repo_id)\n",
+    "        tokenizer = AutoTokenizer.from_pretrained(repo_id, config=AutoConfig.from_pretrained(repo_id))\n",
+    "        model = AutoModel.from_pretrained(model_dir)\n",
+    "        clip_pipeline = pipeline(model=model, task=\"zero-shot-image-classification\", tokenizer=tokenizer,\n",
+    "                                 device=1, image_processor=image_processor, config=AutoConfig.from_pretrained(model_dir))\n",
+    "\n",
+    "        all_predictions = []\n",
+    "        all_true_labels = []\n",
+    "\n",
+    "        # Process images in batches\n",
+    "        for batch_paths, batch_labels in zip(chunks(image_paths, BATCH_SIZE), chunks(labels, BATCH_SIZE)):\n",
+    "            batch_images = []\n",
+    "            valid_labels = []\n",
+    "            for path, label in zip(batch_paths, batch_labels):\n",
+    "                try:\n",
+    "                    batch_images.append(Image.open(path))\n",
+    "                    valid_labels.append(label)\n",
+    "                except (FileNotFoundError, UnidentifiedImageError):\n",
+    "                    continue  # Skip images that cannot be opened\n",
+    "\n",
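+    "            # Get predictions for the batch; NOTE(review): candidate_labels is this batch's own true labels (duplicates included), so the candidate set differs per batch - confirm this is intended.\n",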
+    "            predictions = clip_pipeline(images=batch_images, candidate_labels=valid_labels)\n",
+    "            predicted_labels = [pred[0]['label'] for pred in predictions]  # Top prediction\n",
+    "\n",
+    "            all_predictions.extend(predicted_labels)\n",
+    "            all_true_labels.extend(valid_labels)\n",
+    "\n",
+    "\n",
+    "        # Calculate accuracy\n",
+    "        correct_predictions = sum([true == pred for true, pred in zip(all_true_labels, all_predictions)])\n",
+    "        accuracy = correct_predictions / len(all_true_labels)\n",
+    "        print(f\"Accuracy for model on fold {fold} with threshold {threshold:.2f}: {accuracy * 100:.2f}%\\n\")\n",
+    "\n",
+    "        # Calculate the accuracy for each category\n",
+    "        category_accuracies = calculate_category_accuracy(all_true_labels, all_predictions)\n",
+    "        for category, acc in category_accuracies.items():\n",
+    "            results.append({\n",
+    "                'Fold': fold,\n",
+    "                'Threshold': f\">{threshold:.2f}\",\n",
+    "                'Category': category,\n",
+    "                'Accuracy': acc\n",
+    "            })\n",
+    "        print(\"--\\nDONE\\n\")\n",
+    "\n",
+    "# Create DataFrame from results and save to CSV\n",
+    "results_df = pd.DataFrame(results)\n",
+    "results_df.to_csv('./result/ours_results.csv', index=False)\n",
+    "print(\"All results saved to ./result/ours_results.csv\")\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "17b5b940",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_random80_vtrain1...\n",
+      "Accuracy for model 1: 97.08%\n",
+      "\n",
+      "Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_random80_vtrain2...\n",
+      "Accuracy for model 2: 97.17%\n",
+      "\n",
+      "Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_random80_vtrain3...\n",
+      "Accuracy for model 3: 98.20%\n",
+      "\n",
+      "Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_random80_vtrain4...\n",
+      "Accuracy for model 4: 96.79%\n",
+      "\n",
+      "Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_random80_vtrain5...\n",
+      "Accuracy for model 5: 96.75%\n",
+      "\n",
+      "Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_random80_vtrain6...\n",
+      "Accuracy for model 6: 96.33%\n",
+      "\n",
+      "Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_random80_vtrain7...\n",
+      "Accuracy for model 7: 95.81%\n",
+      "\n",
+      "Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_random80_vtrain8...\n",
+      "Accuracy for model 8: 94.56%\n",
+      "\n",
+      "Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_random80_vtrain9...\n",
+      "Accuracy for model 9: 92.93%\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "import os\n",
+    "import json\n",
+    "from transformers import pipeline, AutoTokenizer, AutoConfig, AutoModel, AutoImageProcessor\n",
+    "from PIL import Image, UnidentifiedImageError\n",
+    "import matplotlib.pyplot as plt\n",
+    "from collections import Counter\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "\n",
+    "def chunks(lst, n):\n",
+    "    \"\"\"Yield successive n-sized chunks from lst.\"\"\"\n",
+    "    for i in range(0, len(lst), n):\n",
+    "        yield lst[i:i + n]\n",
+    "\n",
+    "def calculate_category_accuracy(true_labels, predicted_labels):\n",
+    "    \"\"\"Calculate the accuracy for each category and return it as a dictionary.\"\"\"\n",
+    "    accuracies = {}\n",
+    "    true_labels_counter = Counter(true_labels)\n",
+    "    correct_predictions_counter = Counter([true for true, pred in zip(true_labels, predicted_labels) if true == pred])\n",
+    "    \n",
+    "    for label in true_labels_counter:\n",
+    "        accuracy = (correct_predictions_counter[label] / true_labels_counter[label]) if label in correct_predictions_counter else 0\n",
+    "        accuracies[label] = accuracy\n",
+    "    \n",
+    "    return accuracies\n",
+    "\n",
+    "BATCH_SIZE = 128  # Adjust based on your available memory\n",
+    "results = []\n",
+    "\n",
+    "# Loop for each fold and each threshold\n",
+    "for fold in range(5):\n",
+    "    for threshold in np.arange(0.90, 1.00, 0.01):\n",
+    "        model_dir = f\"./workspace/output/laion-finetuned_v5e7_epoch10_fold{fold}_threshold{int(threshold*100)}\"\n",
+    "        test_json = f'val_fold_{fold}.json'  # Test JSON file for the current fold\n",
+    "        print(f\"Evaluating with model from {model_dir}...\")\n",
+    "\n",
+    "        # Load the JSON data for testing\n",
+    "        with open(test_json, 'r') as f:\n",
+    "            data = [json.loads(line) for line in f]\n",
+    "\n",
+    "        # Extract image paths and labels\n",
+    "        image_paths = [item['image'] for item in data]\n",
+    "        labels = [item['caption'] for item in data]\n",
+    "\n",
+    "        # Initialize model components\n",
+    "        image_processor = AutoImageProcessor.from_pretrained(model_dir)\n",
+    "        tokenizer = AutoTokenizer.from_pretrained(model_dir, config=AutoConfig.from_pretrained(model_dir))\n",
+    "        model = AutoModel.from_pretrained(model_dir)\n",
+    "        clip_pipeline = pipeline(model=model, task=\"zero-shot-image-classification\", tokenizer=tokenizer,\n",
+    "                                 device=1, image_processor=image_processor, config=AutoConfig.from_pretrained(model_dir))\n",
+    "\n",
+    "        all_predictions = []\n",
+    "        all_true_labels = []\n",
+    "\n",
+    "        # Process images in batches\n",
+    "        for batch_paths, batch_labels in zip(chunks(image_paths, BATCH_SIZE), chunks(labels, BATCH_SIZE)):\n",
+    "            batch_images = []\n",
+    "            valid_labels = []\n",
+    "            for path, label in zip(batch_paths, batch_labels):\n",
+    "                try:\n",
+    "                    batch_images.append(Image.open(path))\n",
+    "                    valid_labels.append(label)\n",
+    "                except (FileNotFoundError, UnidentifiedImageError):\n",
+    "                    continue  # Skip images that cannot be opened\n",
+    "\n",
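+    "            # Get predictions for the batch of images. NOTE(review): candidate_labels is this batch's ground-truth label list (duplicates included), not a fixed class list - confirm this is intended\n",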
+    "            predictions = clip_pipeline(images=batch_images, candidate_labels=valid_labels)\n",
+    "            predicted_labels = [pred[0]['label'] for pred in predictions]  # Top prediction\n",
+    "\n",
+    "            all_predictions.extend(predicted_labels)\n",
+    "            all_true_labels.extend(valid_labels)\n",
+    "\n",
+    "        # Calculate accuracy\n",
+    "        correct_predictions = sum([true == pred for true, pred in zip(all_true_labels, all_predictions)])\n",
+    "        accuracy = correct_predictions / len(all_true_labels)\n",
+    "        print(f\"Accuracy for model on fold {fold} with threshold {threshold:.2f}: {accuracy * 100:.2f}%\\n\")\n",
+    "\n",
+    "        # Calculate the accuracy for each category\n",
+    "        category_accuracies = calculate_category_accuracy(all_true_labels, all_predictions)\n",
+    "        for category, acc in category_accuracies.items():\n",
+    "            results.append({\n",
+    "                'Fold': fold,\n",
+    "                'Threshold': f\">{threshold:.2f}\",\n",
+    "                'Category': category,\n",
+    "                'Accuracy': acc\n",
+    "            })\n",
+    "        print(\"--\\nDONE\\n\")\n",
+    "\n",
+    "# Create DataFrame from results and save to CSV\n",
+    "results_df = pd.DataFrame(results)\n",
+    "results_df.to_csv('./result/ours_results.csv', index=False)\n",
+    "print(\"All results saved to ./result/ours_results.csv\")\n",
+    "\n",
+    "# TODO: switch to the original test set and record every prediction's details in a DataFrame so a confusion matrix can be generated later"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ff6b001a",
+   "metadata": {},
+   "source": [
+    "# Read data directly from the CSV, using samples with max_value == 0.95 as the test set"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "id": "a72455d1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import json\n",
+    "\n",
+    "def prepare_data_from_dataframe(df: pd.DataFrame, output_file: str) -> str:\n",
+    "    \"\"\"Prepare data for finetuning by reading from a DataFrame.\"\"\"\n",
+    "    data = []\n",
+    "\n",
+    "    # Process data\n",
+    "    for _, row in df.iterrows():\n",
+    "        image_path = f\"../data/{row['max_key']}/{row['path'].split('/')[-1]}\"\n",
+    "        data.append({\"image\": image_path, \"caption\": row['max_key']})\n",
+    "\n",
+    "    # Save the data in JSON format\n",
+    "    with open(output_file, \"w\") as f:\n",
+    "        for item in data:\n",
+    "            json.dump(item, f)\n",
+    "            f.write(\"\\n\")\n",
+    "\n",
+    "    return output_file\n",
+    "\n",
+    "\n",
+    "# Load the CSV file\n",
+    "df = pd.read_csv('labels.csv')\n",
+    "df = df[df['max_key'] != 'error']\n",
+    "df = df[df['max_key'] != 'a photo of other indoor space: not kitchen, not bathroom, not living room, not dining room, not foyer']\n",
+    "df = df[df['max_key'] != 'it is a artificial photo']\n",
+    "df = df[df['max_key'] != 'a photo of outdoor space']\n",
+    "\n",
+    "# Hold out samples with max_value == 0.95 as the test set; keep the remaining samples with max_value > 0.9 in threshold_df\n",
+    "test_df = df[df['max_value'] == 0.95]\n",
+    "threshold_df = df[(df['max_value'] > 0.9) & (~df.index.isin(test_df.index))]\n",
+    "\n",
+    "\n",
+    "test_json = prepare_data_from_dataframe(test_df, 'val.json')\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "id": "5102624a",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "<>:11: SyntaxWarning: \"is\" with a literal. Did you mean \"==\"?\n",
+      "<>:11: SyntaxWarning: \"is\" with a literal. Did you mean \"==\"?\n",
+      "/tmp/ipykernel_3654472/3070016425.py:11: SyntaxWarning: \"is\" with a literal. Did you mean \"==\"?\n",
+      "  if threshold is 0.95:\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Finetuning laion/CLIP-ViT-B-32-laion2B-s34B-b79K for threshold > 0.91, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_vtrain1.\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "3e501fe53c074a57b1aa32c25bdaf992",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "f77ae45295d4440d9b7353258b54d737",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "d0acc4c7dcd94c429fe3790d4293d872",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating train split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "fb25372480d743dd8597ef9a26d49f64",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating validation split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "first image: ../data/a photo of contemporary foyer/IMG-C5471460_10.jpg, caption: 'a photo of contemporary foyer'\n",
+      "03/22/2024 11:54:48 - WARNING - __main__ - Process rank: 0, device: cuda:0, n_gpu: 3distributed training: True, 16-bits training: False\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/datasets/load.py:2072: FutureWarning: 'use_auth_token' was deprecated in favor of 'token' in version 2.14.0 and will be removed in 3.0.0.\n",
+      "You can remove this warning by passing 'token=None' instead.\n",
+      "  warnings.warn(\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/transformers/models/clip/feature_extraction_clip.py:28: FutureWarning: The class CLIPFeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please use CLIPImageProcessor instead.\n",
+      "  warnings.warn(\n",
+      "Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7feef8dc4d30> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "03/22/2024 11:54:49 - WARNING - datasets.fingerprint - Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7feef8dc4d30> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "Filter: 100%|███████████████████| 19011/19011 [00:00<00:00, 28447.70 examples/s]\n",
+      "Running tokenizer on train dataset: 100%|█| 18786/18786 [00:00<00:00, 20427.73 e\n",
+      "  0%|                                                   | 0/480 [00:00<?, ?it/s]/home/haojin/.local/lib/python3.10/site-packages/torch/nn/modules/module.py:1501: UserWarning: operator() profile_node %611 : int = prim::profile_ivalue(%dtype)\n",
+      " does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)\n",
+      "  return forward_call(*args, **kwargs)\n",
+      "/home/haojin/.local/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
+      "  warnings.warn('Was asked to gather along dimension 0, but all '\n",
+      "{'loss': 3.097, 'learning_rate': 9e-07, 'epoch': 1.0}                           \n",
+      "{'loss': 2.9239, 'learning_rate': 8e-07, 'epoch': 2.0}                          \n",
+      "{'loss': 2.8788, 'learning_rate': 7e-07, 'epoch': 3.0}                          \n",
+      "{'loss': 2.857, 'learning_rate': 6e-07, 'epoch': 4.0}                           \n",
+      "{'loss': 2.8415, 'learning_rate': 5e-07, 'epoch': 5.0}                          \n",
+      "{'loss': 2.8333, 'learning_rate': 4e-07, 'epoch': 6.0}                          \n",
+      "{'loss': 2.8244, 'learning_rate': 3e-07, 'epoch': 7.0}                          \n",
+      "{'loss': 2.8222, 'learning_rate': 2e-07, 'epoch': 8.0}                          \n",
+      "{'loss': 2.8151, 'learning_rate': 1e-07, 'epoch': 9.0}                          \n",
+      "{'loss': 2.8155, 'learning_rate': 0.0, 'epoch': 10.0}                           \n",
+      "{'train_runtime': 5072.1406, 'train_samples_per_second': 37.038, 'train_steps_per_second': 0.095, 'train_loss': 2.870869000752767, 'epoch': 10.0}\n",
+      "100%|███████████████████████████████████████| 480/480 [1:24:32<00:00, 10.57s/it]\n",
+      "***** train metrics *****\n",
+      "  epoch                    =       10.0\n",
+      "  train_loss               =     2.8709\n",
+      "  train_runtime            = 1:24:32.14\n",
+      "  train_samples_per_second =     37.038\n",
+      "  train_steps_per_second   =      0.095\n",
+      "--\n",
+      "DONE. If it worked, trained data should be in ./workspace/output/laion-finetuned_v5e7_epoch10_vtrain1\n",
+      "\n",
+      "Finetuning laion/CLIP-ViT-B-32-laion2B-s34B-b79K for threshold > 0.92, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_vtrain2.\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "79dc32649f2546aabe5e2aff055467fa",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "3fa3be41109740b89a1ad0702ee9f145",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "5126698c0fa742b98e04d7f99150eac2",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating train split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "4f6ee6c738864243a0ee6e6861bd9dbc",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating validation split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "first image: ../data/a photo of contemporary foyer/IMG-C5471460_10.jpg, caption: 'a photo of contemporary foyer'\n",
+      "03/22/2024 13:19:29 - WARNING - __main__ - Process rank: 0, device: cuda:0, n_gpu: 3distributed training: True, 16-bits training: False\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/datasets/load.py:2072: FutureWarning: 'use_auth_token' was deprecated in favor of 'token' in version 2.14.0 and will be removed in 3.0.0.\n",
+      "You can remove this warning by passing 'token=None' instead.\n",
+      "  warnings.warn(\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/transformers/models/clip/feature_extraction_clip.py:28: FutureWarning: The class CLIPFeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please use CLIPImageProcessor instead.\n",
+      "  warnings.warn(\n",
+      "Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7f6f17050d30> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "03/22/2024 13:19:31 - WARNING - datasets.fingerprint - Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7f6f17050d30> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "Filter: 100%|███████████████████| 17132/17132 [00:00<00:00, 28605.44 examples/s]\n",
+      "Running tokenizer on train dataset: 100%|█| 16929/16929 [00:00<00:00, 21019.66 e\n",
+      "  0%|                                                   | 0/440 [00:00<?, ?it/s]/home/haojin/.local/lib/python3.10/site-packages/torch/nn/modules/module.py:1501: UserWarning: operator() profile_node %611 : int = prim::profile_ivalue(%dtype)\n",
+      " does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)\n",
+      "  return forward_call(*args, **kwargs)\n",
+      "/home/haojin/.local/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
+      "  warnings.warn('Was asked to gather along dimension 0, but all '\n",
+      "{'loss': 3.1073, 'learning_rate': 9e-07, 'epoch': 1.0}                          \n",
+      "{'loss': 2.9212, 'learning_rate': 8e-07, 'epoch': 2.0}                          \n",
+      "{'loss': 2.8777, 'learning_rate': 7e-07, 'epoch': 3.0}                          \n",
+      "{'loss': 2.8543, 'learning_rate': 6e-07, 'epoch': 4.0}                          \n",
+      "{'loss': 2.8421, 'learning_rate': 5e-07, 'epoch': 5.0}                          \n",
+      "{'loss': 2.8321, 'learning_rate': 4e-07, 'epoch': 6.0}                          \n",
+      "{'loss': 2.8247, 'learning_rate': 3e-07, 'epoch': 7.0}                          \n",
+      "{'loss': 2.8185, 'learning_rate': 2e-07, 'epoch': 8.0}                          \n",
+      "{'loss': 2.8175, 'learning_rate': 1e-07, 'epoch': 9.0}                          \n",
+      "{'loss': 2.8149, 'learning_rate': 0.0, 'epoch': 10.0}                           \n",
+      "{'train_runtime': 4634.8222, 'train_samples_per_second': 36.526, 'train_steps_per_second': 0.095, 'train_loss': 2.87104417627508, 'epoch': 10.0}\n",
+      "100%|███████████████████████████████████████| 440/440 [1:17:14<00:00, 10.53s/it]\n",
+      "***** train metrics *****\n",
+      "  epoch                    =       10.0\n",
+      "  train_loss               =      2.871\n",
+      "  train_runtime            = 1:17:14.82\n",
+      "  train_samples_per_second =     36.526\n",
+      "  train_steps_per_second   =      0.095\n",
+      "--\n",
+      "DONE. If it worked, trained data should be in ./workspace/output/laion-finetuned_v5e7_epoch10_vtrain2\n",
+      "\n",
+      "Finetuning laion/CLIP-ViT-B-32-laion2B-s34B-b79K for threshold > 0.93, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_vtrain3.\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "a644eaf4ab2d404d9868baa49668d2d6",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "9e0d64c2a65a491b8d3d24de4bedba6e",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "c64f0a3e81114187be53c7e1719a5484",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating train split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "b5a51d7d46d742b3b4ba04690a6e5a07",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating validation split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "first image: ../data/a photo of contemporary foyer/IMG-C5471460_10.jpg, caption: 'a photo of contemporary foyer'\n",
+      "03/22/2024 14:36:53 - WARNING - __main__ - Process rank: 0, device: cuda:0, n_gpu: 3distributed training: True, 16-bits training: False\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/datasets/load.py:2072: FutureWarning: 'use_auth_token' was deprecated in favor of 'token' in version 2.14.0 and will be removed in 3.0.0.\n",
+      "You can remove this warning by passing 'token=None' instead.\n",
+      "  warnings.warn(\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/transformers/models/clip/feature_extraction_clip.py:28: FutureWarning: The class CLIPFeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please use CLIPImageProcessor instead.\n",
+      "  warnings.warn(\n",
+      "Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7f245c0a4d30> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "03/22/2024 14:36:54 - WARNING - datasets.fingerprint - Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7f245c0a4d30> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "Filter: 100%|███████████████████| 15134/15134 [00:00<00:00, 28682.00 examples/s]\n",
+      "Running tokenizer on train dataset: 100%|█| 14956/14956 [00:00<00:00, 20926.65 e\n",
+      "  0%|                                                   | 0/380 [00:00<?, ?it/s]/home/haojin/.local/lib/python3.10/site-packages/torch/nn/modules/module.py:1501: UserWarning: operator() profile_node %611 : int = prim::profile_ivalue(%dtype)\n",
+      " does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)\n",
+      "  return forward_call(*args, **kwargs)\n",
+      "/home/haojin/.local/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
+      "  warnings.warn('Was asked to gather along dimension 0, but all '\n",
+      "{'loss': 3.104, 'learning_rate': 9e-07, 'epoch': 1.0}                           \n",
+      "{'loss': 2.9244, 'learning_rate': 8e-07, 'epoch': 2.0}                          \n",
+      "{'loss': 2.8833, 'learning_rate': 7e-07, 'epoch': 3.0}                          \n",
+      "{'loss': 2.8568, 'learning_rate': 6e-07, 'epoch': 4.0}                          \n",
+      "{'loss': 2.8425, 'learning_rate': 5e-07, 'epoch': 5.0}                          \n",
+      "{'loss': 2.8339, 'learning_rate': 4e-07, 'epoch': 6.0}                          \n",
+      "{'loss': 2.828, 'learning_rate': 3e-07, 'epoch': 7.0}                           \n",
+      "{'loss': 2.8239, 'learning_rate': 2e-07, 'epoch': 8.0}                          \n",
+      "{'loss': 2.8201, 'learning_rate': 1e-07, 'epoch': 9.0}                          \n",
+      "{'loss': 2.8187, 'learning_rate': 0.0, 'epoch': 10.0}                           \n",
+      "{'train_runtime': 4008.0625, 'train_samples_per_second': 37.315, 'train_steps_per_second': 0.095, 'train_loss': 2.873565091584858, 'epoch': 10.0}\n",
+      "100%|███████████████████████████████████████| 380/380 [1:06:48<00:00, 10.55s/it]\n",
+      "***** train metrics *****\n",
+      "  epoch                    =       10.0\n",
+      "  train_loss               =     2.8736\n",
+      "  train_runtime            = 1:06:48.06\n",
+      "  train_samples_per_second =     37.315\n",
+      "  train_steps_per_second   =      0.095\n",
+      "--\n",
+      "DONE. If it worked, trained data should be in ./workspace/output/laion-finetuned_v5e7_epoch10_vtrain3\n",
+      "\n",
+      "Finetuning laion/CLIP-ViT-B-32-laion2B-s34B-b79K for threshold > 0.94, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_vtrain4.\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "a4d09a8ec1c04a0fb7c96183298b3033",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "d501f7b184e5495f8fbd21eff2a5b1ec",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "c416ef9f9fe04b338e20c918880f07c0",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating train split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "973e26d607da405999f8e08dbcc37ad4",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating validation split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "first image: ../data/a photo of contemporary foyer/IMG-C5471460_10.jpg, caption: 'a photo of contemporary foyer'\n",
+      "03/22/2024 15:43:49 - WARNING - __main__ - Process rank: 0, device: cuda:0, n_gpu: 3distributed training: True, 16-bits training: False\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/datasets/load.py:2072: FutureWarning: 'use_auth_token' was deprecated in favor of 'token' in version 2.14.0 and will be removed in 3.0.0.\n",
+      "You can remove this warning by passing 'token=None' instead.\n",
+      "  warnings.warn(\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/transformers/models/clip/feature_extraction_clip.py:28: FutureWarning: The class CLIPFeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please use CLIPImageProcessor instead.\n",
+      "  warnings.warn(\n",
+      "Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7fe886510d30> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "03/22/2024 15:43:50 - WARNING - datasets.fingerprint - Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7fe886510d30> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "Filter: 100%|███████████████████| 13150/13150 [00:00<00:00, 28607.09 examples/s]\n",
+      "Running tokenizer on train dataset: 100%|█| 12995/12995 [00:00<00:00, 20773.55 e\n",
+      "  0%|                                                   | 0/330 [00:00<?, ?it/s]/home/haojin/.local/lib/python3.10/site-packages/torch/nn/modules/module.py:1501: UserWarning: operator() profile_node %611 : int = prim::profile_ivalue(%dtype)\n",
+      " does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)\n",
+      "  return forward_call(*args, **kwargs)\n",
+      "/home/haojin/.local/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
+      "  warnings.warn('Was asked to gather along dimension 0, but all '\n",
+      "{'loss': 3.1277, 'learning_rate': 9e-07, 'epoch': 1.0}                          \n",
+      "{'loss': 2.9397, 'learning_rate': 8e-07, 'epoch': 2.0}                          \n",
+      "{'loss': 2.8939, 'learning_rate': 7e-07, 'epoch': 3.0}                          \n",
+      "{'loss': 2.8678, 'learning_rate': 6e-07, 'epoch': 4.0}                          \n",
+      "{'loss': 2.8549, 'learning_rate': 5e-07, 'epoch': 5.0}                          \n",
+      "{'loss': 2.8444, 'learning_rate': 4e-07, 'epoch': 6.0}                          \n",
+      "{'loss': 2.8377, 'learning_rate': 3e-07, 'epoch': 7.0}                          \n",
+      "{'loss': 2.8329, 'learning_rate': 2e-07, 'epoch': 8.0}                          \n",
+      "{'loss': 2.8337, 'learning_rate': 1e-07, 'epoch': 9.0}                          \n",
+      "{'loss': 2.8273, 'learning_rate': 0.0, 'epoch': 10.0}                           \n",
+      "{'train_runtime': 3506.2512, 'train_samples_per_second': 37.062, 'train_steps_per_second': 0.094, 'train_loss': 2.88599282467004, 'epoch': 10.0}\n",
+      "100%|█████████████████████████████████████████| 330/330 [58:26<00:00, 10.62s/it]\n",
+      "***** train metrics *****\n",
+      "  epoch                    =       10.0\n",
+      "  train_loss               =      2.886\n",
+      "  train_runtime            = 0:58:26.25\n",
+      "  train_samples_per_second =     37.062\n",
+      "  train_steps_per_second   =      0.094\n",
+      "--\n",
+      "DONE. If it worked, trained data should be in ./workspace/output/laion-finetuned_v5e7_epoch10_vtrain4\n",
+      "\n",
+      "Finetuning laion/CLIP-ViT-B-32-laion2B-s34B-b79K for threshold > 0.95, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_vtrain5.\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "2681834cdcfc415583a4608cc9137d36",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "1ff027a0c6f54512a6d63504d5e0b86f",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "a59cedde86384ae0870ff22269db527a",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating train split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "a20f917ab68a4a6facfdeefe01f2bf37",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating validation split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "first image: ../data/a photo of contemporary foyer/IMG-C5471460_10.jpg, caption: 'a photo of contemporary foyer'\n",
+      "03/22/2024 16:42:24 - WARNING - __main__ - Process rank: 0, device: cuda:0, n_gpu: 3distributed training: True, 16-bits training: False\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/datasets/load.py:2072: FutureWarning: 'use_auth_token' was deprecated in favor of 'token' in version 2.14.0 and will be removed in 3.0.0.\n",
+      "You can remove this warning by passing 'token=None' instead.\n",
+      "  warnings.warn(\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/transformers/models/clip/feature_extraction_clip.py:28: FutureWarning: The class CLIPFeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please use CLIPImageProcessor instead.\n",
+      "  warnings.warn(\n",
+      "Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7f11b826cd30> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "03/22/2024 16:42:25 - WARNING - datasets.fingerprint - Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7f11b826cd30> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "Filter: 100%|███████████████████| 13150/13150 [00:00<00:00, 28708.52 examples/s]\n",
+      "Running tokenizer on train dataset: 100%|█| 12995/12995 [00:00<00:00, 20854.43 e\n",
+      "  0%|                                                   | 0/330 [00:00<?, ?it/s]/home/haojin/.local/lib/python3.10/site-packages/torch/nn/modules/module.py:1501: UserWarning: operator() profile_node %611 : int = prim::profile_ivalue(%dtype)\n",
+      " does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)\n",
+      "  return forward_call(*args, **kwargs)\n",
+      "/home/haojin/.local/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
+      "  warnings.warn('Was asked to gather along dimension 0, but all '\n",
+      "{'loss': 3.1277, 'learning_rate': 9e-07, 'epoch': 1.0}                          \n",
+      "{'loss': 2.9397, 'learning_rate': 8e-07, 'epoch': 2.0}                          \n",
+      "{'loss': 2.8939, 'learning_rate': 7e-07, 'epoch': 3.0}                          \n",
+      "{'loss': 2.8678, 'learning_rate': 6e-07, 'epoch': 4.0}                          \n",
+      "{'loss': 2.8549, 'learning_rate': 5e-07, 'epoch': 5.0}                          \n",
+      "{'loss': 2.8444, 'learning_rate': 4e-07, 'epoch': 6.0}                          \n",
+      "{'loss': 2.8377, 'learning_rate': 3e-07, 'epoch': 7.0}                          \n",
+      "{'loss': 2.8329, 'learning_rate': 2e-07, 'epoch': 8.0}                          \n",
+      "{'loss': 2.8337, 'learning_rate': 1e-07, 'epoch': 9.0}                          \n",
+      "{'loss': 2.8273, 'learning_rate': 0.0, 'epoch': 10.0}                           \n",
+      "{'train_runtime': 3564.284, 'train_samples_per_second': 36.459, 'train_steps_per_second': 0.093, 'train_loss': 2.88599282467004, 'epoch': 10.0}\n",
+      "100%|█████████████████████████████████████████| 330/330 [59:24<00:00, 10.80s/it]\n",
+      "***** train metrics *****\n",
+      "  epoch                    =       10.0\n",
+      "  train_loss               =      2.886\n",
+      "  train_runtime            = 0:59:24.28\n",
+      "  train_samples_per_second =     36.459\n",
+      "  train_steps_per_second   =      0.093\n",
+      "--\n",
+      "DONE. If it worked, trained data should be in ./workspace/output/laion-finetuned_v5e7_epoch10_vtrain5\n",
+      "\n",
+      "Finetuning laion/CLIP-ViT-B-32-laion2B-s34B-b79K for threshold > 0.96, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_vtrain6.\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "34e2973f4e48490da57038a3fa3fe5eb",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "6b05a162fe9a44d4bf57aaa14fbba570",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "ae77a6044ab94e7c9d8b6d722094ab8f",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating train split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "1ca43a656da44c7daee39d4bbdd23a9b",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating validation split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "first image: ../data/a photo of contemporary foyer/IMG-C5471460_10.jpg, caption: 'a photo of contemporary foyer'\n",
+      "03/22/2024 17:41:57 - WARNING - __main__ - Process rank: 0, device: cuda:0, n_gpu: 3distributed training: True, 16-bits training: False\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/datasets/load.py:2072: FutureWarning: 'use_auth_token' was deprecated in favor of 'token' in version 2.14.0 and will be removed in 3.0.0.\n",
+      "You can remove this warning by passing 'token=None' instead.\n",
+      "  warnings.warn(\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/transformers/models/clip/feature_extraction_clip.py:28: FutureWarning: The class CLIPFeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please use CLIPImageProcessor instead.\n",
+      "  warnings.warn(\n",
+      "Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7fb0cc32cd30> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "03/22/2024 17:41:58 - WARNING - datasets.fingerprint - Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7fb0cc32cd30> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "Filter: 100%|███████████████████| 10782/10782 [00:00<00:00, 28008.46 examples/s]\n",
+      "Running tokenizer on train dataset: 100%|█| 10658/10658 [00:00<00:00, 20642.58 e\n",
+      "  0%|                                                   | 0/270 [00:00<?, ?it/s]/home/haojin/.local/lib/python3.10/site-packages/torch/nn/modules/module.py:1501: UserWarning: operator() profile_node %611 : int = prim::profile_ivalue(%dtype)\n",
+      " does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)\n",
+      "  return forward_call(*args, **kwargs)\n",
+      "/home/haojin/.local/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
+      "  warnings.warn('Was asked to gather along dimension 0, but all '\n",
+      "{'loss': 3.1348, 'learning_rate': 9e-07, 'epoch': 1.0}                          \n",
+      "{'loss': 2.9474, 'learning_rate': 8e-07, 'epoch': 2.0}                          \n",
+      "{'loss': 2.896, 'learning_rate': 7e-07, 'epoch': 3.0}                           \n",
+      "{'loss': 2.8752, 'learning_rate': 6e-07, 'epoch': 4.0}                          \n",
+      "{'loss': 2.86, 'learning_rate': 5e-07, 'epoch': 5.0}                            \n",
+      "{'loss': 2.8464, 'learning_rate': 4e-07, 'epoch': 6.0}                          \n",
+      "{'loss': 2.84, 'learning_rate': 3e-07, 'epoch': 7.0}                            \n",
+      "{'loss': 2.8364, 'learning_rate': 2e-07, 'epoch': 8.0}                          \n",
+      "{'loss': 2.8359, 'learning_rate': 1e-07, 'epoch': 9.0}                          \n",
+      "{'loss': 2.8342, 'learning_rate': 0.0, 'epoch': 10.0}                           \n",
+      "{'train_runtime': 2901.941, 'train_samples_per_second': 36.727, 'train_steps_per_second': 0.093, 'train_loss': 2.890633703161169, 'epoch': 10.0}\n",
+      "100%|█████████████████████████████████████████| 270/270 [48:21<00:00, 10.75s/it]\n",
+      "***** train metrics *****\n",
+      "  epoch                    =       10.0\n",
+      "  train_loss               =     2.8906\n",
+      "  train_runtime            = 0:48:21.94\n",
+      "  train_samples_per_second =     36.727\n",
+      "  train_steps_per_second   =      0.093\n",
+      "--\n",
+      "DONE. If it worked, trained data should be in ./workspace/output/laion-finetuned_v5e7_epoch10_vtrain6\n",
+      "\n",
+      "Finetuning laion/CLIP-ViT-B-32-laion2B-s34B-b79K for threshold > 0.97, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_vtrain7.\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "0fa31178a7ea4a37a0a2ac2b95259224",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "519f38e7d8474fe7b9d3eb519cda1136",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "a4a5ceb7e5274584a32cd3e8c121fd4c",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating train split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "3dc369b537824b878135ea0d1440be4a",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating validation split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "first image: ../data/a photo of contemporary foyer/IMG-C5471460_10.jpg, caption: 'a photo of contemporary foyer'\n",
+      "03/22/2024 18:30:27 - WARNING - __main__ - Process rank: 0, device: cuda:0, n_gpu: 3distributed training: True, 16-bits training: False\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/datasets/load.py:2072: FutureWarning: 'use_auth_token' was deprecated in favor of 'token' in version 2.14.0 and will be removed in 3.0.0.\n",
+      "You can remove this warning by passing 'token=None' instead.\n",
+      "  warnings.warn(\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/transformers/models/clip/feature_extraction_clip.py:28: FutureWarning: The class CLIPFeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please use CLIPImageProcessor instead.\n",
+      "  warnings.warn(\n",
+      "Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7fb3af730d30> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "03/22/2024 18:30:28 - WARNING - datasets.fingerprint - Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7fb3af730d30> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "Filter: 100%|█████████████████████| 8144/8144 [00:00<00:00, 27990.81 examples/s]\n",
+      "Running tokenizer on train dataset: 100%|█| 8047/8047 [00:00<00:00, 20525.77 exa\n",
+      "  0%|                                                   | 0/200 [00:00<?, ?it/s]/home/haojin/.local/lib/python3.10/site-packages/torch/nn/modules/module.py:1501: UserWarning: operator() profile_node %611 : int = prim::profile_ivalue(%dtype)\n",
+      " does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)\n",
+      "  return forward_call(*args, **kwargs)\n",
+      "/home/haojin/.local/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
+      "  warnings.warn('Was asked to gather along dimension 0, but all '\n",
+      "{'loss': 3.1559, 'learning_rate': 9e-07, 'epoch': 1.0}                          \n",
+      "{'loss': 2.9624, 'learning_rate': 8e-07, 'epoch': 2.0}                          \n",
+      "{'loss': 2.9137, 'learning_rate': 7e-07, 'epoch': 3.0}                          \n",
+      "{'loss': 2.887, 'learning_rate': 6e-07, 'epoch': 4.0}                           \n",
+      "{'loss': 2.8703, 'learning_rate': 5e-07, 'epoch': 5.0}                          \n",
+      "{'loss': 2.8645, 'learning_rate': 4e-07, 'epoch': 6.0}                          \n",
+      "{'loss': 2.8537, 'learning_rate': 3e-07, 'epoch': 7.0}                          \n",
+      "{'loss': 2.8523, 'learning_rate': 2e-07, 'epoch': 8.0}                          \n",
+      "{'loss': 2.8464, 'learning_rate': 1e-07, 'epoch': 9.0}                          \n",
+      "{'loss': 2.8405, 'learning_rate': 0.0, 'epoch': 10.0}                           \n",
+      "{'train_runtime': 2165.6221, 'train_samples_per_second': 37.158, 'train_steps_per_second': 0.092, 'train_loss': 2.9046568870544434, 'epoch': 10.0}\n",
+      "100%|█████████████████████████████████████████| 200/200 [36:05<00:00, 10.83s/it]\n",
+      "***** train metrics *****\n",
+      "  epoch                    =       10.0\n",
+      "  train_loss               =     2.9047\n",
+      "  train_runtime            = 0:36:05.62\n",
+      "  train_samples_per_second =     37.158\n",
+      "  train_steps_per_second   =      0.092\n",
+      "--\n",
+      "DONE. If it worked, trained data should be in ./workspace/output/laion-finetuned_v5e7_epoch10_vtrain7\n",
+      "\n",
+      "Finetuning laion/CLIP-ViT-B-32-laion2B-s34B-b79K for threshold > 0.98, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_vtrain8.\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "ad633b6ee9284399a1cf4e0388677a51",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "50c2eb2b33ed4f968e5ced363def3341",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "85800c6248b549e9b5d0f3d36a9668a0",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating train split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "65b9e9544de64fae9b1a4d301b518be4",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating validation split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "first image: ../data/a photo of contemporary foyer/IMG-C5471460_10.jpg, caption: 'a photo of contemporary foyer'\n",
+      "03/22/2024 19:06:42 - WARNING - __main__ - Process rank: 0, device: cuda:0, n_gpu: 3distributed training: True, 16-bits training: False\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/datasets/load.py:2072: FutureWarning: 'use_auth_token' was deprecated in favor of 'token' in version 2.14.0 and will be removed in 3.0.0.\n",
+      "You can remove this warning by passing 'token=None' instead.\n",
+      "  warnings.warn(\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/transformers/models/clip/feature_extraction_clip.py:28: FutureWarning: The class CLIPFeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please use CLIPImageProcessor instead.\n",
+      "  warnings.warn(\n",
+      "Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7f3a009b0d30> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "03/22/2024 19:06:43 - WARNING - datasets.fingerprint - Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7f3a009b0d30> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "Filter: 100%|█████████████████████| 5197/5197 [00:00<00:00, 27343.36 examples/s]\n",
+      "Running tokenizer on train dataset: 100%|█| 5150/5150 [00:00<00:00, 20052.73 exa\n",
+      "  0%|                                                   | 0/130 [00:00<?, ?it/s]/home/haojin/.local/lib/python3.10/site-packages/torch/nn/modules/module.py:1501: UserWarning: operator() profile_node %611 : int = prim::profile_ivalue(%dtype)\n",
+      " does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)\n",
+      "  return forward_call(*args, **kwargs)\n",
+      "/home/haojin/.local/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
+      "  warnings.warn('Was asked to gather along dimension 0, but all '\n",
+      "{'loss': 3.1931, 'learning_rate': 9e-07, 'epoch': 1.0}                          \n",
+      "{'loss': 2.9951, 'learning_rate': 8e-07, 'epoch': 2.0}                          \n",
+      "{'loss': 2.9409, 'learning_rate': 7e-07, 'epoch': 3.0}                          \n",
+      "{'loss': 2.9079, 'learning_rate': 6e-07, 'epoch': 4.0}                          \n",
+      "{'loss': 2.8858, 'learning_rate': 5e-07, 'epoch': 5.0}                          \n",
+      "{'loss': 2.8681, 'learning_rate': 4e-07, 'epoch': 6.0}                          \n",
+      "{'loss': 2.8644, 'learning_rate': 3e-07, 'epoch': 7.0}                          \n",
+      "{'loss': 2.8607, 'learning_rate': 2e-07, 'epoch': 8.0}                          \n",
+      "{'loss': 2.8568, 'learning_rate': 1e-07, 'epoch': 9.0}                          \n",
+      "{'loss': 2.8564, 'learning_rate': 0.0, 'epoch': 10.0}                           \n",
+      "{'train_runtime': 1414.5591, 'train_samples_per_second': 36.407, 'train_steps_per_second': 0.092, 'train_loss': 2.9229195814866284, 'epoch': 10.0}\n",
+      "100%|█████████████████████████████████████████| 130/130 [23:34<00:00, 10.88s/it]\n",
+      "***** train metrics *****\n",
+      "  epoch                    =       10.0\n",
+      "  train_loss               =     2.9229\n",
+      "  train_runtime            = 0:23:34.55\n",
+      "  train_samples_per_second =     36.407\n",
+      "  train_steps_per_second   =      0.092\n",
+      "--\n",
+      "DONE. If it worked, trained data should be in ./workspace/output/laion-finetuned_v5e7_epoch10_vtrain8\n",
+      "\n",
+      "Finetuning laion/CLIP-ViT-B-32-laion2B-s34B-b79K for threshold > 0.99, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_vtrain9.\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "1227eb23eba3473daa795fad84ad48d5",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "ba65a1e1290c4bfda083dce1adc71092",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "a2a1f4f26f1e4603b7b2218bc80a8b32",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating train split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "f17541babda240988eb36f0b7a48abc7",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating validation split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "first image: ../data/a photo of contemporary foyer/IMG-C5471460_10.jpg, caption: 'a photo of contemporary foyer'\n",
+      "03/22/2024 19:30:25 - WARNING - __main__ - Process rank: 0, device: cuda:0, n_gpu: 3distributed training: True, 16-bits training: False\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/datasets/load.py:2072: FutureWarning: 'use_auth_token' was deprecated in favor of 'token' in version 2.14.0 and will be removed in 3.0.0.\n",
+      "You can remove this warning by passing 'token=None' instead.\n",
+      "  warnings.warn(\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/transformers/models/clip/feature_extraction_clip.py:28: FutureWarning: The class CLIPFeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please use CLIPImageProcessor instead.\n",
+      "  warnings.warn(\n",
+      "Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7f0abc714d30> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "03/22/2024 19:30:26 - WARNING - datasets.fingerprint - Parameter 'function'=<function main.<locals>.filter_corrupt_images at 0x7f0abc714d30> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n",
+      "Filter: 100%|█████████████████████| 1731/1731 [00:00<00:00, 25087.04 examples/s]\n",
+      "Running tokenizer on train dataset: 100%|█| 1715/1715 [00:00<00:00, 20083.96 exa\n",
+      "  0%|                                                    | 0/40 [00:00<?, ?it/s]/home/haojin/.local/lib/python3.10/site-packages/torch/nn/modules/module.py:1501: UserWarning: operator() profile_node %611 : int = prim::profile_ivalue(%dtype)\n",
+      " does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)\n",
+      "  return forward_call(*args, **kwargs)\n",
+      "/home/haojin/.local/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
+      "  warnings.warn('Was asked to gather along dimension 0, but all '\n",
+      "{'loss': 3.2934, 'learning_rate': 9e-07, 'epoch': 1.0}                          \n",
+      "{'loss': 3.093, 'learning_rate': 8e-07, 'epoch': 2.0}                           \n",
+      "{'loss': 3.0367, 'learning_rate': 7e-07, 'epoch': 3.0}                          \n",
+      "{'loss': 2.9903, 'learning_rate': 6e-07, 'epoch': 4.0}                          \n",
+      "{'loss': 2.9514, 'learning_rate': 5e-07, 'epoch': 5.0}                          \n",
+      "{'loss': 2.9515, 'learning_rate': 4e-07, 'epoch': 6.0}                          \n",
+      "{'loss': 2.9336, 'learning_rate': 3e-07, 'epoch': 7.0}                          \n",
+      "{'loss': 2.9374, 'learning_rate': 2e-07, 'epoch': 8.0}                          \n",
+      "{'loss': 2.9119, 'learning_rate': 1e-07, 'epoch': 9.0}                          \n",
+      "{'loss': 2.9121, 'learning_rate': 0.0, 'epoch': 10.0}                           \n",
+      "{'train_runtime': 438.4849, 'train_samples_per_second': 39.112, 'train_steps_per_second': 0.091, 'train_loss': 3.0011263370513914, 'epoch': 10.0}\n",
+      "100%|███████████████████████████████████████████| 40/40 [07:18<00:00, 10.96s/it]\n",
+      "***** train metrics *****\n",
+      "  epoch                    =       10.0\n",
+      "  train_loss               =     3.0011\n",
+      "  train_runtime            = 0:07:18.48\n",
+      "  train_samples_per_second =     39.112\n",
+      "  train_steps_per_second   =      0.091\n",
+      "--\n",
+      "DONE. If it worked, trained data should be in ./workspace/output/laion-finetuned_v5e7_epoch10_vtrain9\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "import numpy as np\n",
+    "# Finetune the base model once per confidence threshold (0.91 .. 0.99).\n",
+    "repo_id = \"laion/CLIP-ViT-B-32-laion2B-s34B-b79K\"\n",
+    "# NOTE: these two values are not passed to the training command below, which\n",
+    "# hard-codes --per_device_train_batch_size=128 and --num_train_epochs=10.\n",
+    "batch_size = 64\n",
+    "num_train_epochs = 100\n",
+    "\n",
+    "transformers.utils.logging.set_verbosity_error()\n",
+    "\n",
+    "# Loop for different thresholds\n",
+    "for i, threshold in enumerate(np.arange(0.91, 1.0, 0.01), start=1):\n",
+    "    # A former `if threshold is 0.95: continue` guard was dropped: `is` compares\n",
+    "    # object identity, so it was never true for a numpy float and all nine\n",
+    "    # thresholds were always trained (the evaluation cells load models 1-9).\n",
+    "    # TODO: if 0.95 really should be skipped, use `np.isclose(threshold, 0.95)`.\n",
+    "    threshold_df = df[df['max_value'] > threshold]\n",
+    "    threshold_df = threshold_df[threshold_df['max_value'] != 0.95]\n",
+    "    train_json = f'train{i}.json'\n",
+    "    prepare_data_from_dataframe(threshold_df, train_json)\n",
+    "\n",
+    "    output_folder = f\"./workspace/output/laion-finetuned_v5e7_epoch10_vtrain{i}\"\n",
+    "    print(f\"Finetuning {repo_id} for threshold > {threshold:.2f}, saving output to {output_folder}.\")\n",
+    "    # test_json is not defined in this cell; it must already exist in the notebook session.\n",
+    "    data_files = {'train': train_json, 'validation': test_json}\n",
+    "    dataset = load_dataset(\"json\", data_files=data_files)\n",
+    "    print(f\"first image: {dataset['validation'][0]['image']}, caption: '{dataset['validation'][0]['caption']}'\")\n",
+    "\n",
+    "    !python huggingface_finetune_clip.py \\\n",
+    "        --output_dir {output_folder} --model_name_or_path {repo_id} \\\n",
+    "        --train_file {train_json} \\\n",
+    "        --validation_file {test_json} \\\n",
+    "        --image_column image \\\n",
+    "        --overwrite_output_dir=True \\\n",
+    "        --max_seq_length=77 \\\n",
+    "        --num_train_epochs=10 \\\n",
+    "        --save_total_limit=5 \\\n",
+    "        --caption_column caption \\\n",
+    "        --remove_unused_columns=False \\\n",
+    "        --do_train \\\n",
+    "        --logging_strategy=\"epoch\"\\\n",
+    "        --per_device_train_batch_size=128 \\\n",
+    "        --dataloader_drop_last=True\\\n",
+    "        --learning_rate=\"1e-6\" --warmup_steps=\"0\" --weight_decay 0.1 \n",
+    "    print(f\"--\\nDONE. If it worked, trained data should be in {output_folder}\\n\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "id": "68fd9c3a",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_vtrain1...\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/transformers/pipelines/base.py:1123: UserWarning: You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset\n",
+      "  warnings.warn(\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Accuracy for model 1: 96.96%\n",
+      "\n",
+      "Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_vtrain2...\n",
+      "Accuracy for model 2: 97.10%\n",
+      "\n",
+      "Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_vtrain3...\n",
+      "Accuracy for model 3: 96.72%\n",
+      "\n",
+      "Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_vtrain4...\n",
+      "Accuracy for model 4: 96.40%\n",
+      "\n",
+      "Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_vtrain5...\n",
+      "Accuracy for model 5: 96.40%\n",
+      "\n",
+      "Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_vtrain6...\n",
+      "Accuracy for model 6: 95.84%\n",
+      "\n",
+      "Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_vtrain7...\n",
+      "Accuracy for model 7: 95.79%\n",
+      "\n",
+      "Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_vtrain8...\n",
+      "Accuracy for model 8: 94.62%\n",
+      "\n",
+      "Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_vtrain9...\n",
+      "Accuracy for model 9: 93.12%\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "# 95\n",
+    "import os\n",
+    "import json\n",
+    "from transformers import pipeline, AutoTokenizer, AutoConfig, AutoModel, AutoImageProcessor\n",
+    "from PIL import Image, UnidentifiedImageError\n",
+    "import matplotlib.pyplot as plt\n",
+    "\n",
+    "def chunks(lst, n):\n",
+    "    \"\"\"Yield successive n-sized chunks from lst.\"\"\"\n",
+    "    for i in range(0, len(lst), n):\n",
+    "        yield lst[i:i + n]\n",
+    "\n",
+    "# Load the JSON data\n",
+    "with open('val.json', 'r') as f:\n",
+    "    data = [json.loads(line) for line in f]\n",
+    "\n",
+    "# Extract image paths and labels\n",
+    "image_paths = [item['image'] for item in data]\n",
+    "labels = [item['caption'] for item in data]\n",
+    "\n",
+    "BATCH_SIZE = 128  # Adjust based on your available memory\n",
+    "repo_id = \"laion/CLIP-ViT-B-32-laion2B-s34B-b79K\"\n",
+    "\n",
+    "# Candidate labels are the full set of classes in the validation file.  Passing only\n",
+    "# the ground-truth labels of the current batch leaks the answer: a batch whose images\n",
+    "# all share one class could never be predicted incorrectly.\n",
+    "candidate_labels = sorted(set(labels))\n",
+    "\n",
+    "# The processor and tokenizer always come from the base checkpoint, so load them once.\n",
+    "image_processor = AutoImageProcessor.from_pretrained(repo_id)\n",
+    "tokenizer = AutoTokenizer.from_pretrained(repo_id, config=AutoConfig.from_pretrained(repo_id))\n",
+    "\n",
+    "# Loop for different finetuned models\n",
+    "for i in range(1, 10):  # Assuming you have 9 finetuned models\n",
+    "    model_dir = f\"./workspace/output/laion-finetuned_v5e7_epoch10_vtrain{i}\"\n",
+    "    print(f\"Evaluating with model from {model_dir}...\")\n",
+    "\n",
+    "    model = AutoModel.from_pretrained(model_dir)\n",
+    "    clip_pipeline = pipeline(model=model, task=\"zero-shot-image-classification\", tokenizer=tokenizer,\n",
+    "                             device=1, image_processor=image_processor, config=AutoConfig.from_pretrained(model_dir))\n",
+    "\n",
+    "    all_predictions = []\n",
+    "    all_true_labels = []\n",
+    "\n",
+    "    for batch_paths, batch_labels in zip(chunks(image_paths, BATCH_SIZE), chunks(labels, BATCH_SIZE)):\n",
+    "        batch_images = []\n",
+    "        valid_labels = []\n",
+    "        for path, label in zip(batch_paths, batch_labels):\n",
+    "            try:\n",
+    "                batch_images.append(Image.open(path))\n",
+    "                valid_labels.append(label)\n",
+    "            except (FileNotFoundError, UnidentifiedImageError):\n",
+    "                continue  # Skip images that cannot be opened\n",
+    "\n",
+    "        if not batch_images:\n",
+    "            continue  # Every image in this batch was unreadable\n",
+    "\n",
+    "        # Get predictions for the batch of images\n",
+    "        predictions = clip_pipeline(images=batch_images, candidate_labels=candidate_labels)\n",
+    "        predicted_labels = [pred[0]['label'] for pred in predictions]  # Top prediction\n",
+    "\n",
+    "        all_predictions.extend(predicted_labels)\n",
+    "        all_true_labels.extend(valid_labels)\n",
+    "\n",
+    "    if not all_true_labels:\n",
+    "        raise RuntimeError(\"No images could be opened; check the image paths in the validation file.\")\n",
+    "\n",
+    "    correct_predictions = sum(true == pred for true, pred in zip(all_true_labels, all_predictions))\n",
+    "    accuracy = correct_predictions / len(all_true_labels)\n",
+    "    print(f\"Accuracy for model {i}: {accuracy * 100:.2f}%\\n\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "id": "a363651a",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Accuracy for model 9: 83.15%\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "import os\n",
+    "import json\n",
+    "from transformers import pipeline, AutoTokenizer, AutoConfig, AutoModel, AutoImageProcessor\n",
+    "from PIL import Image, UnidentifiedImageError\n",
+    "import matplotlib.pyplot as plt\n",
+    "\n",
+    "def chunks(lst, n):\n",
+    "    \"\"\"Yield successive n-sized chunks from lst.\"\"\"\n",
+    "    for i in range(0, len(lst), n):\n",
+    "        yield lst[i:i + n]\n",
+    "\n",
+    "# Load the JSON data\n",
+    "with open('val.json', 'r') as f:\n",
+    "    data = [json.loads(line) for line in f]\n",
+    "\n",
+    "# Extract image paths and labels\n",
+    "image_paths = [item['image'] for item in data]\n",
+    "labels = [item['caption'] for item in data]\n",
+    "\n",
+    "BATCH_SIZE = 128  # Adjust based on your available memory\n",
+    "repo_id = \"laion/CLIP-ViT-B-32-laion2B-s34B-b79K\"\n",
+    "\n",
+    "\n",
+    "image_processor = AutoImageProcessor.from_pretrained(repo_id)\n",
+    "tokenizer = AutoTokenizer.from_pretrained(repo_id, config=AutoConfig.from_pretrained(repo_id))\n",
+    "model = AutoModel.from_pretrained(repo_id)\n",
+    "clip_pipeline = pipeline(model=model, task=\"zero-shot-image-classification\", tokenizer=tokenizer,\n",
+    "                         device=1, image_processor=image_processor, config=AutoConfig.from_pretrained(repo_id))\n",
+    "\n",
+    "# Candidate labels are the full set of classes in the validation file.  Passing only\n",
+    "# the ground-truth labels of the current batch leaks the answer: a batch whose images\n",
+    "# all share one class could never be predicted incorrectly.\n",
+    "candidate_labels = sorted(set(labels))\n",
+    "\n",
+    "all_predictions = []\n",
+    "all_true_labels = []\n",
+    "\n",
+    "for batch_paths, batch_labels in zip(chunks(image_paths, BATCH_SIZE), chunks(labels, BATCH_SIZE)):\n",
+    "    batch_images = []\n",
+    "    valid_labels = []\n",
+    "    for path, label in zip(batch_paths, batch_labels):\n",
+    "        try:\n",
+    "            batch_images.append(Image.open(path))\n",
+    "            valid_labels.append(label)\n",
+    "        except (FileNotFoundError, UnidentifiedImageError):\n",
+    "            continue  # Skip images that cannot be opened\n",
+    "\n",
+    "    if not batch_images:\n",
+    "        continue  # Every image in this batch was unreadable\n",
+    "\n",
+    "    # Get predictions for the batch of images\n",
+    "    predictions = clip_pipeline(images=batch_images, candidate_labels=candidate_labels)\n",
+    "    predicted_labels = [pred[0]['label'] for pred in predictions]  # Top prediction\n",
+    "\n",
+    "    all_predictions.extend(predicted_labels)\n",
+    "    all_true_labels.extend(valid_labels)\n",
+    "\n",
+    "if not all_true_labels:\n",
+    "    raise RuntimeError(\"No images could be opened; check the image paths in the validation file.\")\n",
+    "\n",
+    "correct_predictions = sum(true == pred for true, pred in zip(all_true_labels, all_predictions))\n",
+    "accuracy = correct_predictions / len(all_true_labels)\n",
+    "print(f\"Accuracy for model org: {accuracy * 100:.2f}%\\n\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "888c35be",
+   "metadata": {},
+   "source": [
+    "# test on five room task"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 44,
+   "id": "5c21aea1",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "ee89e02ab2b64a0d9b80920bd80c9d89",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "ee40732c911c45058364011932264ba6",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "a38b9b5764d4465a93a9d78f631bc1d5",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating train split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "8939464faa10475789e60ca604cd58e4",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating validation split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "first image: ../room_5/a photo of kitchen/gsun_1c5c2085ba8b7d4a176739aab0998c8d.jpg, caption: 'a photo of kitchen'\n",
+      "/home/haojin/anaconda3/envs/huggingface/bin/python\n",
+      "/home/haojin/anaconda3/envs/huggingface/bin/pip3\r\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "Dataset({\n",
+       "    features: ['image', 'caption'],\n",
+       "    num_rows: 102625\n",
+       "})"
+      ]
+     },
+     "execution_count": 44,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import os\n",
+    "import json\n",
+    "import os\n",
+    "import pathlib\n",
+    "from typing import Generator\n",
+    "from collections import defaultdict\n",
+    "import datasets\n",
+    "from datasets import load_dataset\n",
+    "import sys\n",
+    "\n",
+    "def collect_images_from_directory(directory: str) -> dict:\n",
+    "    \"\"\"Collect ``.jpg``/``.png`` files from each sub-folder of ``directory``.\n",
+    "\n",
+    "    Returns a mapping from sub-folder name (used as the label) to the list of\n",
+    "    image paths found directly inside that sub-folder.  Entries are visited in\n",
+    "    sorted order because ``os.listdir`` returns names in arbitrary order, which\n",
+    "    would make the generated dataset files differ between machines and runs.\n",
+    "    \"\"\"\n",
+    "    images_per_label = defaultdict(list)\n",
+    "\n",
+    "    for label in sorted(os.listdir(directory)):\n",
+    "        subfolder_path = os.path.join(directory, label)\n",
+    "        if not os.path.isdir(subfolder_path):\n",
+    "            continue\n",
+    "        for filename in sorted(os.listdir(subfolder_path)):\n",
+    "            # str.endswith accepts a tuple of suffixes (matching stays case-sensitive).\n",
+    "            if filename.endswith(('.jpg', '.png')):\n",
+    "                images_per_label[label].append(os.path.join(subfolder_path, filename))\n",
+    "\n",
+    "    return images_per_label\n",
+    "\n",
+    "def prepare_data_for_finetuning(train_dir: str, test_dir: str) -> tuple:\n",
+    "    \"\"\"Write JSON-lines caption files for the images under ``train_dir`` and ``test_dir``.\n",
+    "\n",
+    "    Each sub-folder name is used as the caption of the images it contains.\n",
+    "    Returns the ``(train_file, val_file)`` paths that were written.\n",
+    "    \"\"\"\n",
+    "    def as_records(directory):\n",
+    "        # One image/caption record per collected image.\n",
+    "        return [\n",
+    "            {\"image\": image_path, \"caption\": caption}\n",
+    "            for caption, image_paths in collect_images_from_directory(directory).items()\n",
+    "            for image_path in image_paths\n",
+    "        ]\n",
+    "\n",
+    "    def write_json_lines(records, destination):\n",
+    "        # One JSON object per line.\n",
+    "        with open(destination, \"w\") as handle:\n",
+    "            for record in records:\n",
+    "                handle.write(json.dumps(record) + \"\\n\")\n",
+    "\n",
+    "    train_records = as_records(train_dir)\n",
+    "    val_records = as_records(test_dir)\n",
+    "\n",
+    "    # Both files are written relative to the current working directory.\n",
+    "    train_file, val_file = \"room_train.json\", \"room_val.json\"\n",
+    "    write_json_lines(train_records, train_file)\n",
+    "    write_json_lines(val_records, val_file)\n",
+    "\n",
+    "    return train_file, val_file\n",
+    "\n",
+    "# Usage:\n",
+    "train_json, test_json = prepare_data_for_finetuning(\"../room_5\", \"../room_5\")\n",
+    "data_files = {'train': train_json, 'validation': test_json}\n",
+    "\n",
+    "# test loading it back in\n",
+    "\n",
+    "dataset = load_dataset(\"json\", data_files=data_files)\n",
+    "print(f\"first image: {dataset['validation'][0]['image']}, caption: '{dataset['validation'][0]['caption']}'\")\n",
+    "\n",
+    "print(sys.executable)\n",
+    "!which pip3\n",
+    "dataset['validation']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 45,
+   "id": "e5ce841f",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Accuracy for model: 99.97%\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "# test on original ckpt\n",
+    "import os\n",
+    "import json\n",
+    "from transformers import pipeline, AutoTokenizer, AutoConfig, AutoModel, AutoImageProcessor\n",
+    "from PIL import Image, UnidentifiedImageError\n",
+    "import matplotlib.pyplot as plt\n",
+    "\n",
+    "def chunks(lst, n):\n",
+    "    \"\"\"Yield successive n-sized chunks from lst.\"\"\"\n",
+    "    for i in range(0, len(lst), n):\n",
+    "        yield lst[i:i + n]\n",
+    "\n",
+    "# Load the JSON data\n",
+    "with open('room_val.json', 'r') as f:\n",
+    "    data = [json.loads(line) for line in f]\n",
+    "\n",
+    "# Extract image paths and labels\n",
+    "image_paths = [item['image'] for item in data]\n",
+    "labels = [item['caption'] for item in data]\n",
+    "\n",
+    "BATCH_SIZE = 128  # Adjust based on your available memory\n",
+    "repo_id = \"laion/CLIP-ViT-B-32-laion2B-s34B-b79K\"\n",
+    "\n",
+    "\n",
+    "image_processor = AutoImageProcessor.from_pretrained(repo_id)\n",
+    "tokenizer = AutoTokenizer.from_pretrained(repo_id, config=AutoConfig.from_pretrained(repo_id))\n",
+    "model = AutoModel.from_pretrained(repo_id)\n",
+    "clip_pipeline = pipeline(model=model, task=\"zero-shot-image-classification\", tokenizer=tokenizer,\n",
+    "                         device=1, image_processor=image_processor, config=AutoConfig.from_pretrained(repo_id))\n",
+    "\n",
+    "# Candidate labels are the full set of classes in the validation file.  Passing only\n",
+    "# the ground-truth labels of the current batch leaks the answer: a batch whose images\n",
+    "# all share one class could never be predicted incorrectly.\n",
+    "candidate_labels = sorted(set(labels))\n",
+    "\n",
+    "all_predictions = []\n",
+    "all_true_labels = []\n",
+    "\n",
+    "for batch_paths, batch_labels in zip(chunks(image_paths, BATCH_SIZE), chunks(labels, BATCH_SIZE)):\n",
+    "    batch_images = []\n",
+    "    valid_labels = []\n",
+    "    for path, label in zip(batch_paths, batch_labels):\n",
+    "        try:\n",
+    "            batch_images.append(Image.open(path))\n",
+    "            valid_labels.append(label)\n",
+    "        except (FileNotFoundError, UnidentifiedImageError):\n",
+    "            continue  # Skip images that cannot be opened\n",
+    "\n",
+    "    if not batch_images:\n",
+    "        continue  # Every image in this batch was unreadable\n",
+    "\n",
+    "    # Get predictions for the batch of images\n",
+    "    predictions = clip_pipeline(images=batch_images, candidate_labels=candidate_labels)\n",
+    "    predicted_labels = [pred[0]['label'] for pred in predictions]  # Top prediction\n",
+    "\n",
+    "    all_predictions.extend(predicted_labels)\n",
+    "    all_true_labels.extend(valid_labels)\n",
+    "\n",
+    "if not all_true_labels:\n",
+    "    raise RuntimeError(\"No images could be opened; check the image paths in the validation file.\")\n",
+    "\n",
+    "correct_predictions = sum(true == pred for true, pred in zip(all_true_labels, all_predictions))\n",
+    "accuracy = correct_predictions / len(all_true_labels)\n",
+    "print(f\"Accuracy for model: {accuracy * 100:.2f}%\\n\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 46,
+   "id": "fb17bda0",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_vtrain1...\n",
+      "Accuracy for model 1: 99.96%\n",
+      "\n",
+      "Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_vtrain2...\n",
+      "Accuracy for model 2: 99.96%\n",
+      "\n",
+      "Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_vtrain3...\n",
+      "Accuracy for model 3: 99.96%\n",
+      "\n",
+      "Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_vtrain4...\n",
+      "Accuracy for model 4: 99.96%\n",
+      "\n",
+      "Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_vtrain5...\n",
+      "Accuracy for model 5: 99.96%\n",
+      "\n",
+      "Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_vtrain6...\n",
+      "Accuracy for model 6: 99.96%\n",
+      "\n",
+      "Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_vtrain7...\n",
+      "Accuracy for model 7: 99.97%\n",
+      "\n",
+      "Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_vtrain8...\n",
+      "Accuracy for model 8: 99.97%\n",
+      "\n",
+      "Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_vtrain9...\n",
+      "Accuracy for model 9: 99.97%\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "import os\n",
+    "import json\n",
+    "from transformers import pipeline, AutoTokenizer, AutoConfig, AutoModel, AutoImageProcessor\n",
+    "from PIL import Image, UnidentifiedImageError\n",
+    "import matplotlib.pyplot as plt\n",
+    "\n",
+    "def chunks(lst, n):\n",
+    "    \"\"\"Yield successive n-sized chunks from lst.\"\"\"\n",
+    "    for i in range(0, len(lst), n):\n",
+    "        yield lst[i:i + n]\n",
+    "\n",
+    "# Load the JSON data\n",
+    "with open('room_val.json', 'r') as f:\n",
+    "    data = [json.loads(line) for line in f]\n",
+    "\n",
+    "# Extract image paths and labels\n",
+    "image_paths = [item['image'] for item in data]\n",
+    "labels = [item['caption'] for item in data]\n",
+    "\n",
+    "BATCH_SIZE = 128  # Adjust based on your available memory\n",
+    "repo_id = \"laion/CLIP-ViT-B-32-laion2B-s34B-b79K\"\n",
+    "\n",
+    "# Candidate labels are the full set of classes in the validation file.  Passing only\n",
+    "# the ground-truth labels of the current batch leaks the answer: a batch whose images\n",
+    "# all share one class could never be predicted incorrectly.\n",
+    "candidate_labels = sorted(set(labels))\n",
+    "\n",
+    "# The processor and tokenizer always come from the base checkpoint, so load them once.\n",
+    "image_processor = AutoImageProcessor.from_pretrained(repo_id)\n",
+    "tokenizer = AutoTokenizer.from_pretrained(repo_id, config=AutoConfig.from_pretrained(repo_id))\n",
+    "\n",
+    "# Loop for different finetuned models\n",
+    "for i in range(1, 10):  # Assuming you have 9 finetuned models\n",
+    "    model_dir = f\"./workspace/output/laion-finetuned_v5e7_epoch10_vtrain{i}\"\n",
+    "    print(f\"Evaluating with model from {model_dir}...\")\n",
+    "\n",
+    "    model = AutoModel.from_pretrained(model_dir)\n",
+    "    clip_pipeline = pipeline(model=model, task=\"zero-shot-image-classification\", tokenizer=tokenizer,\n",
+    "                             device=1, image_processor=image_processor, config=AutoConfig.from_pretrained(model_dir))\n",
+    "\n",
+    "    all_predictions = []\n",
+    "    all_true_labels = []\n",
+    "\n",
+    "    for batch_paths, batch_labels in zip(chunks(image_paths, BATCH_SIZE), chunks(labels, BATCH_SIZE)):\n",
+    "        batch_images = []\n",
+    "        valid_labels = []\n",
+    "        for path, label in zip(batch_paths, batch_labels):\n",
+    "            try:\n",
+    "                batch_images.append(Image.open(path))\n",
+    "                valid_labels.append(label)\n",
+    "            except (FileNotFoundError, UnidentifiedImageError):\n",
+    "                continue  # Skip images that cannot be opened\n",
+    "\n",
+    "        if not batch_images:\n",
+    "            continue  # Every image in this batch was unreadable\n",
+    "\n",
+    "        # Get predictions for the batch of images\n",
+    "        predictions = clip_pipeline(images=batch_images, candidate_labels=candidate_labels)\n",
+    "        predicted_labels = [pred[0]['label'] for pred in predictions]  # Top prediction\n",
+    "\n",
+    "        all_predictions.extend(predicted_labels)\n",
+    "        all_true_labels.extend(valid_labels)\n",
+    "\n",
+    "    if not all_true_labels:\n",
+    "        raise RuntimeError(\"No images could be opened; check the image paths in the validation file.\")\n",
+    "\n",
+    "    correct_predictions = sum(true == pred for true, pred in zip(all_true_labels, all_predictions))\n",
+    "    accuracy = correct_predictions / len(all_true_labels)\n",
+    "    print(f\"Accuracy for model {i}: {accuracy * 100:.2f}%\\n\")\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "5952751e",
+   "metadata": {},
+   "source": [
+    "# test on original dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "0e4e3088",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "83c174061bcb48929db0ce2f479221a4",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "d68013a6b35746a8b0d88ec768cf129f",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "37709a5f432a4c6b862e503f4857d4c2",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating train split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "8695a303f8e94e3bbcf1309710d309f4",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating validation split: 0 examples [00:00, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "first image: ../class_4/a photo of standard bathroom/IMG-C5472473_15.jpg, caption: 'a photo of standard bathroom'\n",
+      "/home/haojin/anaconda3/envs/huggingface/bin/python\n",
+      "/home/haojin/anaconda3/envs/huggingface/bin/pip3\r\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "Dataset({\n",
+       "    features: ['image', 'caption'],\n",
+       "    num_rows: 752\n",
+       "})"
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import os\n",
+    "import json\n",
+    "import os\n",
+    "import pathlib\n",
+    "from typing import Generator\n",
+    "from collections import defaultdict\n",
+    "import datasets\n",
+    "from datasets import load_dataset\n",
+    "import sys\n",
+    "\n",
+    "def collect_images_from_directory(directory: str) -> dict:\n",
+    "    \"\"\"Collect ``.jpg``/``.png`` files from each sub-folder of ``directory``.\n",
+    "\n",
+    "    Returns a mapping from sub-folder name (used as the label) to the list of\n",
+    "    image paths found directly inside that sub-folder.  Entries are visited in\n",
+    "    sorted order because ``os.listdir`` returns names in arbitrary order, which\n",
+    "    would make the generated dataset files differ between machines and runs.\n",
+    "    \"\"\"\n",
+    "    images_per_label = defaultdict(list)\n",
+    "\n",
+    "    for label in sorted(os.listdir(directory)):\n",
+    "        subfolder_path = os.path.join(directory, label)\n",
+    "        if not os.path.isdir(subfolder_path):\n",
+    "            continue\n",
+    "        for filename in sorted(os.listdir(subfolder_path)):\n",
+    "            # str.endswith accepts a tuple of suffixes (matching stays case-sensitive).\n",
+    "            if filename.endswith(('.jpg', '.png')):\n",
+    "                images_per_label[label].append(os.path.join(subfolder_path, filename))\n",
+    "\n",
+    "    return images_per_label\n",
+    "\n",
+    "def prepare_data_for_finetuning(train_dir: str, test_dir: str) -> tuple:\n",
+    "    \"\"\"Write JSON-lines caption files for the images under ``train_dir`` and ``test_dir``.\n",
+    "\n",
+    "    Each sub-folder name is used as the caption of the images it contains.\n",
+    "    Returns the ``(train_file, val_file)`` paths that were written.\n",
+    "    \"\"\"\n",
+    "    def as_records(directory):\n",
+    "        # One image/caption record per collected image.\n",
+    "        return [\n",
+    "            {\"image\": image_path, \"caption\": caption}\n",
+    "            for caption, image_paths in collect_images_from_directory(directory).items()\n",
+    "            for image_path in image_paths\n",
+    "        ]\n",
+    "\n",
+    "    def write_json_lines(records, destination):\n",
+    "        # One JSON object per line.\n",
+    "        with open(destination, \"w\") as handle:\n",
+    "            for record in records:\n",
+    "                handle.write(json.dumps(record) + \"\\n\")\n",
+    "\n",
+    "    train_records = as_records(train_dir)\n",
+    "    val_records = as_records(test_dir)\n",
+    "\n",
+    "    # Both files are written relative to the current working directory.\n",
+    "    train_file, val_file = \"kb_train.json\", \"kb_val.json\"\n",
+    "    write_json_lines(train_records, train_file)\n",
+    "    write_json_lines(val_records, val_file)\n",
+    "\n",
+    "    return train_file, val_file\n",
+    "\n",
+    "# Usage:\n",
+    "train_json, test_json = prepare_data_for_finetuning(\"../class_4\", \"../class_4\")\n",
+    "data_files = {'train': train_json, 'validation': test_json}\n",
+    "\n",
+    "# test loading it back in\n",
+    "\n",
+    "dataset = load_dataset(\"json\", data_files=data_files)\n",
+    "print(f\"first image: {dataset['validation'][0]['image']}, caption: '{dataset['validation'][0]['caption']}'\")\n",
+    "\n",
+    "print(sys.executable)\n",
+    "!which pip3\n",
+    "dataset['validation']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "115d5794",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration. Please open a PR/issue to update `preprocessor_config.json` to use `image_processor_type` instead of `feature_extractor_type`. This warning will be removed in v4.40.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Accuracy for model: 99.87%\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "# test on original ckpt\n",
+    "import os\n",
+    "import json\n",
+    "from transformers import pipeline, AutoTokenizer, AutoConfig, AutoModel, AutoImageProcessor\n",
+    "from PIL import Image, UnidentifiedImageError\n",
+    "import matplotlib.pyplot as plt\n",
+    "\n",
+    "def chunks(lst, n):\n",
+    "    \"\"\"Yield successive n-sized chunks from lst.\"\"\"\n",
+    "    for i in range(0, len(lst), n):\n",
+    "        yield lst[i:i + n]\n",
+    "\n",
+    "# Load the JSON data\n",
+    "with open('kb_val.json', 'r') as f:\n",
+    "    data = [json.loads(line) for line in f]\n",
+    "\n",
+    "# Extract image paths and labels\n",
+    "image_paths = [item['image'] for item in data]\n",
+    "labels = [item['caption'] for item in data]\n",
+    "\n",
+    "BATCH_SIZE = 128  # Adjust based on your available memory\n",
+    "repo_id = \"laion/CLIP-ViT-B-32-laion2B-s34B-b79K\"\n",
+    "\n",
+    "\n",
+    "image_processor = AutoImageProcessor.from_pretrained(repo_id)\n",
+    "tokenizer = AutoTokenizer.from_pretrained(repo_id, config=AutoConfig.from_pretrained(repo_id))\n",
+    "model = AutoModel.from_pretrained(repo_id)\n",
+    "clip_pipeline = pipeline(model=model, task=\"zero-shot-image-classification\", tokenizer=tokenizer,\n",
+    "                         device=1, image_processor=image_processor, config=AutoConfig.from_pretrained(repo_id))\n",
+    "\n",
+    "all_predictions = []\n",
+    "all_true_labels = []\n",
+    "\n",
+    "for batch_paths, batch_labels in zip(chunks(image_paths, BATCH_SIZE), chunks(labels, BATCH_SIZE)):\n",
+    "    batch_images = []\n",
+    "    valid_labels = []\n",
+    "    for path, label in zip(batch_paths, batch_labels):\n",
+    "        try:\n",
+    "            batch_images.append(Image.open(path))\n",
+    "            valid_labels.append(label)\n",
+    "        except (FileNotFoundError, UnidentifiedImageError):\n",
+    "            continue  # Skip images that cannot be opened\n",
+    "\n",
+    "    # Get predictions for the batch of images\n",
+    "    predictions = clip_pipeline(images=batch_images, candidate_labels=valid_labels)\n",
+    "    predicted_labels = [pred[0]['label'] for pred in predictions]  # Top prediction\n",
+    "\n",
+    "    all_predictions.extend(predicted_labels)\n",
+    "    all_true_labels.extend(valid_labels)\n",
+    "\n",
+    "correct_predictions = sum([true == pred for true, pred in zip(all_true_labels, all_predictions)])\n",
+    "accuracy = correct_predictions / len(all_true_labels)\n",
+    "print(f\"Accuracy for model: {accuracy * 100:.2f}%\\n\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "3050a6d1",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration. Please open a PR/issue to update `preprocessor_config.json` to use `image_processor_type` instead of `feature_extractor_type`. This warning will be removed in v4.40.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Accuracy for class 'a photo of standard bathroom': 100.00%\n",
+      "Accuracy for class 'a photo of standard kitchen': 100.00%\n",
+      "Accuracy for class 'a photo of contemporary bathroom': 99.15%\n",
+      "Accuracy for class 'a photo of contemporary kitchen': 100.00%\n",
+      "\n",
+      "Mean accuracy over all classes: 99.79%\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "# accuracy per class (original checkpoint)\n",
+    "import os\n",
+    "import json\n",
+    "from transformers import pipeline, AutoTokenizer, AutoConfig, AutoModel, AutoImageProcessor\n",
+    "from PIL import Image, UnidentifiedImageError\n",
+    "import matplotlib.pyplot as plt\n",
+    "\n",
+    "def chunks(lst, n):\n",
+    "    \"\"\"Yield successive n-sized chunks from lst.\"\"\"\n",
+    "    for i in range(0, len(lst), n):\n",
+    "        yield lst[i:i + n]\n",
+    "\n",
+    "# Load the JSON data\n",
+    "with open('kb_val.json', 'r') as f:\n",
+    "    data = [json.loads(line) for line in f]\n",
+    "\n",
+    "# Extract image paths and labels\n",
+    "image_paths = [item['image'] for item in data]\n",
+    "labels = [item['caption'] for item in data]\n",
+    "\n",
+    "BATCH_SIZE = 128  # Adjust based on your available memory\n",
+    "repo_id = \"laion/CLIP-ViT-B-32-laion2B-s34B-b79K\"\n",
+    "\n",
+    "image_processor = AutoImageProcessor.from_pretrained(repo_id)\n",
+    "tokenizer = AutoTokenizer.from_pretrained(repo_id, config=AutoConfig.from_pretrained(repo_id))\n",
+    "model = AutoModel.from_pretrained(repo_id)\n",
+    "clip_pipeline = pipeline(model=model, task=\"zero-shot-image-classification\", tokenizer=tokenizer,\n",
+    "                         device=1, image_processor=image_processor, config=AutoConfig.from_pretrained(repo_id))\n",
+    "\n",
+    "all_predictions = []\n",
+    "all_true_labels = []\n",
+    "\n",
+    "class_accuracy = {}  # Dictionary to track accuracy per class\n",
+    "\n",
+    "for batch_paths, batch_labels in zip(chunks(image_paths, BATCH_SIZE), chunks(labels, BATCH_SIZE)):\n",
+    "    batch_images = []\n",
+    "    valid_labels = []\n",
+    "    for path, label in zip(batch_paths, batch_labels):\n",
+    "        try:\n",
+    "            batch_images.append(Image.open(path))\n",
+    "            valid_labels.append(label)\n",
+    "        except (FileNotFoundError, UnidentifiedImageError):\n",
+    "            continue  # Skip images that cannot be opened\n",
+    "\n",
+    "    # Get predictions for the batch of images\n",
+    "    predictions = clip_pipeline(images=batch_images, candidate_labels=valid_labels)\n",
+    "    predicted_labels = [pred[0]['label'] for pred in predictions]  # Top prediction\n",
+    "\n",
+    "    all_predictions.extend(predicted_labels)\n",
+    "    all_true_labels.extend(valid_labels)\n",
+    "\n",
+    "    # Update class accuracy counts\n",
+    "    for true_label, predicted_label in zip(valid_labels, predicted_labels):\n",
+    "        if true_label not in class_accuracy:\n",
+    "            class_accuracy[true_label] = {'correct': 0, 'total': 0}\n",
+    "        class_accuracy[true_label]['total'] += 1\n",
+    "        if true_label == predicted_label:\n",
+    "            class_accuracy[true_label]['correct'] += 1\n",
+    "\n",
+    "# Print accuracy per class and calculate mean accuracy\n",
+    "mean_accuracy = 0\n",
+    "for class_label, counts in class_accuracy.items():\n",
+    "    class_acc = counts['correct'] / counts['total']\n",
+    "    mean_accuracy += class_acc\n",
+    "    print(f\"Accuracy for class '{class_label}': {class_acc * 100:.2f}%\")\n",
+    "mean_accuracy /= len(class_accuracy)\n",
+    "print(f\"\\nMean accuracy over all classes: {mean_accuracy * 100:.2f}%\\n\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "96073841",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration. Please open a PR/issue to update `preprocessor_config.json` to use `image_processor_type` instead of `feature_extractor_type`. This warning will be removed in v4.40.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Accuracy for class 'a photo of standard bathroom': 100.00%\n",
+      "Accuracy for class 'a photo of standard kitchen': 100.00%\n",
+      "Accuracy for class 'a photo of contemporary bathroom': 99.15%\n",
+      "Accuracy for class 'a photo of contemporary kitchen': 100.00%\n",
+      "\n",
+      "Mean accuracy over all classes: 99.79%\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "# accuracy per class (fine-tuned checkpoint laion-finetuned_v5e7_epoch10_vtrain2)\n",
+    "import os\n",
+    "import json\n",
+    "from transformers import pipeline, AutoTokenizer, AutoConfig, AutoModel, AutoImageProcessor\n",
+    "from PIL import Image, UnidentifiedImageError\n",
+    "import matplotlib.pyplot as plt\n",
+    "\n",
+    "def chunks(lst, n):\n",
+    "    \"\"\"Yield successive n-sized chunks from lst.\"\"\"\n",
+    "    for i in range(0, len(lst), n):\n",
+    "        yield lst[i:i + n]\n",
+    "\n",
+    "# Load the JSON data\n",
+    "with open('kb_val.json', 'r') as f:\n",
+    "    data = [json.loads(line) for line in f]\n",
+    "\n",
+    "# Extract image paths and labels\n",
+    "image_paths = [item['image'] for item in data]\n",
+    "labels = [item['caption'] for item in data]\n",
+    "\n",
+    "BATCH_SIZE = 128  # Adjust based on your available memory\n",
+    "repo_id = \"laion/CLIP-ViT-B-32-laion2B-s34B-b79K\"\n",
+    "dir = \"./workspace/output/laion-finetuned_v5e7_epoch10_vtrain2\"\n",
+    "image_processor = AutoImageProcessor.from_pretrained(repo_id)\n",
+    "tokenizer = AutoTokenizer.from_pretrained(repo_id, config=AutoConfig.from_pretrained(repo_id))\n",
+    "model = AutoModel.from_pretrained(dir)\n",
+    "clip_pipeline = pipeline(model=model, task=\"zero-shot-image-classification\", tokenizer=tokenizer,\n",
+    "                         device=1, image_processor=image_processor, config=AutoConfig.from_pretrained(dir))\n",
+    "\n",
+    "all_predictions = []\n",
+    "all_true_labels = []\n",
+    "\n",
+    "class_accuracy = {}  # Dictionary to track accuracy per class\n",
+    "\n",
+    "for batch_paths, batch_labels in zip(chunks(image_paths, BATCH_SIZE), chunks(labels, BATCH_SIZE)):\n",
+    "    batch_images = []\n",
+    "    valid_labels = []\n",
+    "    for path, label in zip(batch_paths, batch_labels):\n",
+    "        try:\n",
+    "            batch_images.append(Image.open(path))\n",
+    "            valid_labels.append(label)\n",
+    "        except (FileNotFoundError, UnidentifiedImageError):\n",
+    "            continue  # Skip images that cannot be opened\n",
+    "\n",
+    "    # Get predictions for the batch of images\n",
+    "    predictions = clip_pipeline(images=batch_images, candidate_labels=valid_labels)\n",
+    "    predicted_labels = [pred[0]['label'] for pred in predictions]  # Top prediction\n",
+    "\n",
+    "    all_predictions.extend(predicted_labels)\n",
+    "    all_true_labels.extend(valid_labels)\n",
+    "\n",
+    "    # Update class accuracy counts\n",
+    "    for true_label, predicted_label in zip(valid_labels, predicted_labels):\n",
+    "        if true_label not in class_accuracy:\n",
+    "            class_accuracy[true_label] = {'correct': 0, 'total': 0}\n",
+    "        class_accuracy[true_label]['total'] += 1\n",
+    "        if true_label == predicted_label:\n",
+    "            class_accuracy[true_label]['correct'] += 1\n",
+    "\n",
+    "# Print accuracy per class and calculate mean accuracy\n",
+    "mean_accuracy = 0\n",
+    "for class_label, counts in class_accuracy.items():\n",
+    "    class_acc = counts['correct'] / counts['total']\n",
+    "    mean_accuracy += class_acc\n",
+    "    print(f\"Accuracy for class '{class_label}': {class_acc * 100:.2f}%\")\n",
+    "mean_accuracy /= len(class_accuracy)\n",
+    "print(f\"\\nMean accuracy over all classes: {mean_accuracy * 100:.2f}%\\n\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "a1e34089",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration. Please open a PR/issue to update `preprocessor_config.json` to use `image_processor_type` instead of `feature_extractor_type`. This warning will be removed in v4.40.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_vtrain1...\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration. Please open a PR/issue to update `preprocessor_config.json` to use `image_processor_type` instead of `feature_extractor_type`. This warning will be removed in v4.40.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Accuracy for model 1: 99.73%\n",
+      "\n",
+      "Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_vtrain2...\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration. Please open a PR/issue to update `preprocessor_config.json` to use `image_processor_type` instead of `feature_extractor_type`. This warning will be removed in v4.40.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Accuracy for model 2: 99.87%\n",
+      "\n",
+      "Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_vtrain3...\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration. Please open a PR/issue to update `preprocessor_config.json` to use `image_processor_type` instead of `feature_extractor_type`. This warning will be removed in v4.40.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Accuracy for model 3: 99.87%\n",
+      "\n",
+      "Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_vtrain4...\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration. Please open a PR/issue to update `preprocessor_config.json` to use `image_processor_type` instead of `feature_extractor_type`. This warning will be removed in v4.40.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Accuracy for model 4: 99.87%\n",
+      "\n",
+      "Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_vtrain5...\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration. Please open a PR/issue to update `preprocessor_config.json` to use `image_processor_type` instead of `feature_extractor_type`. This warning will be removed in v4.40.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Accuracy for model 5: 99.87%\n",
+      "\n",
+      "Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_vtrain6...\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration. Please open a PR/issue to update `preprocessor_config.json` to use `image_processor_type` instead of `feature_extractor_type`. This warning will be removed in v4.40.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Accuracy for model 6: 99.87%\n",
+      "\n",
+      "Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_vtrain7...\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration. Please open a PR/issue to update `preprocessor_config.json` to use `image_processor_type` instead of `feature_extractor_type`. This warning will be removed in v4.40.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Accuracy for model 7: 99.87%\n",
+      "\n",
+      "Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_vtrain8...\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration. Please open a PR/issue to update `preprocessor_config.json` to use `image_processor_type` instead of `feature_extractor_type`. This warning will be removed in v4.40.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Accuracy for model 8: 99.87%\n",
+      "\n",
+      "Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_vtrain9...\n",
+      "Accuracy for model 9: 99.87%\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "import os\n",
+    "import json\n",
+    "from transformers import pipeline, AutoTokenizer, AutoConfig, AutoModel, AutoImageProcessor\n",
+    "from PIL import Image, UnidentifiedImageError\n",
+    "import matplotlib.pyplot as plt\n",
+    "\n",
+    "def chunks(lst, n):\n",
+    "    \"\"\"Yield successive n-sized chunks from lst.\"\"\"\n",
+    "    for i in range(0, len(lst), n):\n",
+    "        yield lst[i:i + n]\n",
+    "\n",
+    "# Load the JSON data\n",
+    "with open('kb_val.json', 'r') as f:\n",
+    "    data = [json.loads(line) for line in f]\n",
+    "\n",
+    "# Extract image paths and labels\n",
+    "image_paths = [item['image'] for item in data]\n",
+    "labels = [item['caption'] for item in data]\n",
+    "\n",
+    "BATCH_SIZE = 128  # Adjust based on your available memory\n",
+    "repo_id = \"laion/CLIP-ViT-B-32-laion2B-s34B-b79K\"\n",
+    "\n",
+    "# Loop for different finetuned models\n",
+    "for i in range(1, 10):  # Assuming you have 9 finetuned models\n",
+    "    dir = f\"./workspace/output/laion-finetuned_v5e7_epoch10_vtrain{i}\"\n",
+    "    print(f\"Evaluating with model from {dir}...\")\n",
+    "\n",
+    "    image_processor = AutoImageProcessor.from_pretrained(repo_id)\n",
+    "    tokenizer = AutoTokenizer.from_pretrained(repo_id, config=AutoConfig.from_pretrained(repo_id))\n",
+    "    model = AutoModel.from_pretrained(dir)\n",
+    "    clip_pipeline = pipeline(model=model, task=\"zero-shot-image-classification\", tokenizer=tokenizer,\n",
+    "                             device=1, image_processor=image_processor, config=AutoConfig.from_pretrained(dir))\n",
+    "\n",
+    "    all_predictions = []\n",
+    "    all_true_labels = []\n",
+    "\n",
+    "    for batch_paths, batch_labels in zip(chunks(image_paths, BATCH_SIZE), chunks(labels, BATCH_SIZE)):\n",
+    "        batch_images = []\n",
+    "        valid_labels = []\n",
+    "        for path, label in zip(batch_paths, batch_labels):\n",
+    "            try:\n",
+    "                batch_images.append(Image.open(path))\n",
+    "                valid_labels.append(label)\n",
+    "            except (FileNotFoundError, UnidentifiedImageError):\n",
+    "                continue  # Skip images that cannot be opened\n",
+    "\n",
+    "        # Get predictions for the batch of images\n",
+    "        predictions = clip_pipeline(images=batch_images, candidate_labels=valid_labels)\n",
+    "        predicted_labels = [pred[0]['label'] for pred in predictions]  # Top prediction\n",
+    "\n",
+    "        all_predictions.extend(predicted_labels)\n",
+    "        all_true_labels.extend(valid_labels)\n",
+    "\n",
+    "    correct_predictions = sum([true == pred for true, pred in zip(all_true_labels, all_predictions)])\n",
+    "    accuracy = correct_predictions / len(all_true_labels)\n",
+    "    print(f\"Accuracy for model {i}: {accuracy * 100:.2f}%\\n\")\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "cb4d1e91",
+   "metadata": {},
+   "source": [
+    "# quantization small model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c417e784",
+   "metadata": {
+    "jupyter": {
+     "is_executing": true
+    }
+   },
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import torch\n",
+    "\n",
+    "# Hide all GPUs from CUDA: an empty CUDA_VISIBLE_DEVICES exposes no devices (CPU only)\n",
+    "os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"\"\n",
+    "\n",
+    "# Check if CUDA (GPU support) is available\n",
+    "if torch.cuda.is_available():\n",
+    "    print(f\"CUDA is available. Number of GPUs: {torch.cuda.device_count()}\")\n",
+    "    \n",
+    "    # Loop through and print details of each GPU\n",
+    "    for i in range(torch.cuda.device_count()):\n",
+    "        print(f\"GPU {i}: {torch.cuda.get_device_name(i)}\")\n",
+    "else:\n",
+    "    print(\"CUDA is not available. Only CPU will be used.\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "d9081f63",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "meta\n"
+     ]
+    }
+   ],
+   "source": [
+    "from accelerate.utils import BnbQuantizationConfig\n",
+    "bnb_quantization_config = BnbQuantizationConfig(load_in_8bit=True)\n",
+    "from accelerate.utils import load_and_quantize_model\n",
+    "from transformers import CLIPProcessor, CLIPModel, AutoModel\n",
+    "from accelerate import init_empty_weights\n",
+    "from transformers import AutoTokenizer, AutoConfig, AutoModel,AutoModelForZeroShotImageClassification\n",
+    "config = AutoConfig.from_pretrained(\"laion/CLIP-ViT-B-32-laion2B-s34B-b79K\")\n",
+    "with init_empty_weights():\n",
+    "    empty_model = AutoModelForZeroShotImageClassification.from_config(config)\n",
+    "    #CLIP(config=AutoConfig.from_pretrained(\"openai/clip-vit-large-patch14\"))\n",
+    "\n",
+    "# Print the device of the weightless skeleton model (expected: meta)\n",
+    "print(empty_model.device)\n",
+    "dir = \"./workspace/output/laion-finetuned_v5e7_epoch10_vtrain2\"\n",
+    "model = AutoModel.from_pretrained(dir)\n",
+    "from huggingface_hub import snapshot_download\n",
+    "weights_location = \"./workspace/output/laion-finetuned_v5e7_epoch10_vtrain2/model.safetensors\"\n",
+    "quantized_model = load_and_quantize_model(empty_model, weights_location=weights_location, bnb_quantization_config=bnb_quantization_config, device_map = \"auto\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "2ad06683",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration. Please open a PR/issue to update `preprocessor_config.json` to use `image_processor_type` instead of `feature_extractor_type`. This warning will be removed in v4.40.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Accuracy for model 0: 96.81%\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "import os\n",
+    "import csv\n",
+    "from transformers import CLIPProcessor, CLIPModel, pipeline, CLIPImageProcessor\n",
+    "from PIL import Image\n",
+    "from transformers import AutoTokenizer, AutoConfig, AutoModel,AutoImageProcessor\n",
+    "import matplotlib.pyplot as plt\n",
+    "import time\n",
+    "import json\n",
+    "\n",
+    "# Load the JSON data\n",
+    "with open('kb_val.json', 'r') as f:\n",
+    "    data = [json.loads(line) for line in f]\n",
+    "\n",
+    "# Extract image paths and labels\n",
+    "image_paths = [item['image'] for item in data]\n",
+    "labels = [item['caption'] for item in data]\n",
+    "\n",
+    "repo_id =  \"laion/CLIP-ViT-B-32-laion2B-s34B-b79K\"\n",
+    "dir = \"./workspace/output/laion-finetuned_v5e7_epoch10_vtrain2\"\n",
+    "image_processor = AutoImageProcessor.from_pretrained(\n",
+    "    repo_id\n",
+    ")\n",
+    "tokenizer = AutoTokenizer.from_pretrained(repo_id, \n",
+    "                                          config=AutoConfig.from_pretrained(repo_id))\n",
+    "model = quantized_model\n",
+    "clip_pipeline = pipeline(model=model,task=\"zero-shot-image-classification\", tokenizer=tokenizer,\n",
+    "                    image_processor=image_processor, config=AutoConfig.from_pretrained(dir),\n",
+    "                        device_map=\"auto\", model_kwargs={\"load_in_8bit\": True})\n",
+    "\n",
+    "def chunks(lst, n):\n",
+    "    \"\"\"Yield successive n-sized chunks from lst.\"\"\"\n",
+    "    for i in range(0, len(lst), n):\n",
+    "        yield lst[i:i + n]\n",
+    "\n",
+    "BATCH_SIZE = 256  # Adjust based on your available memory\n",
+    "\n",
+    "all_predictions = []\n",
+    "all_true_labels = []\n",
+    "time_eval = []\n",
+    "\n",
+    "for batch_paths, batch_labels in zip(chunks(image_paths, BATCH_SIZE), chunks(labels, BATCH_SIZE)):\n",
+    "    batch_images = []\n",
+    "    valid_labels = []\n",
+    "    for path, label in zip(batch_paths, batch_labels):\n",
+    "        try:\n",
+    "            batch_images.append(Image.open(path))\n",
+    "            valid_labels.append(label)\n",
+    "        except (FileNotFoundError, UnidentifiedImageError):\n",
+    "            continue  # Skip images that cannot be opened\n",
+    "\n",
+    "    # Get predictions for the batch of images\n",
+    "    predictions = clip_pipeline(images=batch_images, candidate_labels=valid_labels)\n",
+    "    predicted_labels = [pred[0]['label'] for pred in predictions]  # Top prediction\n",
+    "\n",
+    "    all_predictions.extend(predicted_labels)\n",
+    "    all_true_labels.extend(valid_labels)\n",
+    "\n",
+    "correct_predictions = sum([true == pred for true, pred in zip(all_true_labels, all_predictions)])\n",
+    "accuracy = correct_predictions / len(all_true_labels)\n",
+    "print(f\"Accuracy for model {i}: {accuracy * 100:.2f}%\\n\")\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e179576d",
+   "metadata": {},
+   "source": [
+    "## random testing result"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e89c9a67",
+   "metadata": {},
+   "source": [
+    "### original model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "dbfe9e0f",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "meta\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "55ac2f92e769452997f907f0301ddf14",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Fetching 14 files:   0%|          | 0/14 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "from accelerate.utils import BnbQuantizationConfig\n",
+    "bnb_quantization_config = BnbQuantizationConfig(load_in_8bit=True)\n",
+    "from accelerate.utils import load_and_quantize_model\n",
+    "from transformers import CLIPProcessor, CLIPModel, AutoModel\n",
+    "from accelerate import init_empty_weights\n",
+    "from transformers import AutoTokenizer, AutoConfig, AutoModel,AutoModelForZeroShotImageClassification\n",
+    "config = AutoConfig.from_pretrained(\"laion/CLIP-ViT-B-32-laion2B-s34B-b79K\")\n",
+    "with init_empty_weights():\n",
+    "    empty_model = AutoModelForZeroShotImageClassification.from_config(config)\n",
+    "    #CLIP(config=AutoConfig.from_pretrained(\"openai/clip-vit-large-patch14\"))\n",
+    "\n",
+    "# Print the device of the weightless skeleton model (expected: meta)\n",
+    "print(empty_model.device)\n",
+    "model = AutoModel.from_pretrained(\"laion/CLIP-ViT-B-32-laion2B-s34B-b79K\")\n",
+    "from huggingface_hub import snapshot_download\n",
+    "weights_location = snapshot_download(repo_id=\"laion/CLIP-ViT-B-32-laion2B-s34B-b79K\")\n",
+    "quantized_model = load_and_quantize_model(empty_model, weights_location=weights_location, \n",
+    "                                          bnb_quantization_config=bnb_quantization_config, device_map = \"auto\")\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "49901e70",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration. Please open a PR/issue to update `preprocessor_config.json` to use `image_processor_type` instead of `feature_extractor_type`. This warning will be removed in v4.40.\n",
+      "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/transformers/pipelines/base.py:1123: UserWarning: You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset\n",
+      "  warnings.warn(\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Accuracy for model 9: 85.71%\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "import os\n",
+    "import csv\n",
+    "from transformers import CLIPProcessor, CLIPModel, pipeline, CLIPImageProcessor\n",
+    "from PIL import Image\n",
+    "from transformers import AutoTokenizer, AutoConfig, AutoModel,AutoImageProcessor\n",
+    "import matplotlib.pyplot as plt\n",
+    "import time\n",
+    "import json\n",
+    "\n",
+    "# Load the JSON data\n",
+    "with open('val_random.json', 'r') as f:\n",
+    "    data = [json.loads(line) for line in f]\n",
+    "\n",
+    "# Extract image paths and labels\n",
+    "image_paths = [item['image'] for item in data]\n",
+    "labels = [item['caption'] for item in data]\n",
+    "\n",
+    "repo_id =  \"laion/CLIP-ViT-B-32-laion2B-s34B-b79K\"\n",
+    "\n",
+    "image_processor = AutoImageProcessor.from_pretrained(\n",
+    "    repo_id\n",
+    ")\n",
+    "tokenizer = AutoTokenizer.from_pretrained(repo_id, \n",
+    "                                          config=AutoConfig.from_pretrained(repo_id))\n",
+    "model = quantized_model\n",
+    "clip_pipeline = pipeline(model=model,task=\"zero-shot-image-classification\", tokenizer=tokenizer,\n",
+    "                    image_processor=image_processor, config=AutoConfig.from_pretrained(repo_id),\n",
+    "                        device_map=\"auto\", model_kwargs={\"load_in_8bit\": True})\n",
+    "\n",
+    "def chunks(lst, n):\n",
+    "    \"\"\"Yield successive n-sized chunks from lst.\"\"\"\n",
+    "    for i in range(0, len(lst), n):\n",
+    "        yield lst[i:i + n]\n",
+    "\n",
+    "BATCH_SIZE = 256  # Adjust based on your available memory\n",
+    "\n",
+    "all_predictions = []\n",
+    "all_true_labels = []\n",
+    "time_eval = []\n",
+    "\n",
+    "for batch_paths, batch_labels in zip(chunks(image_paths, BATCH_SIZE), chunks(labels, BATCH_SIZE)):\n",
+    "    batch_images = []\n",
+    "    valid_labels = []\n",
+    "    for path, label in zip(batch_paths, batch_labels):\n",
+    "        try:\n",
+    "            batch_images.append(Image.open(path))\n",
+    "            valid_labels.append(label)\n",
+    "        except (FileNotFoundError, UnidentifiedImageError):\n",
+    "            continue  # Skip images that cannot be opened\n",
+    "\n",
+    "    # Get predictions for the batch of images\n",
+    "    predictions = clip_pipeline(images=batch_images, candidate_labels=valid_labels)\n",
+    "    predicted_labels = [pred[0]['label'] for pred in predictions]  # Top prediction\n",
+    "\n",
+    "    all_predictions.extend(predicted_labels)\n",
+    "    all_true_labels.extend(valid_labels)\n",
+    "\n",
+    "correct_predictions = sum([true == pred for true, pred in zip(all_true_labels, all_predictions)])\n",
+    "accuracy = correct_predictions / len(all_true_labels)\n",
+    "print(f\"Accuracy for model {i}: {accuracy * 100:.2f}%\\n\")\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "adf46546",
+   "metadata": {},
+   "source": [
+    "### our trained models"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "b68282a6",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_random80_vtrain1...\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration. Please open a PR/issue to update `preprocessor_config.json` to use `image_processor_type` instead of `feature_extractor_type`. This warning will be removed in v4.40.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Accuracy for model 1: 97.06%\n",
+      "\n",
+      "Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_random80_vtrain2...\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration. Please open a PR/issue to update `preprocessor_config.json` to use `image_processor_type` instead of `feature_extractor_type`. This warning will be removed in v4.40.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Accuracy for model 2: 97.15%\n",
+      "\n",
+      "Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_random80_vtrain3...\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration. Please open a PR/issue to update `preprocessor_config.json` to use `image_processor_type` instead of `feature_extractor_type`. This warning will be removed in v4.40.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Accuracy for model 3: 98.11%\n",
+      "\n",
+      "Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_random80_vtrain4...\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration. Please open a PR/issue to update `preprocessor_config.json` to use `image_processor_type` instead of `feature_extractor_type`. This warning will be removed in v4.40.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Accuracy for model 4: 96.90%\n",
+      "\n",
+      "Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_random80_vtrain5...\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration. Please open a PR/issue to update `preprocessor_config.json` to use `image_processor_type` instead of `feature_extractor_type`. This warning will be removed in v4.40.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Accuracy for model 5: 96.60%\n",
+      "\n",
+      "Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_random80_vtrain6...\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration. Please open a PR/issue to update `preprocessor_config.json` to use `image_processor_type` instead of `feature_extractor_type`. This warning will be removed in v4.40.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Accuracy for model 6: 96.27%\n",
+      "\n",
+      "Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_random80_vtrain7...\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration. Please open a PR/issue to update `preprocessor_config.json` to use `image_processor_type` instead of `feature_extractor_type`. This warning will be removed in v4.40.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Accuracy for model 7: 95.59%\n",
+      "\n",
+      "Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_random80_vtrain8...\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration. Please open a PR/issue to update `preprocessor_config.json` to use `image_processor_type` instead of `feature_extractor_type`. This warning will be removed in v4.40.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Accuracy for model 8: 94.62%\n",
+      "\n",
+      "Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_random80_vtrain9...\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration. Please open a PR/issue to update `preprocessor_config.json` to use `image_processor_type` instead of `feature_extractor_type`. This warning will be removed in v4.40.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Accuracy for model 9: 92.34%\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "from accelerate.utils import BnbQuantizationConfig, load_and_quantize_model\n",
+    "from transformers import CLIPProcessor, CLIPModel, AutoTokenizer, AutoConfig, AutoModelForZeroShotImageClassification, pipeline, AutoImageProcessor\n",
+    "from PIL import Image, UnidentifiedImageError\n",
+    "import json\n",
+    "\n",
+    "# Load the JSON data for evaluation\n",
+    "with open('val_random.json', 'r') as f:\n",
+    "    data = [json.loads(line) for line in f]\n",
+    "\n",
+    "# Extract image paths and labels\n",
+    "image_paths = [item['image'] for item in data]\n",
+    "labels = [item['caption'] for item in data]\n",
+    "\n",
+    "BATCH_SIZE = 256  # Adjust based on your available memory\n",
+    "\n",
+    "# Define a function to process images in chunks\n",
+    "def chunks(lst, n):\n",
+    "    \"\"\"Yield successive n-sized chunks from lst.\"\"\"\n",
+    "    for i in range(0, len(lst), n):\n",
+    "        yield lst[i:i + n]\n",
+    "\n",
+    "# Base configuration\n",
+    "repo_id = \"laion/CLIP-ViT-B-32-laion2B-s34B-b79K\"\n",
+    "bnb_quantization_config = BnbQuantizationConfig(load_in_8bit=True)\n",
+    "\n",
+    "# Iterate over the 9 finetuned models (vtrain1 through vtrain9)\n",
+    "for i in range(1, 10):  # range(1, 10) yields i = 1..9\n",
+    "    dir = f\"./workspace/output/laion-finetuned_v5e7_epoch10_random80_vtrain{i}\"\n",
+    "    print(f\"Evaluating with model from {dir}...\")\n",
+    "    \n",
+    "    # Load and quantize model\n",
+    "    with init_empty_weights():\n",
+    "        empty_model = AutoModelForZeroShotImageClassification.from_config(AutoConfig.from_pretrained(repo_id))\n",
+    "\n",
+    "    # Load the model from the directory\n",
+    "    weights_location = f\"{dir}/model.safetensors\"\n",
+    "    quantized_model = load_and_quantize_model(empty_model, weights_location=weights_location, bnb_quantization_config=bnb_quantization_config, device_map=\"auto\")\n",
+    "\n",
+    "    # Initialize tokenizer and processor\n",
+    "    tokenizer = AutoTokenizer.from_pretrained(repo_id)\n",
+    "    image_processor = AutoImageProcessor.from_pretrained(repo_id)\n",
+    "    \n",
+    "    # Initialize the pipeline without the device argument\n",
+    "    clip_pipeline = pipeline(\n",
+    "        model=quantized_model,\n",
+    "        task=\"zero-shot-image-classification\",\n",
+    "        tokenizer=tokenizer,\n",
+    "        image_processor=image_processor,\n",
+    "        config=AutoConfig.from_pretrained(repo_id),\n",
+    "        model_kwargs={\"load_in_8bit\": True}\n",
+    "    )\n",
+    "\n",
+    "    all_predictions = []\n",
+    "    all_true_labels = []\n",
+    "\n",
+    "    # Process images in chunks and evaluate\n",
+    "    for batch_paths, batch_labels in zip(chunks(image_paths, BATCH_SIZE), chunks(labels, BATCH_SIZE)):\n",
+    "        batch_images = []\n",
+    "        valid_labels = []\n",
+    "        for path, label in zip(batch_paths, batch_labels):\n",
+    "            try:\n",
+    "                # Open image and append to batch_images\n",
+    "                with Image.open(path) as img:\n",
+    "                    batch_images.append(img.convert(\"RGB\"))\n",
+    "                valid_labels.append(label)\n",
+    "            except (FileNotFoundError, UnidentifiedImageError):\n",
+    "                # Skip images that cannot be opened\n",
+    "                continue\n",
+    "\n",
+    "        # Get predictions for the batch of images\n",
+    "        try:\n",
+    "            predictions = clip_pipeline(images=batch_images, candidate_labels=valid_labels)\n",
+    "            predicted_labels = [pred[0]['label'] for pred in predictions]  # Top prediction\n",
+    "            all_predictions.extend(predicted_labels)\n",
+    "            all_true_labels.extend(valid_labels)\n",
+    "        except Exception as e:\n",
+    "            print(f\"An error occurred during prediction: {e}\")\n",
+    "            continue\n",
+    "\n",
+    "    # Calculate the accuracy for the current model\n",
+    "    correct_predictions = sum([true == pred for true, pred in zip(all_true_labels, all_predictions)])\n",
+    "    accuracy = correct_predictions / len(all_true_labels) if all_true_labels else 0\n",
+    "    print(f\"Accuracy for model {i}: {accuracy * 100:.2f}%\\n\")\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f9e1e9d4",
+   "metadata": {},
+   "source": [
+    "## kitchen and bathroom result"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "ed6206d6",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "meta\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "c9a9d50e660a48e6b7c76abdb72479cf",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Fetching 14 files:   0%|          | 0/14 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration. Please open a PR/issue to update `preprocessor_config.json` to use `image_processor_type` instead of `feature_extractor_type`. This warning will be removed in v4.40.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Accuracy for model 0: 95.08%\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "#original model\n",
+    "from accelerate.utils import BnbQuantizationConfig\n",
+    "bnb_quantization_config = BnbQuantizationConfig(load_in_8bit=True)\n",
+    "from accelerate.utils import load_and_quantize_model\n",
+    "from transformers import CLIPProcessor, CLIPModel, AutoModel\n",
+    "from accelerate import init_empty_weights\n",
+    "from transformers import AutoTokenizer, AutoConfig, AutoModel,AutoModelForZeroShotImageClassification\n",
+    "config = AutoConfig.from_pretrained(\"laion/CLIP-ViT-B-32-laion2B-s34B-b79K\")\n",
+    "with init_empty_weights():\n",
+    "    empty_model = AutoModelForZeroShotImageClassification.from_config(config)\n",
+    "    #CLIP(config=AutoConfig.from_pretrained(\"openai/clip-vit-large-patch14\"))\n",
+    "\n",
+    "# Show which device the empty model is on (the recorded output is 'meta'; nothing is moved to a GPU here)\n",
+    "print(empty_model.device)\n",
+    "model = AutoModel.from_pretrained(\"laion/CLIP-ViT-B-32-laion2B-s34B-b79K\")\n",
+    "from huggingface_hub import snapshot_download\n",
+    "weights_location = snapshot_download(repo_id=\"laion/CLIP-ViT-B-32-laion2B-s34B-b79K\")\n",
+    "quantized_model = load_and_quantize_model(empty_model, weights_location=weights_location, \n",
+    "                                          bnb_quantization_config=bnb_quantization_config, device_map = \"auto\")\n",
+    "\n",
+    "import os\n",
+    "import csv\n",
+    "from transformers import CLIPProcessor, CLIPModel, pipeline, CLIPImageProcessor\n",
+    "from PIL import Image\n",
+    "from transformers import AutoTokenizer, AutoConfig, AutoModel,AutoImageProcessor\n",
+    "import matplotlib.pyplot as plt\n",
+    "import time\n",
+    "import json\n",
+    "\n",
+    "# Load the JSON data\n",
+    "with open('kb_val.json', 'r') as f:\n",
+    "    data = [json.loads(line) for line in f]\n",
+    "\n",
+    "# Extract image paths and labels\n",
+    "image_paths = [item['image'] for item in data]\n",
+    "labels = [item['caption'] for item in data]\n",
+    "\n",
+    "repo_id =  \"laion/CLIP-ViT-B-32-laion2B-s34B-b79K\"\n",
+    "\n",
+    "image_processor = AutoImageProcessor.from_pretrained(\n",
+    "    repo_id\n",
+    ")\n",
+    "tokenizer = AutoTokenizer.from_pretrained(repo_id, \n",
+    "                                          config=AutoConfig.from_pretrained(repo_id))\n",
+    "model = quantized_model\n",
+    "clip_pipeline = pipeline(model=model,task=\"zero-shot-image-classification\", tokenizer=tokenizer,\n",
+    "                    image_processor=image_processor, config=AutoConfig.from_pretrained(repo_id),\n",
+    "                        device_map=\"auto\", model_kwargs={\"load_in_8bit\": True})\n",
+    "\n",
+    "def chunks(lst, n):\n",
+    "    \"\"\"Yield successive n-sized chunks from lst.\"\"\"\n",
+    "    for i in range(0, len(lst), n):\n",
+    "        yield lst[i:i + n]\n",
+    "\n",
+    "BATCH_SIZE = 256  # Adjust based on your available memory\n",
+    "\n",
+    "all_predictions = []\n",
+    "all_true_labels = []\n",
+    "time_eval = []\n",
+    "\n",
+    "for batch_paths, batch_labels in zip(chunks(image_paths, BATCH_SIZE), chunks(labels, BATCH_SIZE)):\n",
+    "    batch_images = []\n",
+    "    valid_labels = []\n",
+    "    for path, label in zip(batch_paths, batch_labels):\n",
+    "        try:\n",
+    "            batch_images.append(Image.open(path))\n",
+    "            valid_labels.append(label)\n",
+    "        except (FileNotFoundError, UnidentifiedImageError):\n",
+    "            continue  # Skip images that cannot be opened\n",
+    "\n",
+    "    # Get predictions for the batch of images\n",
+    "    predictions = clip_pipeline(images=batch_images, candidate_labels=valid_labels)\n",
+    "    predicted_labels = [pred[0]['label'] for pred in predictions]  # Top prediction\n",
+    "\n",
+    "    all_predictions.extend(predicted_labels)\n",
+    "    all_true_labels.extend(valid_labels)\n",
+    "\n",
+    "correct_predictions = sum([true == pred for true, pred in zip(all_true_labels, all_predictions)])\n",
+    "accuracy = correct_predictions / len(all_true_labels)\n",
+    "print(f\"Accuracy for model {i}: {accuracy * 100:.2f}%\\n\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "7d162fe9",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_random80_vtrain1...\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration. Please open a PR/issue to update `preprocessor_config.json` to use `image_processor_type` instead of `feature_extractor_type`. This warning will be removed in v4.40.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Accuracy for model 1: 97.07%\n",
+      "\n",
+      "Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_random80_vtrain2...\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration. Please open a PR/issue to update `preprocessor_config.json` to use `image_processor_type` instead of `feature_extractor_type`. This warning will be removed in v4.40.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Accuracy for model 2: 96.94%\n",
+      "\n",
+      "Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_random80_vtrain3...\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration. Please open a PR/issue to update `preprocessor_config.json` to use `image_processor_type` instead of `feature_extractor_type`. This warning will be removed in v4.40.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Accuracy for model 3: 96.94%\n",
+      "\n",
+      "Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_random80_vtrain4...\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration. Please open a PR/issue to update `preprocessor_config.json` to use `image_processor_type` instead of `feature_extractor_type`. This warning will be removed in v4.40.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Accuracy for model 4: 96.94%\n",
+      "\n",
+      "Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_random80_vtrain5...\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration. Please open a PR/issue to update `preprocessor_config.json` to use `image_processor_type` instead of `feature_extractor_type`. This warning will be removed in v4.40.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Accuracy for model 5: 96.81%\n",
+      "\n",
+      "Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_random80_vtrain6...\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration. Please open a PR/issue to update `preprocessor_config.json` to use `image_processor_type` instead of `feature_extractor_type`. This warning will be removed in v4.40.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Accuracy for model 6: 96.54%\n",
+      "\n",
+      "Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_random80_vtrain7...\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration. Please open a PR/issue to update `preprocessor_config.json` to use `image_processor_type` instead of `feature_extractor_type`. This warning will be removed in v4.40.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Accuracy for model 7: 96.01%\n",
+      "\n",
+      "Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_random80_vtrain8...\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration. Please open a PR/issue to update `preprocessor_config.json` to use `image_processor_type` instead of `feature_extractor_type`. This warning will be removed in v4.40.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Accuracy for model 8: 96.01%\n",
+      "\n",
+      "Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_random80_vtrain9...\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration. Please open a PR/issue to update `preprocessor_config.json` to use `image_processor_type` instead of `feature_extractor_type`. This warning will be removed in v4.40.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Accuracy for model 9: 95.74%\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "from accelerate.utils import BnbQuantizationConfig, load_and_quantize_model\n",
+    "from transformers import CLIPProcessor, CLIPModel, AutoTokenizer, AutoConfig, AutoModelForZeroShotImageClassification, pipeline, AutoImageProcessor\n",
+    "from PIL import Image, UnidentifiedImageError\n",
+    "import json\n",
+    "\n",
+    "# Load the JSON data for evaluation\n",
+    "with open('kb_val.json', 'r') as f:\n",
+    "    data = [json.loads(line) for line in f]\n",
+    "\n",
+    "# Extract image paths and labels\n",
+    "image_paths = [item['image'] for item in data]\n",
+    "labels = [item['caption'] for item in data]\n",
+    "\n",
+    "BATCH_SIZE = 256  # Adjust based on your available memory\n",
+    "\n",
+    "# Define a function to process images in chunks\n",
+    "def chunks(lst, n):\n",
+    "    \"\"\"Yield successive n-sized chunks from lst.\"\"\"\n",
+    "    for i in range(0, len(lst), n):\n",
+    "        yield lst[i:i + n]\n",
+    "\n",
+    "# Base configuration\n",
+    "repo_id = \"laion/CLIP-ViT-B-32-laion2B-s34B-b79K\"\n",
+    "bnb_quantization_config = BnbQuantizationConfig(load_in_8bit=True)\n",
+    "\n",
+    "# Iterate over the 9 finetuned models (vtrain1 through vtrain9)\n",
+    "for i in range(1, 10):  # range(1, 10) yields i = 1..9\n",
+    "    dir = f\"./workspace/output/laion-finetuned_v5e7_epoch10_random80_vtrain{i}\"\n",
+    "    print(f\"Evaluating with model from {dir}...\")\n",
+    "    \n",
+    "    # Load and quantize model\n",
+    "    with init_empty_weights():\n",
+    "        empty_model = AutoModelForZeroShotImageClassification.from_config(AutoConfig.from_pretrained(repo_id))\n",
+    "\n",
+    "    # Load the model from the directory\n",
+    "    weights_location = f\"{dir}/model.safetensors\"\n",
+    "    quantized_model = load_and_quantize_model(empty_model, weights_location=weights_location, bnb_quantization_config=bnb_quantization_config, device_map=\"auto\")\n",
+    "\n",
+    "    # Initialize tokenizer and processor\n",
+    "    tokenizer = AutoTokenizer.from_pretrained(repo_id)\n",
+    "    image_processor = AutoImageProcessor.from_pretrained(repo_id)\n",
+    "    \n",
+    "    # Initialize the pipeline without the device argument\n",
+    "    clip_pipeline = pipeline(\n",
+    "        model=quantized_model,\n",
+    "        task=\"zero-shot-image-classification\",\n",
+    "        tokenizer=tokenizer,\n",
+    "        image_processor=image_processor,\n",
+    "        config=AutoConfig.from_pretrained(repo_id),\n",
+    "        model_kwargs={\"load_in_8bit\": True}\n",
+    "    )\n",
+    "\n",
+    "    all_predictions = []\n",
+    "    all_true_labels = []\n",
+    "\n",
+    "    # Process images in chunks and evaluate\n",
+    "    for batch_paths, batch_labels in zip(chunks(image_paths, BATCH_SIZE), chunks(labels, BATCH_SIZE)):\n",
+    "        batch_images = []\n",
+    "        valid_labels = []\n",
+    "        for path, label in zip(batch_paths, batch_labels):\n",
+    "            try:\n",
+    "                # Open image and append to batch_images\n",
+    "                with Image.open(path) as img:\n",
+    "                    batch_images.append(img.convert(\"RGB\"))\n",
+    "                valid_labels.append(label)\n",
+    "            except (FileNotFoundError, UnidentifiedImageError):\n",
+    "                # Skip images that cannot be opened\n",
+    "                continue\n",
+    "\n",
+    "        # Get predictions for the batch of images\n",
+    "        try:\n",
+    "            predictions = clip_pipeline(images=batch_images, candidate_labels=valid_labels)\n",
+    "            predicted_labels = [pred[0]['label'] for pred in predictions]  # Top prediction\n",
+    "            all_predictions.extend(predicted_labels)\n",
+    "            all_true_labels.extend(valid_labels)\n",
+    "        except Exception as e:\n",
+    "            print(f\"An error occurred during prediction: {e}\")\n",
+    "            continue\n",
+    "\n",
+    "    # Calculate the accuracy for the current model\n",
+    "    correct_predictions = sum([true == pred for true, pred in zip(all_true_labels, all_predictions)])\n",
+    "    accuracy = correct_predictions / len(all_true_labels) if all_true_labels else 0\n",
+    "    print(f\"Accuracy for model {i}: {accuracy * 100:.2f}%\\n\")\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "53455e92",
+   "metadata": {},
+   "source": [
+    "# quantize 4 bit"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "id": "cade5dcc",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "meta\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "611e882b72c84264aeb9710646e90ea9",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Fetching 14 files:   0%|          | 0/14 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "\n",
+    "from accelerate.utils import BnbQuantizationConfig\n",
+    "bnb_quantization_config = BnbQuantizationConfig(load_in_8bit=True)\n",
+    "from accelerate.utils import load_and_quantize_model\n",
+    "from transformers import CLIPProcessor, CLIPModel, AutoModel\n",
+    "from accelerate import init_empty_weights\n",
+    "from transformers import AutoTokenizer, AutoConfig, AutoModel,AutoModelForZeroShotImageClassification\n",
+    "config = AutoConfig.from_pretrained(\"laion/CLIP-ViT-B-32-laion2B-s34B-b79K\")\n",
+    "with init_empty_weights():\n",
+    "    empty_model = AutoModelForZeroShotImageClassification.from_config(config)\n",
+    "    #CLIP(config=AutoConfig.from_pretrained(\"openai/clip-vit-large-patch14\"))\n",
+    "\n",
+    "# Show which device the empty model is on (the recorded output is 'meta'; nothing is moved to a GPU here)\n",
+    "print(empty_model.device)\n",
+    "model = AutoModel.from_pretrained(\"laion/CLIP-ViT-B-32-laion2B-s34B-b79K\")\n",
+    "from huggingface_hub import snapshot_download\n",
+    "weights_location = snapshot_download(repo_id=\"laion/CLIP-ViT-B-32-laion2B-s34B-b79K\")\n",
+    "quantized_model = load_and_quantize_model(empty_model, weights_location=weights_location, \n",
+    "                                          bnb_quantization_config=bnb_quantization_config, device_map = \"auto\")\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "id": "57d05d7b",
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "ValueError",
+     "evalue": "Unrecognized configuration class <class 'transformers.models.clip.configuration_clip.CLIPConfig'> for this kind of AutoModel: AutoModelForCausalLM.\nModel type should be one of BartConfig, BertConfig, BertGenerationConfig, BigBirdConfig, BigBirdPegasusConfig, BioGptConfig, BlenderbotConfig, BlenderbotSmallConfig, BloomConfig, CamembertConfig, LlamaConfig, CodeGenConfig, CpmAntConfig, CTRLConfig, Data2VecTextConfig, ElectraConfig, ErnieConfig, FalconConfig, FuyuConfig, GitConfig, GPT2Config, GPT2Config, GPTBigCodeConfig, GPTNeoConfig, GPTNeoXConfig, GPTNeoXJapaneseConfig, GPTJConfig, LlamaConfig, MarianConfig, MBartConfig, MegaConfig, MegatronBertConfig, MistralConfig, MixtralConfig, MptConfig, MusicgenConfig, MvpConfig, OpenLlamaConfig, OpenAIGPTConfig, OPTConfig, PegasusConfig, PersimmonConfig, PhiConfig, PLBartConfig, ProphetNetConfig, QDQBertConfig, Qwen2Config, ReformerConfig, RemBertConfig, RobertaConfig, RobertaPreLayerNormConfig, RoCBertConfig, RoFormerConfig, RwkvConfig, Speech2Text2Config, TransfoXLConfig, TrOCRConfig, WhisperConfig, XGLMConfig, XLMConfig, XLMProphetNetConfig, XLMRobertaConfig, XLMRobertaXLConfig, XLNetConfig, XmodConfig.",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mValueError\u001b[0m                                Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn[26], line 8\u001b[0m\n\u001b[1;32m      6\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtransformers\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m AutoTokenizer, AutoConfig, AutoModel,AutoModelForZeroShotImageClassification\n\u001b[1;32m      7\u001b[0m tokenizer \u001b[38;5;241m=\u001b[39m AutoTokenizer\u001b[38;5;241m.\u001b[39mfrom_pretrained(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mlaion/CLIP-ViT-B-32-laion2B-s34B-b79K\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m----> 8\u001b[0m model \u001b[38;5;241m=\u001b[39m \u001b[43mAutoModelForCausalLM\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfrom_pretrained\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mlaion/CLIP-ViT-B-32-laion2B-s34B-b79K\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdevice_map\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mauto\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mload_in_4bit\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m)\u001b[49m\n",
+      "File \u001b[0;32m~/anaconda3/envs/huggingface/lib/python3.10/site-packages/transformers/models/auto/auto_factory.py:569\u001b[0m, in \u001b[0;36m_BaseAutoModelClass.from_pretrained\u001b[0;34m(cls, pretrained_model_name_or_path, *model_args, **kwargs)\u001b[0m\n\u001b[1;32m    565\u001b[0m     model_class \u001b[38;5;241m=\u001b[39m _get_model_class(config, \u001b[38;5;28mcls\u001b[39m\u001b[38;5;241m.\u001b[39m_model_mapping)\n\u001b[1;32m    566\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m model_class\u001b[38;5;241m.\u001b[39mfrom_pretrained(\n\u001b[1;32m    567\u001b[0m         pretrained_model_name_or_path, \u001b[38;5;241m*\u001b[39mmodel_args, config\u001b[38;5;241m=\u001b[39mconfig, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mhub_kwargs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs\n\u001b[1;32m    568\u001b[0m     )\n\u001b[0;32m--> 569\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m    570\u001b[0m     \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mUnrecognized configuration class \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mconfig\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__class__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m for this kind of AutoModel: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mcls\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m.\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m    571\u001b[0m     \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mModel type should be one of \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m, \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;241m.\u001b[39mjoin(c\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mfor\u001b[39;00m\u001b[38;5;250m 
\u001b[39mc\u001b[38;5;250m \u001b[39m\u001b[38;5;129;01min\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28mcls\u001b[39m\u001b[38;5;241m.\u001b[39m_model_mapping\u001b[38;5;241m.\u001b[39mkeys())\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m    572\u001b[0m )\n",
+      "\u001b[0;31mValueError\u001b[0m: Unrecognized configuration class <class 'transformers.models.clip.configuration_clip.CLIPConfig'> for this kind of AutoModel: AutoModelForCausalLM.\nModel type should be one of BartConfig, BertConfig, BertGenerationConfig, BigBirdConfig, BigBirdPegasusConfig, BioGptConfig, BlenderbotConfig, BlenderbotSmallConfig, BloomConfig, CamembertConfig, LlamaConfig, CodeGenConfig, CpmAntConfig, CTRLConfig, Data2VecTextConfig, ElectraConfig, ErnieConfig, FalconConfig, FuyuConfig, GitConfig, GPT2Config, GPT2Config, GPTBigCodeConfig, GPTNeoConfig, GPTNeoXConfig, GPTNeoXJapaneseConfig, GPTJConfig, LlamaConfig, MarianConfig, MBartConfig, MegaConfig, MegatronBertConfig, MistralConfig, MixtralConfig, MptConfig, MusicgenConfig, MvpConfig, OpenLlamaConfig, OpenAIGPTConfig, OPTConfig, PegasusConfig, PersimmonConfig, PhiConfig, PLBartConfig, ProphetNetConfig, QDQBertConfig, Qwen2Config, ReformerConfig, RemBertConfig, RobertaConfig, RobertaPreLayerNormConfig, RoCBertConfig, RoFormerConfig, RwkvConfig, Speech2Text2Config, TransfoXLConfig, TrOCRConfig, WhisperConfig, XGLMConfig, XLMConfig, XLMProphetNetConfig, XLMRobertaConfig, XLMRobertaXLConfig, XLNetConfig, XmodConfig."
+     ]
+    }
+   ],
+   "source": [
+    "from accelerate.utils import BnbQuantizationConfig\n",
+    "bnb_quantization_config = BnbQuantizationConfig(load_in_8bit=True)\n",
+    "from accelerate.utils import load_and_quantize_model\n",
+    "from transformers import CLIPProcessor, CLIPModel, AutoModelForCausalLM\n",
+    "from accelerate import init_empty_weights\n",
+    "from transformers import AutoTokenizer, AutoConfig, AutoModel,AutoModelForZeroShotImageClassification\n",
+    "tokenizer = AutoTokenizer.from_pretrained(\"laion/CLIP-ViT-B-32-laion2B-s34B-b79K\")\n",
+    "model = AutoModelForCausalLM.from_pretrained(\"laion/CLIP-ViT-B-32-laion2B-s34B-b79K\", device_map=\"auto\", load_in_4bit=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "4f624f06",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "meta\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "d5d0038ac93240ac849c30801d427222",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Fetching 14 files:   0%|          | 0/14 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration. Please open a PR/issue to update `preprocessor_config.json` to use `image_processor_type` instead of `feature_extractor_type`. This warning will be removed in v4.40.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Accuracy for model 9: 88.67%\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "#original model\n",
+    "from accelerate.utils import BnbQuantizationConfig\n",
+    "from accelerate.utils import load_and_quantize_model\n",
+    "from transformers import CLIPProcessor, CLIPModel, AutoModel\n",
+    "from accelerate import init_empty_weights\n",
+    "from transformers import AutoTokenizer, AutoConfig, AutoModel,AutoModelForZeroShotImageClassification\n",
+    "config = AutoConfig.from_pretrained(\"laion/CLIP-ViT-B-32-laion2B-s34B-b79K\")\n",
+    "with init_empty_weights():\n",
+    "    empty_model = AutoModelForZeroShotImageClassification.from_config(config)\n",
+    "    #CLIP(config=AutoConfig.from_pretrained(\"openai/clip-vit-large-patch14\"))\n",
+    "\n",
+    "# Report the device of the weightless skeleton model (prints 'meta'; nothing is moved to a GPU here)\n",
+    "print(empty_model.device)\n",
+    "model = AutoModel.from_pretrained(\"laion/CLIP-ViT-B-32-laion2B-s34B-b79K\")\n",
+    "from huggingface_hub import snapshot_download\n",
+    "weights_location = snapshot_download(repo_id=\"laion/CLIP-ViT-B-32-laion2B-s34B-b79K\")\n",
+    "quantized_model = load_and_quantize_model(empty_model, weights_location=weights_location, \n",
+    "                                          bnb_quantization_config=bnb_quantization_config, device_map = \"auto\")\n",
+    "\n",
+    "import os\n",
+    "import csv\n",
+    "from transformers import CLIPProcessor, CLIPModel, pipeline, CLIPImageProcessor\n",
+    "from PIL import Image\n",
+    "from transformers import AutoTokenizer, AutoConfig, AutoModel,AutoImageProcessor\n",
+    "import matplotlib.pyplot as plt\n",
+    "import time\n",
+    "import json\n",
+    "\n",
+    "# Load the JSON data\n",
+    "with open('val_random.json', 'r') as f:\n",
+    "    data = [json.loads(line) for line in f]\n",
+    "\n",
+    "# Extract image paths and labels\n",
+    "image_paths = [item['image'] for item in data]\n",
+    "labels = [item['caption'] for item in data]\n",
+    "\n",
+    "repo_id =  \"laion/CLIP-ViT-B-32-laion2B-s34B-b79K\"\n",
+    "\n",
+    "image_processor = AutoImageProcessor.from_pretrained(\n",
+    "    repo_id\n",
+    ")\n",
+    "tokenizer = AutoTokenizer.from_pretrained(repo_id, \n",
+    "                                          config=AutoConfig.from_pretrained(repo_id))\n",
+    "model = quantized_model\n",
+    "clip_pipeline = pipeline(model=model,task=\"zero-shot-image-classification\", tokenizer=tokenizer,\n",
+    "                    image_processor=image_processor, config=AutoConfig.from_pretrained(repo_id),\n",
+    "                        device_map=\"auto\", model_kwargs={\"load_in_8bit\": True})\n",
+    "\n",
+    "def chunks(lst, n):\n",
+    "    \"\"\"Yield successive n-sized chunks from lst.\"\"\"\n",
+    "    for i in range(0, len(lst), n):\n",
+    "        yield lst[i:i + n]\n",
+    "\n",
+    "BATCH_SIZE = 256  # Adjust based on your available memory\n",
+    "\n",
+    "all_predictions = []\n",
+    "all_true_labels = []\n",
+    "time_eval = []\n",
+    "\n",
+    "for batch_paths, batch_labels in zip(chunks(image_paths, BATCH_SIZE), chunks(labels, BATCH_SIZE)):\n",
+    "    batch_images = []\n",
+    "    valid_labels = []\n",
+    "    for path, label in zip(batch_paths, batch_labels):\n",
+    "        try:\n",
+    "            batch_images.append(Image.open(path))\n",
+    "            valid_labels.append(label)\n",
+    "        except (FileNotFoundError, UnidentifiedImageError):\n",
+    "            continue  # Skip images that cannot be opened\n",
+    "\n",
+    "    # Get predictions for the batch of images\n",
+    "    predictions = clip_pipeline(images=batch_images, candidate_labels=valid_labels)\n",
+    "    predicted_labels = [pred[0]['label'] for pred in predictions]  # Top prediction\n",
+    "\n",
+    "    all_predictions.extend(predicted_labels)\n",
+    "    all_true_labels.extend(valid_labels)\n",
+    "\n",
+    "correct_predictions = sum([true == pred for true, pred in zip(all_true_labels, all_predictions)])\n",
+    "accuracy = correct_predictions / len(all_true_labels)\n",
+    "print(f\"Accuracy for model {i}: {accuracy * 100:.2f}%\\n\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "id": "5ae46848",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_random80_vtrain1...\n"
+     ]
+    },
+    {
+     "ename": "NotImplementedError",
+     "evalue": "Cannot copy out of meta tensor; no data!",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mNotImplementedError\u001b[0m                       Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn[22], line 36\u001b[0m\n\u001b[1;32m     34\u001b[0m \u001b[38;5;66;03m# Load the model from the directory\u001b[39;00m\n\u001b[1;32m     35\u001b[0m weights_location \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mdir\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/model.safetensors\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m---> 36\u001b[0m quantized_model \u001b[38;5;241m=\u001b[39m \u001b[43mload_and_quantize_model\u001b[49m\u001b[43m(\u001b[49m\u001b[43mempty_model\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mweights_location\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mweights_location\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mbnb_quantization_config\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mbnb_quantization_config\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdevice_map\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mauto\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m     38\u001b[0m \u001b[38;5;66;03m# Initialize tokenizer and processor\u001b[39;00m\n\u001b[1;32m     39\u001b[0m tokenizer \u001b[38;5;241m=\u001b[39m AutoTokenizer\u001b[38;5;241m.\u001b[39mfrom_pretrained(repo_id)\n",
+      "File \u001b[0;32m~/anaconda3/envs/huggingface/lib/python3.10/site-packages/accelerate/utils/bnb.py:189\u001b[0m, in \u001b[0;36mload_and_quantize_model\u001b[0;34m(model, bnb_quantization_config, weights_location, device_map, no_split_module_classes, max_memory, offload_folder, offload_state_dict)\u001b[0m\n\u001b[1;32m    185\u001b[0m     offload_state_dict \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[1;32m    187\u001b[0m offload \u001b[38;5;241m=\u001b[39m \u001b[38;5;28many\u001b[39m(x \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mlist\u001b[39m(device_map\u001b[38;5;241m.\u001b[39mvalues()) \u001b[38;5;28;01mfor\u001b[39;00m x \u001b[38;5;129;01min\u001b[39;00m [\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcpu\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdisk\u001b[39m\u001b[38;5;124m\"\u001b[39m])\n\u001b[0;32m--> 189\u001b[0m \u001b[43mload_checkpoint_in_model\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m    190\u001b[0m \u001b[43m    \u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    191\u001b[0m \u001b[43m    \u001b[49m\u001b[43mweights_location\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    192\u001b[0m \u001b[43m    \u001b[49m\u001b[43mdevice_map\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    193\u001b[0m \u001b[43m    \u001b[49m\u001b[43mdtype\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mbnb_quantization_config\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtorch_dtype\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    194\u001b[0m \u001b[43m    \u001b[49m\u001b[43moffload_folder\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moffload_folder\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    195\u001b[0m \u001b[43m    \u001b[49m\u001b[43moffload_state_dict\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moffload_state_dict\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    196\u001b[0m \u001b[43m    
\u001b[49m\u001b[43mkeep_in_fp32_modules\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mbnb_quantization_config\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mkeep_in_fp32_modules\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    197\u001b[0m \u001b[43m    \u001b[49m\u001b[43moffload_8bit_bnb\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mload_in_8bit\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01mand\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43moffload\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    198\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    199\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m dispatch_model(model, device_map\u001b[38;5;241m=\u001b[39mdevice_map, offload_dir\u001b[38;5;241m=\u001b[39moffload_folder)\n",
+      "File \u001b[0;32m~/anaconda3/envs/huggingface/lib/python3.10/site-packages/accelerate/utils/modeling.py:1336\u001b[0m, in \u001b[0;36mload_checkpoint_in_model\u001b[0;34m(model, checkpoint, device_map, offload_folder, dtype, offload_state_dict, offload_buffers, keep_in_fp32_modules, offload_8bit_bnb)\u001b[0m\n\u001b[1;32m   1334\u001b[0m                 offload_weight(param, param_name, state_dict_folder, index\u001b[38;5;241m=\u001b[39mstate_dict_index)\n\u001b[1;32m   1335\u001b[0m         \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1336\u001b[0m             \u001b[43mset_module_tensor_to_device\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m   1337\u001b[0m \u001b[43m                \u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1338\u001b[0m \u001b[43m                \u001b[49m\u001b[43mparam_name\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1339\u001b[0m \u001b[43m                \u001b[49m\u001b[43mparam_device\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1340\u001b[0m \u001b[43m                \u001b[49m\u001b[43mvalue\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mparam\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1341\u001b[0m \u001b[43m                \u001b[49m\u001b[43mdtype\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mnew_dtype\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1342\u001b[0m \u001b[43m                \u001b[49m\u001b[43mfp16_statistics\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfp16_statistics\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1343\u001b[0m \u001b[43m            \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   1345\u001b[0m \u001b[38;5;66;03m# Force Python to clean up.\u001b[39;00m\n\u001b[1;32m   1346\u001b[0m \u001b[38;5;28;01mdel\u001b[39;00m checkpoint\n",
+      "File \u001b[0;32m~/anaconda3/envs/huggingface/lib/python3.10/site-packages/accelerate/utils/modeling.py:344\u001b[0m, in \u001b[0;36mset_module_tensor_to_device\u001b[0;34m(module, tensor_name, device, value, dtype, fp16_statistics)\u001b[0m\n\u001b[1;32m    342\u001b[0m             device_index \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39mdevice(device)\u001b[38;5;241m.\u001b[39mindex \u001b[38;5;28;01mif\u001b[39;00m torch\u001b[38;5;241m.\u001b[39mdevice(device)\u001b[38;5;241m.\u001b[39mtype \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcuda\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m    343\u001b[0m             \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mgetattr\u001b[39m(module\u001b[38;5;241m.\u001b[39mweight, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mquant_state\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m) \u001b[38;5;129;01mand\u001b[39;00m device_index \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m--> 344\u001b[0m                 module\u001b[38;5;241m.\u001b[39mweight \u001b[38;5;241m=\u001b[39m \u001b[43mmodule\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mweight\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcuda\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdevice_index\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    345\u001b[0m \u001b[38;5;66;03m# clean pre and post foward hook\u001b[39;00m\n\u001b[1;32m    346\u001b[0m torch\u001b[38;5;241m.\u001b[39mcuda\u001b[38;5;241m.\u001b[39mempty_cache()\n",
+      "File \u001b[0;32m~/anaconda3/envs/huggingface/lib/python3.10/site-packages/bitsandbytes/nn/modules.py:168\u001b[0m, in \u001b[0;36mParams4bit.cuda\u001b[0;34m(self, device)\u001b[0m\n\u001b[1;32m    167\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mcuda\u001b[39m(\u001b[38;5;28mself\u001b[39m, device):\n\u001b[0;32m--> 168\u001b[0m     w \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdata\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcontiguous\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mhalf\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcuda\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdevice\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    169\u001b[0m     w_4bit, quant_state \u001b[38;5;241m=\u001b[39m bnb\u001b[38;5;241m.\u001b[39mfunctional\u001b[38;5;241m.\u001b[39mquantize_4bit(w, blocksize\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mblocksize, compress_statistics\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcompress_statistics, quant_type\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mquant_type)\n\u001b[1;32m    170\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdata \u001b[38;5;241m=\u001b[39m w_4bit\n",
+      "\u001b[0;31mNotImplementedError\u001b[0m: Cannot copy out of meta tensor; no data!"
+     ]
+    }
+   ],
+   "source": [
+    "from accelerate.utils import BnbQuantizationConfig, load_and_quantize_model\n",
+    "from transformers import CLIPProcessor, CLIPModel, AutoTokenizer, AutoConfig, AutoModelForZeroShotImageClassification, pipeline, AutoImageProcessor\n",
+    "from PIL import Image, UnidentifiedImageError\n",
+    "import json\n",
+    "\n",
+    "# Load the JSON data for evaluation\n",
+    "with open('val_random.json', 'r') as f:\n",
+    "    data = [json.loads(line) for line in f]\n",
+    "\n",
+    "# Extract image paths and labels\n",
+    "image_paths = [item['image'] for item in data]\n",
+    "labels = [item['caption'] for item in data]\n",
+    "\n",
+    "BATCH_SIZE = 256  # Adjust based on your available memory\n",
+    "\n",
+    "# Define a function to process images in chunks\n",
+    "def chunks(lst, n):\n",
+    "    \"\"\"Yield successive n-sized chunks from lst.\"\"\"\n",
+    "    for i in range(0, len(lst), n):\n",
+    "        yield lst[i:i + n]\n",
+    "\n",
+    "# Base configuration\n",
+    "repo_id = \"laion/CLIP-ViT-B-32-laion2B-s34B-b79K\"\n",
+    "\n",
+    "# Iterate over the 9 finetuned checkpoints (vtrain1 through vtrain9)\n",
+    "for i in range(1, 10):  # range(1, 10) yields 1..9; raise the upper bound if more checkpoints exist\n",
+    "    dir = f\"./workspace/output/laion-finetuned_v5e7_epoch10_random80_vtrain{i}\"\n",
+    "    print(f\"Evaluating with model from {dir}...\")\n",
+    "    \n",
+    "    # Load and quantize model\n",
+    "    with init_empty_weights():\n",
+    "        empty_model = AutoModelForZeroShotImageClassification.from_config(AutoConfig.from_pretrained(repo_id))\n",
+    "\n",
+    "    # Load the model from the directory\n",
+    "    weights_location = f\"{dir}/model.safetensors\"\n",
+    "    quantized_model = load_and_quantize_model(empty_model, weights_location=weights_location, bnb_quantization_config=bnb_quantization_config, device_map=\"auto\")\n",
+    "\n",
+    "    # Initialize tokenizer and processor\n",
+    "    tokenizer = AutoTokenizer.from_pretrained(repo_id)\n",
+    "    image_processor = AutoImageProcessor.from_pretrained(repo_id)\n",
+    "    \n",
+    "    # Initialize the pipeline without the device argument\n",
+    "    clip_pipeline = pipeline(\n",
+    "        model=quantized_model,\n",
+    "        task=\"zero-shot-image-classification\",\n",
+    "        tokenizer=tokenizer,\n",
+    "        image_processor=image_processor,\n",
+    "        config=AutoConfig.from_pretrained(repo_id),\n",
+    "        model_kwargs={\"load_in_8bit\": True}\n",
+    "    )\n",
+    "\n",
+    "    all_predictions = []\n",
+    "    all_true_labels = []\n",
+    "\n",
+    "    # Process images in chunks and evaluate\n",
+    "    for batch_paths, batch_labels in zip(chunks(image_paths, BATCH_SIZE), chunks(labels, BATCH_SIZE)):\n",
+    "        batch_images = []\n",
+    "        valid_labels = []\n",
+    "        for path, label in zip(batch_paths, batch_labels):\n",
+    "            try:\n",
+    "                # Open image and append to batch_images\n",
+    "                with Image.open(path) as img:\n",
+    "                    batch_images.append(img.convert(\"RGB\"))\n",
+    "                valid_labels.append(label)\n",
+    "            except (FileNotFoundError, UnidentifiedImageError):\n",
+    "                # Skip images that cannot be opened\n",
+    "                continue\n",
+    "\n",
+    "        # Get predictions for the batch of images\n",
+    "        try:\n",
+    "            predictions = clip_pipeline(images=batch_images, candidate_labels=valid_labels)\n",
+    "            predicted_labels = [pred[0]['label'] for pred in predictions]  # Top prediction\n",
+    "            all_predictions.extend(predicted_labels)\n",
+    "            all_true_labels.extend(valid_labels)\n",
+    "        except Exception as e:\n",
+    "            print(f\"An error occurred during prediction: {e}\")\n",
+    "            continue\n",
+    "\n",
+    "    # Calculate the accuracy for the current model\n",
+    "    correct_predictions = sum([true == pred for true, pred in zip(all_true_labels, all_predictions)])\n",
+    "    accuracy = correct_predictions / len(all_true_labels) if all_true_labels else 0\n",
+    "    print(f\"Accuracy for model {i}: {accuracy * 100:.2f}%\\n\")\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "2dd47a7b",
+   "metadata": {},
+   "source": [
+    "# Efficiency comparison"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "b7a2d41b",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration. Please open a PR/issue to update `preprocessor_config.json` to use `image_processor_type` instead of `feature_extractor_type`. This warning will be removed in v4.40.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Accuracy for model org: 73.67%\n",
+      "\n",
+      "Accuracy for each category:\n",
+      "a photo of standard bathroom: 0.92\n",
+      "a photo of standard kitchen: 0.98\n",
+      "a photo of contemporary bathroom: 0.52\n",
+      "a photo of contemporary kitchen: 0.39\n",
+      "                           Category  Accuracy\n",
+      "0      a photo of standard bathroom  0.920690\n",
+      "1       a photo of standard kitchen  0.980892\n",
+      "2  a photo of contemporary bathroom  0.516949\n",
+      "3   a photo of contemporary kitchen  0.385027\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "STAGE:2024-04-22 17:39:10 514141:514141 ActivityProfilerController.cpp:311] Completed Stage: Warm Up\n",
+      "STAGE:2024-04-22 17:39:10 514141:514141 ActivityProfilerController.cpp:317] Completed Stage: Collection\n",
+      "STAGE:2024-04-22 17:39:10 514141:514141 ActivityProfilerController.cpp:321] Completed Stage: Post Processing\n"
+     ]
+    }
+   ],
+   "source": [
+    "import pandas as pd\n",
+    "from collections import Counter\n",
+    "import os\n",
+    "import json\n",
+    "from transformers import pipeline, AutoTokenizer, AutoConfig, AutoModel, AutoImageProcessor\n",
+    "from PIL import Image, UnidentifiedImageError\n",
+    "import torch\n",
+    "import torch.profiler\n",
+    "\n",
+    "def chunks(lst, n):\n",
+    "    \"\"\"Yield successive n-sized chunks from lst.\"\"\"\n",
+    "    for i in range(0, len(lst), n):\n",
+    "        yield lst[i:i + n]\n",
+    "\n",
+    "def calculate_category_accuracy(true_labels, predicted_labels):\n",
+    "    \"\"\"Calculate the accuracy for each category and return it as a dictionary.\"\"\"\n",
+    "    accuracies = {}\n",
+    "    true_labels_counter = Counter(true_labels)\n",
+    "    correct_predictions_counter = Counter([true for true, pred in zip(true_labels, predicted_labels) if true == pred])\n",
+    "    \n",
+    "    for label in true_labels_counter:\n",
+    "        accuracy = (correct_predictions_counter[label] / true_labels_counter[label]) if label in correct_predictions_counter else 0\n",
+    "        accuracies[label] = accuracy\n",
+    "    \n",
+    "    return accuracies\n",
+    "\n",
+    "# Load the JSON data\n",
+    "with open('kb_val.json', 'r') as f:\n",
+    "    data = [json.loads(line) for line in f]\n",
+    "\n",
+    "# Extract image paths and labels\n",
+    "image_paths = [item['image'] for item in data]\n",
+    "labels = [item['caption'] for item in data]\n",
+    "\n",
+    "BATCH_SIZE = 1024  # Adjust based on your available memory\n",
+    "repo_id = \"laion/CLIP-ViT-B-32-laion2B-s34B-b79K\"\n",
+    "\n",
+    "image_processor = AutoImageProcessor.from_pretrained(repo_id)\n",
+    "tokenizer = AutoTokenizer.from_pretrained(repo_id, config=AutoConfig.from_pretrained(repo_id))\n",
+    "model = AutoModel.from_pretrained(repo_id).cuda()\n",
+    "clip_pipeline = pipeline(model=model, task=\"zero-shot-image-classification\", tokenizer=tokenizer,\n",
+    "                         device=1, image_processor=image_processor, config=AutoConfig.from_pretrained(repo_id))\n",
+    "\n",
+    "all_predictions = []\n",
+    "all_true_labels = []\n",
+    "\n",
+    "def trace_handler(profiler):\n",
+    "    print(\"Trace handler called\")\n",
+    "    try:\n",
+    "        print(profiler.key_averages().table(sort_by=\"cpu_time_total\", row_limit=10))\n",
+    "        profiler.export_chrome_trace(\"trace.json\")\n",
+    "    except Exception as e:\n",
+    "        print(f\"Error in trace handler: {str(e)}\")\n",
+    "# Start profiling\n",
+    "with torch.profiler.profile(\n",
+    "    activities=[torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA],\n",
+    "    schedule=torch.profiler.schedule(wait=1, warmup=1, active=3),\n",
+    "    on_trace_ready=trace_handler,\n",
+    "    record_shapes=True,\n",
+    "    profile_memory=True,\n",
+    "    with_stack=True\n",
+    ") as profiler:\n",
+    "    for batch_paths, batch_labels in zip(chunks(image_paths, BATCH_SIZE), chunks(labels, BATCH_SIZE)):\n",
+    "        batch_images = []\n",
+    "        valid_labels = []\n",
+    "        for path, label in zip(batch_paths, batch_labels):\n",
+    "            try:\n",
+    "                batch_images.append(Image.open(path).convert('RGB'))\n",
+    "                valid_labels.append(label)\n",
+    "            except (FileNotFoundError, UnidentifiedImageError):\n",
+    "                print(f\"Skipping file: {path}, unable to open or not found.\")\n",
+    "                continue  # Skip images that cannot be opened\n",
+    "\n",
+    "        if batch_images:  # Ensure there are images to predict\n",
+    "            predictions = clip_pipeline(images=batch_images, candidate_labels=valid_labels)\n",
+    "            predicted_labels = [pred[0]['label'] for pred in predictions]  # Top prediction\n",
+    "\n",
+    "            all_predictions.extend(predicted_labels)\n",
+    "            all_true_labels.extend(valid_labels)\n",
+    "\n",
+    "        profiler.step()  # Advance the profiler\n",
+    "\n",
+    "# Calculate and print overall accuracy\n",
+    "correct_predictions = sum([true == pred for true, pred in zip(all_true_labels, all_predictions)])\n",
+    "accuracy = correct_predictions / len(all_true_labels) if all_true_labels else 0\n",
+    "print(f\"Accuracy for model org: {accuracy * 100:.2f}%\\n\")\n",
+    "\n",
+    "# Calculate and print category accuracies\n",
+    "category_accuracies = calculate_category_accuracy(all_true_labels, all_predictions)\n",
+    "print(\"Accuracy for each category:\")\n",
+    "for category, accuracy in category_accuracies.items():\n",
+    "    print(f\"{category}: {accuracy:.2f}\")\n",
+    "\n",
+    "# Convert the dictionary to a DataFrame\n",
+    "category_accuracy_df = pd.DataFrame(list(category_accuracies.items()), columns=['Category', 'Accuracy'])\n",
+    "\n",
+    "print(category_accuracy_df)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d1595577",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "from collections import Counter\n",
+    "import os\n",
+    "import json\n",
+    "from transformers import pipeline, AutoTokenizer, AutoConfig, AutoModel, AutoImageProcessor\n",
+    "from PIL import Image, UnidentifiedImageError\n",
+    "import torch\n",
+    "import torch.profiler\n",
+    "\n",
+    "def chunks(lst, n):\n",
+    "    \"\"\"Yield successive n-sized chunks from lst.\"\"\"\n",
+    "    for i in range(0, len(lst), n):\n",
+    "        yield lst[i:i + n]\n",
+    "\n",
+    "def calculate_category_accuracy(true_labels, predicted_labels):\n",
+    "    \"\"\"Return a dict mapping each true label to the fraction of its samples predicted correctly.\n",
+    "\n",
+    "    Labels are paired positionally with zip; a label with no correct\n",
+    "    prediction maps to 0.\n",
+    "    \"\"\"\n",
+    "    totals = Counter(true_labels)\n",
+    "    hits = Counter(\n",
+    "        expected\n",
+    "        for expected, guessed in zip(true_labels, predicted_labels)\n",
+    "        if expected == guessed\n",
+    "    )\n",
+    "    return {\n",
+    "        label: (hits[label] / count if label in hits else 0)\n",
+    "        for label, count in totals.items()\n",
+    "    }\n",
+    "\n",
+    "# Load the JSON Lines data (one JSON object per line); blank lines are ignored\n",
+    "with open('kb_val.json', 'r', encoding='utf-8') as f:\n",
+    "    data = [json.loads(line) for line in f if line.strip()]\n",
+    "\n",
+    "# Extract image paths and labels\n",
+    "image_paths = [item['image'] for item in data]\n",
+    "labels = [item['caption'] for item in data]\n",
+    "\n",
+    "BATCH_SIZE = 1024  # Adjust based on your available memory\n",
+    "repo_id = \"laion/CLIP-ViT-B-32-laion2B-s34B-b79K\"\n",
+    "model_dir = \"./workspace/output/backup_ckpts/laion-finetuned_v5e7_epoch10_fold1_threshold3\"\n",
+    "\n",
+    "# The image processor and tokenizer are loaded from repo_id; the model weights and pipeline config from model_dir\n",
+    "image_processor = AutoImageProcessor.from_pretrained(repo_id)\n",
+    "tokenizer = AutoTokenizer.from_pretrained(repo_id, config=AutoConfig.from_pretrained(repo_id))\n",
+    "# NOTE(review): .cuda() targets the default CUDA device while the pipeline below is given device=1 - confirm the intended GPU\n",
+    "model = AutoModel.from_pretrained(model_dir).cuda()\n",
+    "clip_pipeline = pipeline(model=model, task=\"zero-shot-image-classification\", tokenizer=tokenizer,\n",
+    "                         device=1, image_processor=image_processor, config=AutoConfig.from_pretrained(model_dir))\n",
+    "\n",
+    "# Accumulators filled by the evaluation loop\n",
+    "all_predictions = []\n",
+    "all_true_labels = []\n",
+    "\n",
+    "def trace_handler(profiler):\n",
+    "    \"\"\"Profiler callback: print the top-10 ops by total CPU time and export a Chrome trace.\n",
+    "\n",
+    "    Any exception raised while reporting is caught and printed.\n",
+    "    \"\"\"\n",
+    "    print(\"Trace handler called\")\n",
+    "    try:\n",
+    "        summary = profiler.key_averages().table(sort_by=\"cpu_time_total\", row_limit=10)\n",
+    "        print(summary)\n",
+    "        profiler.export_chrome_trace(\"trace.json\")\n",
+    "    except Exception as err:\n",
+    "        print(f\"Error in trace handler: {err!s}\")\n",
+    "# Every class is a candidate for every image: build the unique label set once from the\n",
+    "# full label list instead of passing each batch's own (duplicated) ground-truth labels.\n",
+    "candidate_labels = sorted(set(labels))\n",
+    "\n",
+    "# Start profiling\n",
+    "with torch.profiler.profile(\n",
+    "    activities=[torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA],\n",
+    "    schedule=torch.profiler.schedule(wait=1, warmup=1, active=3),\n",
+    "    on_trace_ready=trace_handler,\n",
+    "    record_shapes=True,\n",
+    "    profile_memory=True,\n",
+    "    with_stack=True\n",
+    ") as profiler:\n",
+    "    for batch_paths, batch_labels in zip(chunks(image_paths, BATCH_SIZE), chunks(labels, BATCH_SIZE)):\n",
+    "        batch_images = []\n",
+    "        valid_labels = []\n",
+    "        for path, label in zip(batch_paths, batch_labels):\n",
+    "            try:\n",
+    "                # convert() returns a new image, so the file handle can be closed right away\n",
+    "                with Image.open(path) as img:\n",
+    "                    batch_images.append(img.convert('RGB'))\n",
+    "            except (FileNotFoundError, UnidentifiedImageError):\n",
+    "                print(f\"Skipping file: {path}, unable to open or not found.\")\n",
+    "                continue  # Skip images that cannot be opened\n",
+    "            # Keep the true label only for images that were actually loaded\n",
+    "            valid_labels.append(label)\n",
+    "\n",
+    "        if batch_images:  # Ensure there are images to predict\n",
+    "            predictions = clip_pipeline(images=batch_images, candidate_labels=candidate_labels)\n",
+    "            predicted_labels = [pred[0]['label'] for pred in predictions]  # Top prediction\n",
+    "\n",
+    "            all_predictions.extend(predicted_labels)\n",
+    "            all_true_labels.extend(valid_labels)\n",
+    "\n",
+    "        profiler.step()  # Advance the profiler\n",
+    "\n",
+    "# Calculate and print overall accuracy\n",
+    "correct_predictions = sum(true == pred for true, pred in zip(all_true_labels, all_predictions))\n",
+    "accuracy = correct_predictions / len(all_true_labels) if all_true_labels else 0\n",
+    "print(f\"Accuracy for model org: {accuracy * 100:.2f}%\\n\")\n",
+    "\n",
+    "# Calculate and print category accuracies\n",
+    "category_accuracies = calculate_category_accuracy(all_true_labels, all_predictions)\n",
+    "print(\"Accuracy for each category:\")\n",
+    "# A dedicated loop variable keeps the overall `accuracy` above from being overwritten\n",
+    "for category, category_accuracy in category_accuracies.items():\n",
+    "    print(f\"{category}: {category_accuracy:.2f}\")\n",
+    "\n",
+    "# Convert the dictionary to a DataFrame\n",
+    "category_accuracy_df = pd.DataFrame(list(category_accuracies.items()), columns=['Category', 'Accuracy'])\n",
+    "\n",
+    "print(category_accuracy_df)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5f35e96a",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "fe5592a4",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}