diff --git "a/training_testing_logs.ipynb" "b/training_testing_logs.ipynb" new file mode 100644--- /dev/null +++ "b/training_testing_logs.ipynb" @@ -0,0 +1,9585 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "a534038a", + "metadata": {}, + "source": [ + "# random test 20%" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "9041ce76", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from sklearn.model_selection import train_test_split\n", + "import json\n", + "\n", + "def prepare_data_from_dataframe(df: pd.DataFrame, output_file: str) -> str:\n", + " \"\"\"Prepare data for finetuning by reading from a DataFrame.\"\"\"\n", + " data = []\n", + "\n", + " # Process data\n", + " for _, row in df.iterrows():\n", + " image_path = f\"../data/{row['max_key']}/{row['path'].split('/')[-1]}\"\n", + " data.append({\"image\": image_path, \"caption\": row['max_key']})\n", + "\n", + " # Save the data in JSON format\n", + " with open(output_file, \"w\") as f:\n", + " for item in data:\n", + " json.dump(item, f)\n", + " f.write(\"\\n\")\n", + "\n", + " return output_file\n", + "\n", + "\n", + "# Load the CSV file\n", + "df = pd.read_csv('labels.csv')\n", + "# Filter out specific categories\n", + "df = df[df['max_key'] != 'error']\n", + "df = df[df['max_key'] != 'a photo of other indoor space: not kitchen, not bathroom, not living room, not dining room, not foyer']\n", + "df = df[df['max_key'] != 'it is a artificial photo']\n", + "df = df[df['max_key'] != 'a photo of outdoor space']\n", + "# Filter samples with max_value > 0.9\n", + "threshold_df = df[df['max_value'] > 0.9]\n", + "\n", + "# Split data into train and test sets\n", + "train_df, test_df = train_test_split(threshold_df, test_size=0.2, random_state=42)\n", + "\n", + "# Now use the train_df to prepare your training data\n", + "train_all_json = prepare_data_from_dataframe(train_df, 'train_random.json')\n", + "# And test_df to prepare your testing data\n", + 
"test_json = prepare_data_from_dataframe(test_df, 'val_random.json')\n", + "\n", + "# The function prepare_data_from_dataframe remains unchanged\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "bbbb5b9b", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration. Please open a PR/issue to update `preprocessor_config.json` to use `image_processor_type` instead of `feature_extractor_type`. This warning will be removed in v4.40.\n", + "\n", + "KeyboardInterrupt\n", + "\n" + ] + } + ], + "source": [ + "# test from org ckpt\n", + "import pandas as pd\n", + "from collections import Counter\n", + "import os\n", + "import json\n", + "from transformers import pipeline, AutoTokenizer, AutoConfig, AutoModel, AutoImageProcessor\n", + "from PIL import Image, UnidentifiedImageError\n", + "import matplotlib.pyplot as plt\n", + "\n", + "def chunks(lst, n):\n", + " \"\"\"Yield successive n-sized chunks from lst.\"\"\"\n", + " for i in range(0, len(lst), n):\n", + " yield lst[i:i + n]\n", + "\n", + "def calculate_category_accuracy(true_labels, predicted_labels):\n", + " \"\"\"Calculate the accuracy for each category and return it as a dictionary.\"\"\"\n", + " accuracies = {}\n", + " true_labels_counter = Counter(true_labels)\n", + " correct_predictions_counter = Counter([true for true, pred in zip(true_labels, predicted_labels) if true == pred])\n", + " \n", + " for label in true_labels_counter:\n", + " accuracy = (correct_predictions_counter[label] / true_labels_counter[label]) if label in correct_predictions_counter else 0\n", + " accuracies[label] = accuracy\n", + " \n", + " return accuracies\n", + "\n", + "# Load the JSON data\n", + "with open('val_random.json', 'r') as f:\n", + " data = [json.loads(line) for line in f]\n", + "\n", + "# Extract 
image paths and labels\n", + "image_paths = [item['image'] for item in data]\n", + "labels = [item['caption'] for item in data]\n", + "\n", + "BATCH_SIZE = 128 # Adjust based on your available memory\n", + "repo_id = \"laion/CLIP-ViT-B-32-laion2B-s34B-b79K\"\n", + "\n", + "\n", + "image_processor = AutoImageProcessor.from_pretrained(repo_id)\n", + "tokenizer = AutoTokenizer.from_pretrained(repo_id, config=AutoConfig.from_pretrained(repo_id))\n", + "model = AutoModel.from_pretrained(repo_id)\n", + "clip_pipeline = pipeline(model=model, task=\"zero-shot-image-classification\", tokenizer=tokenizer,\n", + " device=1, image_processor=image_processor, config=AutoConfig.from_pretrained(repo_id))\n", + "\n", + "all_predictions = []\n", + "all_true_labels = []\n", + "\n", + "for batch_paths, batch_labels in zip(chunks(image_paths, BATCH_SIZE), chunks(labels, BATCH_SIZE)):\n", + " batch_images = []\n", + " valid_labels = []\n", + " for path, label in zip(batch_paths, batch_labels):\n", + " try:\n", + " batch_images.append(Image.open(path))\n", + " valid_labels.append(label)\n", + " except (FileNotFoundError, UnidentifiedImageError):\n", + " continue # Skip images that cannot be opened\n", + "\n", + " # Get predictions for the batch of images\n", + " predictions = clip_pipeline(images=batch_images, candidate_labels=valid_labels)\n", + " predicted_labels = [pred[0]['label'] for pred in predictions] # Top prediction\n", + "\n", + " all_predictions.extend(predicted_labels)\n", + " all_true_labels.extend(valid_labels)\n", + "\n", + "correct_predictions = sum([true == pred for true, pred in zip(all_true_labels, all_predictions)])\n", + "accuracy = correct_predictions / len(all_true_labels)\n", + "print(f\"Accuracy for model org: {accuracy * 100:.2f}%\\n\")\n", + "\n", + "# Calculate the accuracy for each category\n", + "category_accuracies = calculate_category_accuracy(all_true_labels, all_predictions)\n", + "\n", + "# Print the accuracy for each category\n", + "print(\"Accuracy 
for each category:\")\n", + "for category, accuracy in category_accuracies.items():\n", + " print(f\"{category}: {accuracy:.2f}\")\n", + "\n", + "# Convert the dictionary to a DataFrame\n", + "category_accuracy_df = pd.DataFrame(list(category_accuracies.items()), columns=['Category', 'Accuracy'])\n", + "\n", + "# Save the DataFrame to a CSV file\n", + "category_accuracy_df.to_csv('category_accuracy.csv', index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "13288c96", + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay\n", + "unique_labels = list(set(all_true_labels))\n", + "# Compute the confusion matrix\n", + "cm = confusion_matrix(all_true_labels, all_predictions, labels=unique_labels)\n", + "\n", + "# Display the confusion matrix\n", + "fig, ax = plt.subplots(figsize=(10, 10)) # Adjust the size as needed\n", + "disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=unique_labels)\n", + "disp.plot(ax=ax)\n", + "\n", + "# Rotate the x-axis labels to display them vertically\n", + "plt.xticks(rotation=90)\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "ca9fb408", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Train file for this fold: train_fold_0.json\n", + "Test file for this fold: val_fold_0.json\n", + "Train file for this fold: train_fold_1.json\n", + "Test file for this fold: val_fold_1.json\n", + "Train file for this fold: train_fold_2.json\n", + "Test file for this fold: val_fold_2.json\n", + "Train file for this fold: train_fold_3.json\n", + "Test file for this fold: val_fold_3.json\n", + "Train file for this fold: train_fold_4.json\n", + "Test file for this fold: val_fold_4.json\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "from sklearn.model_selection import KFold\n", + "import json\n", + "\n", + "def prepare_data_from_dataframe(df, 
output_file):\n", + " \"\"\"Prepare data for fine-tuning by reading from a DataFrame.\"\"\"\n", + " data = []\n", + " # Process data\n", + " for _, row in df.iterrows():\n", + " image_path = f\"../data/{row['max_key']}/{row['path'].split('/')[-1]}\"\n", + " data.append({\"image\": image_path, \"caption\": row['max_key']})\n", + " # Save the data in JSON format\n", + " with open(output_file, \"w\") as f:\n", + " for item in data:\n", + " json.dump(item, f)\n", + " f.write(\"\\n\")\n", + "\n", + "def perform_k_fold(df, n_splits):\n", + " \"\"\"Perform K-fold split and data preparation, including filtering by confidence thresholds.\"\"\"\n", + " kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)\n", + " fold_data_info = []\n", + " fold_counter = 0\n", + "\n", + " for train_index, test_index in kf.split(df):\n", + " train_df, test_df = df.iloc[train_index], df.iloc[test_index]\n", + " test_json = f'val_fold_{fold_counter}.json'\n", + " prepare_data_from_dataframe(test_df, test_json)\n", + " \n", + " # Creating JSON files for each confidence threshold from 0.90 to 0.99\n", + " for threshold in range(90, 100):\n", + " threshold_df = train_df[train_df['max_value'] > threshold / 100.0]\n", + " threshold_train_json = f'train_fold_{fold_counter}_thr_{threshold}.json'\n", + " prepare_data_from_dataframe(threshold_df, threshold_train_json)\n", + " fold_data_info.append((threshold_train_json, test_json))\n", + " \n", + " fold_counter += 1\n", + "\n", + " return fold_data_info\n", + "\n", + "# Load and preprocess the DataFrame\n", + "df = pd.read_csv('labels.csv')\n", + "df = df[df['max_key'] != 'error']\n", + "df = df[df['max_key'] != 'a photo of other indoor space: not kitchen, not bathroom, not living room, not dining room, not foyer']\n", + "df = df[df['max_key'] != 'it is a artificial photo']\n", + "df = df[df['max_key'] != 'a photo of outdoor space']\n", + "df = df[df['max_value'] > 0.9]\n", + "\n", + "# Perform the 5-fold split and data preparation including 
threshold filtering\n", + "fold_files = perform_k_fold(df, 5)\n", + "\n", + "# Print out the file names for each fold and threshold\n", + "for train_file, test_file in fold_files:\n", + " print(f\"Train file for this fold and threshold: {train_file}\")\n", + " print(f\"Test file for this fold: {test_file}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "ae907a13", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/transformers/pipelines/base.py:1123: UserWarning: You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy for model on fold 0: 83.15%\n", + "\n", + "Accuracy for model on fold 1: 84.30%\n", + "\n", + " Fold Category Accuracy\n", + "0 0 a photo of contemporary foyer 0.804124\n", + "1 0 a photo of standard kitchen 1.000000\n", + "2 0 a photo of contemporary bathroom 0.667910\n", + "3 0 a photo of standard foyer 0.868794\n", + "4 0 a photo of standard bathroom 0.994197\n", + "5 0 a photo of contemporary dining room 0.952632\n", + "6 0 a photo of standard living room 0.986486\n", + "7 0 a photo of contemporary living room 0.534050\n", + "8 0 a photo of contemporary kitchen 0.689655\n", + "9 0 a photo of standard dining room 1.000000\n", + "10 0 Overall 0.831540\n", + "11 1 a photo of standard bathroom 0.988539\n", + "12 1 a photo of contemporary dining room 0.957871\n", + "13 1 a photo of contemporary living room 0.597701\n", + "14 1 a photo of standard foyer 0.845896\n", + "15 1 a photo of contemporary kitchen 0.712446\n", + "16 1 a photo of standard kitchen 0.997375\n", + "17 1 a photo of contemporary bathroom 0.725490\n", + "18 1 a photo of contemporary foyer 0.802030\n", + "19 1 a photo of standard living room 0.968153\n", + "20 1 a 
photo of standard dining room 1.000000\n", + "21 1 Overall 0.843030\n" + ] + } + ], + "source": [ + "# test from org ckpt\n", + "import pandas as pd\n", + "import numpy as np\n", + "from collections import Counter\n", + "import json\n", + "from transformers import pipeline, AutoTokenizer, AutoConfig, AutoModel, AutoImageProcessor\n", + "from PIL import Image, UnidentifiedImageError\n", + "\n", + "def chunks(lst, n):\n", + " \"\"\"Yield successive n-sized chunks from lst.\"\"\"\n", + " for i in range(0, len(lst), n):\n", + " yield lst[i:i + n]\n", + "\n", + "def calculate_category_accuracy(true_labels, predicted_labels):\n", + " \"\"\"Calculate the accuracy for each category and return it as a dictionary.\"\"\"\n", + " accuracies = {}\n", + " true_labels_counter = Counter(true_labels)\n", + " correct_predictions_counter = Counter([true for true, pred in zip(true_labels, predicted_labels) if true == pred])\n", + " \n", + " for label in true_labels_counter:\n", + " accuracy = (correct_predictions_counter[label] / true_labels_counter[label]) if label in correct_predictions_counter else 0\n", + " accuracies[label] = accuracy\n", + " \n", + " return accuracies\n", + "\n", + "# Load model components\n", + "repo_id = \"laion/CLIP-ViT-B-32-laion2B-s34B-b79K\"\n", + "image_processor = AutoImageProcessor.from_pretrained(repo_id)\n", + "tokenizer = AutoTokenizer.from_pretrained(repo_id, config=AutoConfig.from_pretrained(repo_id))\n", + "model = AutoModel.from_pretrained(repo_id)\n", + "clip_pipeline = pipeline(model=model, task=\"zero-shot-image-classification\", tokenizer=tokenizer,\n", + " device=1, image_processor=image_processor, config=AutoConfig.from_pretrained(repo_id))\n", + "\n", + "# DataFrame to store all accuracies\n", + "all_accuracies = []\n", + "\n", + "# Loop over each fold and test\n", + "for fold in range(2):\n", + " # Load the JSON data for the current fold\n", + " with open(f'val_fold_{fold}.json', 'r') as f:\n", + " data = [json.loads(line) for line in 
f]\n", + "\n", + " # Extract image paths and labels for the current fold\n", + " image_paths = [item['image'] for item in data]\n", + " labels = [item['caption'] for item in data]\n", + "\n", + " BATCH_SIZE = 128 # Adjust based on your available memory\n", + " all_predictions = []\n", + " all_true_labels = []\n", + "\n", + " for batch_paths, batch_labels in zip(chunks(image_paths, BATCH_SIZE), chunks(labels, BATCH_SIZE)):\n", + " batch_images = []\n", + " valid_labels = []\n", + " for path, label in zip(batch_paths, batch_labels):\n", + " try:\n", + " batch_images.append(Image.open(path))\n", + " valid_labels.append(label)\n", + " except (FileNotFoundError, UnidentifiedImageError):\n", + " continue # Skip images that cannot be opened\n", + "\n", + " # Get predictions for the batch of images\n", + " predictions = clip_pipeline(images=batch_images, candidate_labels=valid_labels)\n", + " predicted_labels = [pred[0]['label'] for pred in predictions] # Top prediction\n", + "\n", + " all_predictions.extend(predicted_labels)\n", + " all_true_labels.extend(valid_labels)\n", + "\n", + " correct_predictions = sum([true == pred for true, pred in zip(all_true_labels, all_predictions)])\n", + " accuracy = correct_predictions / len(all_true_labels)\n", + " print(f\"Accuracy for model on fold {fold}: {accuracy * 100:.2f}%\\n\")\n", + "\n", + " # Calculate the accuracy for each category for the current fold\n", + " category_accuracies = calculate_category_accuracy(all_true_labels, all_predictions)\n", + "\n", + " # Store the accuracies in a list of dictionaries\n", + " for category, acc in category_accuracies.items():\n", + " all_accuracies.append({'Fold': fold, 'Category': category, 'Accuracy': acc})\n", + " # Add overall accuracy for the current fold to the list\n", + " all_accuracies.append({'Fold': fold, 'Category': 'Overall', 'Accuracy': accuracy})\n", + "\n", + "# Convert the list of dictionaries to a DataFrame\n", + "all_accuracies_df = pd.DataFrame(all_accuracies)\n", + 
"\n", + "# Save the DataFrame to a CSV file\n", + "all_accuracies_df.to_csv('./result/org_results.csv', index=False)\n", + "\n", + "# Print out the final DataFrame\n", + "print(all_accuracies_df)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "87a5f0e5", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Finetuning laion/CLIP-ViT-B-32-laion2B-s34B-b79K for fold 0, threshold > 0.90, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_fold0_threshold1.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "5fdf3bf8356042f0bae130c32b271de2", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading data files: 0%| | 0/2 [00:00.filter_corrupt_images at 0x7fe5b8694dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n", + "04/16/2024 15:32:44 - WARNING - datasets.fingerprint - Parameter 'function'=.filter_corrupt_images at 0x7fe5b8694dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. 
Subsequent hashing failures won't be showed.\n", + "Filter: 100%|███████████████████| 18410/18410 [00:00<00:00, 27724.21 examples/s]\n", + "Running tokenizer on train dataset: 100%|█| 18185/18185 [00:00<00:00, 19921.85 e\n", + " 0%| | 0/470 [00:00 0.91, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_fold0_threshold2.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "38c113eb1dfe4d9ebf7b32f3326376e8", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading data files: 0%| | 0/2 [00:00.filter_corrupt_images at 0x7f7777490dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n", + "04/16/2024 16:56:13 - WARNING - datasets.fingerprint - Parameter 'function'=.filter_corrupt_images at 0x7f7777490dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. 
Subsequent hashing failures won't be showed.\n", + "Filter: 100%|███████████████████| 16981/16981 [00:00<00:00, 27929.24 examples/s]\n", + "Running tokenizer on train dataset: 100%|█| 16772/16772 [00:00<00:00, 19787.47 e\n", + " 0%| | 0/430 [00:00 0.92, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_fold0_threshold3.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "bee210fcdc8c42328c06ffefbff9fcbc", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading data files: 0%| | 0/2 [00:00.filter_corrupt_images at 0x7fb1052e0dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n", + "04/16/2024 18:12:10 - WARNING - datasets.fingerprint - Parameter 'function'=.filter_corrupt_images at 0x7fb1052e0dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. 
Subsequent hashing failures won't be showed.\n", + "Filter: 100%|███████████████████| 15451/15451 [00:00<00:00, 28962.66 examples/s]\n", + "Running tokenizer on train dataset: 100%|█| 15258/15258 [00:00<00:00, 21247.67 e\n", + " 0%| | 0/390 [00:00 0.93, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_fold0_threshold4.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "83665d03d3e64c3abd4cfaa3caa82971", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading data files: 0%| | 0/2 [00:00.filter_corrupt_images at 0x7f7ea3324ee0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n", + "04/16/2024 19:18:46 - WARNING - datasets.fingerprint - Parameter 'function'=.filter_corrupt_images at 0x7f7ea3324ee0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. 
Subsequent hashing failures won't be showed.\n", + "Filter: 100%|███████████████████| 13850/13850 [00:00<00:00, 29143.26 examples/s]\n", + "Running tokenizer on train dataset: 100%|█| 13679/13679 [00:00<00:00, 21199.23 e\n", + " 0%| | 0/350 [00:00 0.94, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_fold0_threshold5.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "7157deff57a74b6c8b5b6503962ba4e2", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading data files: 0%| | 0/2 [00:00.filter_corrupt_images at 0x7f1718ff8dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n", + "04/16/2024 20:18:38 - WARNING - datasets.fingerprint - Parameter 'function'=.filter_corrupt_images at 0x7f1718ff8dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. 
Subsequent hashing failures won't be showed.\n", + "Filter: 100%|███████████████████| 12273/12273 [00:00<00:00, 29537.34 examples/s]\n", + "Running tokenizer on train dataset: 100%|█| 12122/12122 [00:00<00:00, 21387.33 e\n", + " 0%| | 0/310 [00:00 0.95, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_fold0_threshold6.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "f0cc8510c99a4235b0d377829345281d", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading data files: 0%| | 0/2 [00:00.filter_corrupt_images at 0x7f50600b4dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n", + "04/16/2024 21:11:52 - WARNING - datasets.fingerprint - Parameter 'function'=.filter_corrupt_images at 0x7f50600b4dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. 
Subsequent hashing failures won't be showed.\n", + "Filter: 100%|███████████████████| 10547/10547 [00:00<00:00, 29168.34 examples/s]\n", + "Running tokenizer on train dataset: 100%|█| 10415/10415 [00:00<00:00, 21648.30 e\n", + " 0%| | 0/270 [00:00 0.96, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_fold0_threshold7.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "1274e9aa864a4e679121fd49f1037bce", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading data files: 0%| | 0/2 [00:00.filter_corrupt_images at 0x7fbd12afcdc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n", + "04/16/2024 21:58:06 - WARNING - datasets.fingerprint - Parameter 'function'=.filter_corrupt_images at 0x7fbd12afcdc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. 
Subsequent hashing failures won't be showed.\n", + "Filter: 100%|█████████████████████| 8641/8641 [00:00<00:00, 28738.00 examples/s]\n", + "Running tokenizer on train dataset: 100%|█| 8534/8534 [00:00<00:00, 21546.58 exa\n", + " 0%| | 0/220 [00:00 0.97, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_fold0_threshold8.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "0affb46f22bb4a4c82563fd4ed0e7bd1", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading data files: 0%| | 0/2 [00:00.filter_corrupt_images at 0x7facd6864dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n", + "04/16/2024 22:35:49 - WARNING - datasets.fingerprint - Parameter 'function'=.filter_corrupt_images at 0x7facd6864dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. 
Subsequent hashing failures won't be showed.\n", + "Filter: 100%|█████████████████████| 6532/6532 [00:00<00:00, 28798.38 examples/s]\n", + "Running tokenizer on train dataset: 100%|█| 6445/6445 [00:00<00:00, 21433.02 exa\n", + " 0%| | 0/160 [00:00 0.98, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_fold0_threshold9.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "e23065ebd3f7413798cb3acc2afad5d2", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading data files: 0%| | 0/2 [00:00.filter_corrupt_images at 0x7f85485ccdc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n", + "04/16/2024 23:03:24 - WARNING - datasets.fingerprint - Parameter 'function'=.filter_corrupt_images at 0x7f85485ccdc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. 
Subsequent hashing failures won't be showed.\n", + "Filter: 100%|█████████████████████| 4144/4144 [00:00<00:00, 27507.68 examples/s]\n", + "Running tokenizer on train dataset: 100%|█| 4101/4101 [00:00<00:00, 21094.17 exa\n", + " 0%| | 0/100 [00:00 0.99, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_fold0_threshold10.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "3bc856c2880f42b18306c6b675e2dfed", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading data files: 0%| | 0/2 [00:00.filter_corrupt_images at 0x7fe4dc188dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n", + "04/16/2024 23:20:55 - WARNING - datasets.fingerprint - Parameter 'function'=.filter_corrupt_images at 0x7fe4dc188dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. 
Subsequent hashing failures won't be showed.\n", + "Filter: 100%|█████████████████████| 1382/1382 [00:00<00:00, 24913.73 examples/s]\n", + "Running tokenizer on train dataset: 100%|█| 1366/1366 [00:00<00:00, 19082.86 exa\n", + " 0%| | 0/30 [00:00 0.90, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_fold1_threshold1.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "8c21d6aaa83b4d5c81f627e540fd2c6e", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading data files: 0%| | 0/2 [00:00.filter_corrupt_images at 0x7f4140184dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n", + "04/16/2024 23:26:17 - WARNING - datasets.fingerprint - Parameter 'function'=.filter_corrupt_images at 0x7f4140184dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. 
Subsequent hashing failures won't be showed.\n", + "Filter: 100%|███████████████████| 18410/18410 [00:00<00:00, 29787.21 examples/s]\n", + "Running tokenizer on train dataset: 100%|█| 18195/18195 [00:00<00:00, 21378.62 e\n", + " 0%| | 0/470 [00:00 0.91, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_fold1_threshold2.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "6f222218592945ec81ec10a459b8911f", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading data files: 0%| | 0/2 [00:00.filter_corrupt_images at 0x7fd981c54dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n", + "04/17/2024 00:47:08 - WARNING - datasets.fingerprint - Parameter 'function'=.filter_corrupt_images at 0x7fd981c54dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. 
Subsequent hashing failures won't be showed.\n", + "Filter: 100%|███████████████████| 16909/16909 [00:00<00:00, 29673.49 examples/s]\n", + "Running tokenizer on train dataset: 100%|█| 16715/16715 [00:00<00:00, 21582.93 e\n", + " 0%| | 0/430 [00:00 0.92, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_fold1_threshold3.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "17eb26e0a1a44ecb9b3baed49b93d356", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading data files: 0%| | 0/2 [00:00.filter_corrupt_images at 0x7fb794454dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n", + "04/17/2024 02:00:51 - WARNING - datasets.fingerprint - Parameter 'function'=.filter_corrupt_images at 0x7fb794454dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. 
Subsequent hashing failures won't be showed.\n", + "Filter: 100%|███████████████████| 15411/15411 [00:00<00:00, 29610.96 examples/s]\n", + "Running tokenizer on train dataset: 100%|█| 15237/15237 [00:00<00:00, 21476.83 e\n", + " 0%| | 0/390 [00:00 0.93, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_fold1_threshold4.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "12eef0873845493f8e1df68e0fc76a78", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading data files: 0%| | 0/2 [00:00.filter_corrupt_images at 0x7f75445e8dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n", + "04/17/2024 03:07:48 - WARNING - datasets.fingerprint - Parameter 'function'=.filter_corrupt_images at 0x7f75445e8dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. 
Subsequent hashing failures won't be showed.\n", + "Filter: 100%|███████████████████| 13823/13823 [00:00<00:00, 29426.80 examples/s]\n", + "Running tokenizer on train dataset: 100%|█| 13667/13667 [00:00<00:00, 21484.43 e\n", + " 0%| | 0/350 [00:00 0.94, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_fold1_threshold5.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "e6af79e64d0c4ff093bbbcabc25c028a", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading data files: 0%| | 0/2 [00:00.filter_corrupt_images at 0x7ff0c1b6cdc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n", + "04/17/2024 04:08:05 - WARNING - datasets.fingerprint - Parameter 'function'=.filter_corrupt_images at 0x7ff0c1b6cdc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. 
Subsequent hashing failures won't be showed.\n", + "Filter: 100%|███████████████████| 12236/12236 [00:00<00:00, 29435.37 examples/s]\n", + "Running tokenizer on train dataset: 100%|█| 12097/12097 [00:00<00:00, 21528.46 e\n", + " 0%| | 0/310 [00:00 0.95, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_fold1_threshold6.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "b4a604a4942741968bd5eba3ba4491a6", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading data files: 0%| | 0/2 [00:00.filter_corrupt_images at 0x7fab9bd64dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n", + "04/17/2024 05:01:26 - WARNING - datasets.fingerprint - Parameter 'function'=.filter_corrupt_images at 0x7fab9bd64dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. 
Subsequent hashing failures won't be showed.\n", + "Filter: 100%|███████████████████| 10506/10506 [00:00<00:00, 29337.22 examples/s]\n", + "Running tokenizer on train dataset: 100%|█| 10385/10385 [00:00<00:00, 21456.00 e\n", + " 0%| | 0/270 [00:00 0.96, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_fold1_threshold7.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "0eae5263eb204eb48b0680faf043a3ad", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading data files: 0%| | 0/2 [00:00.filter_corrupt_images at 0x7f6b9a964dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n", + "04/17/2024 05:47:56 - WARNING - datasets.fingerprint - Parameter 'function'=.filter_corrupt_images at 0x7f6b9a964dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. 
Subsequent hashing failures won't be showed.\n", + "Filter: 100%|█████████████████████| 8620/8620 [00:00<00:00, 29092.75 examples/s]\n", + "Running tokenizer on train dataset: 100%|█| 8522/8522 [00:00<00:00, 21526.17 exa\n", + " 0%| | 0/220 [00:00 0.97, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_fold1_threshold8.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "7b341f15b4ba4bd693267d935409342c", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading data files: 0%| | 0/2 [00:00.filter_corrupt_images at 0x7fa42c540dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n", + "04/17/2024 06:25:54 - WARNING - datasets.fingerprint - Parameter 'function'=.filter_corrupt_images at 0x7fa42c540dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. 
Subsequent hashing failures won't be showed.\n", + "Filter: 100%|█████████████████████| 6483/6483 [00:00<00:00, 28558.51 examples/s]\n", + "Running tokenizer on train dataset: 100%|█| 6407/6407 [00:00<00:00, 21230.77 exa\n", + " 0%| | 0/160 [00:00 0.98, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_fold1_threshold9.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "9791e95e1cac4b96a2780146c771a15b", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading data files: 0%| | 0/2 [00:00.filter_corrupt_images at 0x7f334c220dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n", + "04/17/2024 06:53:36 - WARNING - datasets.fingerprint - Parameter 'function'=.filter_corrupt_images at 0x7f334c220dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. 
Subsequent hashing failures won't be showed.\n", + "Filter: 100%|█████████████████████| 4130/4130 [00:00<00:00, 27951.33 examples/s]\n", + "Running tokenizer on train dataset: 100%|█| 4098/4098 [00:00<00:00, 21248.01 exa\n", + " 0%| | 0/100 [00:00 0.99, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_fold1_threshold10.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "800bcb4797c6476ab149f9a6d7b0ac9a", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading data files: 0%| | 0/2 [00:00.filter_corrupt_images at 0x7fc16ff28dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n", + "04/17/2024 07:11:08 - WARNING - datasets.fingerprint - Parameter 'function'=.filter_corrupt_images at 0x7fc16ff28dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. 
Subsequent hashing failures won't be showed.\n", + "Filter: 100%|█████████████████████| 1366/1366 [00:00<00:00, 24988.85 examples/s]\n", + "Running tokenizer on train dataset: 100%|█| 1357/1357 [00:00<00:00, 20474.15 exa\n", + " 0%| | 0/30 [00:00 0.90, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_fold2_threshold1.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "90681f289d6d4fed89378db98f216ecc", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading data files: 0%| | 0/2 [00:00.filter_corrupt_images at 0x7fdbb9f04dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n", + "04/17/2024 07:16:32 - WARNING - datasets.fingerprint - Parameter 'function'=.filter_corrupt_images at 0x7fdbb9f04dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. 
Subsequent hashing failures won't be showed.\n", + "Filter: 100%|███████████████████| 18410/18410 [00:00<00:00, 29189.51 examples/s]\n", + "Running tokenizer on train dataset: 100%|█| 18200/18200 [00:00<00:00, 21406.86 e\n", + " 0%| | 0/470 [00:00 0.91, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_fold2_threshold2.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "aaf9a0edbf214eeb86c4542ace684120", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading data files: 0%| | 0/2 [00:00.filter_corrupt_images at 0x7f6608794dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n", + "04/17/2024 08:37:10 - WARNING - datasets.fingerprint - Parameter 'function'=.filter_corrupt_images at 0x7f6608794dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. 
Subsequent hashing failures won't be showed.\n", + "Filter: 100%|███████████████████| 16939/16939 [00:00<00:00, 29678.72 examples/s]\n", + "Running tokenizer on train dataset: 100%|█| 16749/16749 [00:00<00:00, 21520.55 e\n", + " 0%| | 0/430 [00:00 0.92, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_fold2_threshold3.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "b5fb3e24d752404ab6285054e9599642", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading data files: 0%| | 0/2 [00:00.filter_corrupt_images at 0x7f5408ec8dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n", + "04/17/2024 09:50:42 - WARNING - datasets.fingerprint - Parameter 'function'=.filter_corrupt_images at 0x7f5408ec8dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. 
Subsequent hashing failures won't be showed.\n", + "Filter: 100%|███████████████████| 15439/15439 [00:00<00:00, 28950.30 examples/s]\n", + "Running tokenizer on train dataset: 100%|█| 15263/15263 [00:00<00:00, 21450.38 e\n", + " 0%| | 0/390 [00:00 0.93, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_fold2_threshold4.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "84d9f68808ee4443b5276d9dd47eadd0", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading data files: 0%| | 0/2 [00:00.filter_corrupt_images at 0x7fb739f48dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n", + "04/17/2024 10:58:07 - WARNING - datasets.fingerprint - Parameter 'function'=.filter_corrupt_images at 0x7fb739f48dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. 
Subsequent hashing failures won't be showed.\n", + "Filter: 100%|███████████████████| 13827/13827 [00:00<00:00, 28941.77 examples/s]\n", + "Running tokenizer on train dataset: 100%|█| 13671/13671 [00:00<00:00, 21331.84 e\n", + " 0%| | 0/350 [00:00 0.94, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_fold2_threshold5.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "84349bd58f7d4d7fa601966cf7d860bb", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading data files: 0%| | 0/2 [00:00.filter_corrupt_images at 0x7f30bc114dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n", + "04/17/2024 12:02:38 - WARNING - datasets.fingerprint - Parameter 'function'=.filter_corrupt_images at 0x7f30bc114dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. 
Subsequent hashing failures won't be showed.\n", + "Filter: 100%|███████████████████| 12224/12224 [00:00<00:00, 28843.71 examples/s]\n", + "Running tokenizer on train dataset: 100%|█| 12087/12087 [00:00<00:00, 20386.48 e\n", + " 0%| | 0/310 [00:00 0.95, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_fold2_threshold6.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "a16022848e524c01bbd1745df4ea9ffb", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading data files: 0%| | 0/2 [00:00.filter_corrupt_images at 0x7f0a80500dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n", + "04/17/2024 12:57:05 - WARNING - datasets.fingerprint - Parameter 'function'=.filter_corrupt_images at 0x7f0a80500dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. 
Subsequent hashing failures won't be showed.\n", + "Filter: 100%|███████████████████| 10488/10488 [00:00<00:00, 28834.60 examples/s]\n", + "Running tokenizer on train dataset: 100%|█| 10369/10369 [00:00<00:00, 21240.58 e\n", + " 0%| | 0/270 [00:00 0.96, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_fold2_threshold7.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "a3ad1ffc64204f17af1b1161696b5f06", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading data files: 0%| | 0/2 [00:00.filter_corrupt_images at 0x7f0ca4de8dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n", + "04/17/2024 13:45:59 - WARNING - datasets.fingerprint - Parameter 'function'=.filter_corrupt_images at 0x7f0ca4de8dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. 
Subsequent hashing failures won't be showed.\n", + "Filter: 100%|█████████████████████| 8586/8586 [00:00<00:00, 27862.04 examples/s]\n", + "Running tokenizer on train dataset: 100%|█| 8492/8492 [00:00<00:00, 20458.35 exa\n", + " 0%| | 0/220 [00:00 0.97, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_fold2_threshold8.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "674cde2bd4314d26b4de3866f7a9e791", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading data files: 0%| | 0/2 [00:00.filter_corrupt_images at 0x7f7763730dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n", + "04/17/2024 14:26:02 - WARNING - datasets.fingerprint - Parameter 'function'=.filter_corrupt_images at 0x7f7763730dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. 
Subsequent hashing failures won't be showed.\n", + "Filter: 100%|█████████████████████| 6497/6497 [00:00<00:00, 27474.72 examples/s]\n", + "Running tokenizer on train dataset: 100%|█| 6425/6425 [00:00<00:00, 19865.38 exa\n", + " 0%| | 0/160 [00:00 0.98, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_fold2_threshold9.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "217c9582f07a430b996e7c682c0c2427", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading data files: 0%| | 0/2 [00:00.filter_corrupt_images at 0x7f8c8f214dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n", + "04/17/2024 14:55:34 - WARNING - datasets.fingerprint - Parameter 'function'=.filter_corrupt_images at 0x7f8c8f214dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. 
Subsequent hashing failures won't be showed.\n", + "Filter: 100%|█████████████████████| 4151/4151 [00:00<00:00, 25735.84 examples/s]\n", + "Running tokenizer on train dataset: 100%|█| 4118/4118 [00:00<00:00, 18134.09 exa\n", + " 0%| | 0/100 [00:00 0.99, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_fold2_threshold10.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "df243dbdb303470cbbf92dc21b89a011", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading data files: 0%| | 0/2 [00:00.filter_corrupt_images at 0x7fc02cd34dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n", + "04/17/2024 15:14:00 - WARNING - datasets.fingerprint - Parameter 'function'=.filter_corrupt_images at 0x7fc02cd34dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. 
Subsequent hashing failures won't be showed.\n", + "Filter: 100%|█████████████████████| 1357/1357 [00:00<00:00, 25011.41 examples/s]\n", + "Running tokenizer on train dataset: 100%|█| 1346/1346 [00:00<00:00, 19520.80 exa\n", + " 0%| | 0/30 [00:00 0.90, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_fold3_threshold1.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "a92feebeb8984168a4d7bfd1308c7a1c", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading data files: 0%| | 0/2 [00:00.filter_corrupt_images at 0x7f2db0ff8dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n", + "04/17/2024 15:19:31 - WARNING - datasets.fingerprint - Parameter 'function'=.filter_corrupt_images at 0x7f2db0ff8dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. 
Subsequent hashing failures won't be showed.\n", + "Filter: 100%|███████████████████| 18411/18411 [00:00<00:00, 27826.86 examples/s]\n", + "Running tokenizer on train dataset: 100%|█| 18189/18189 [00:00<00:00, 20605.62 e\n", + " 0%| | 0/470 [00:00 0.91, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_fold3_threshold2.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "54089abc4acd42159cc4fd0ff6a2e7da", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading data files: 0%| | 0/2 [00:00.filter_corrupt_images at 0x7fe3000d4dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n", + "04/17/2024 16:44:10 - WARNING - datasets.fingerprint - Parameter 'function'=.filter_corrupt_images at 0x7fe3000d4dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. 
Subsequent hashing failures won't be showed.\n", + "Filter: 100%|███████████████████| 16936/16936 [00:00<00:00, 28639.94 examples/s]\n", + "Running tokenizer on train dataset: 100%|█| 16736/16736 [00:00<00:00, 20420.36 e\n", + " 0%| | 0/430 [00:00 0.92, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_fold3_threshold3.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "a0b05d1fb6a54fd3ae19087a28b4570c", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading data files: 0%| | 0/2 [00:00.filter_corrupt_images at 0x7fc20a414dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n", + "04/17/2024 18:01:35 - WARNING - datasets.fingerprint - Parameter 'function'=.filter_corrupt_images at 0x7fc20a414dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. 
Subsequent hashing failures won't be showed.\n", + "Filter: 100%|███████████████████| 15422/15422 [00:00<00:00, 29274.55 examples/s]\n", + "Running tokenizer on train dataset: 100%|█| 15240/15240 [00:00<00:00, 20987.70 e\n", + " 0%| | 0/390 [00:00 0.93, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_fold3_threshold4.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "96b0744bcdb94f0c95b2dd38720b8a2e", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading data files: 0%| | 0/2 [00:00.filter_corrupt_images at 0x7f04b3e38dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n", + "04/17/2024 19:09:35 - WARNING - datasets.fingerprint - Parameter 'function'=.filter_corrupt_images at 0x7f04b3e38dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. 
Subsequent hashing failures won't be showed.\n", + "Filter: 100%|███████████████████| 13849/13849 [00:00<00:00, 28925.33 examples/s]\n", + "Running tokenizer on train dataset: 100%|█| 13688/13688 [00:00<00:00, 21104.87 e\n", + " 0%| | 0/350 [00:00 0.94, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_fold3_threshold5.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "b8edfee587dc4346a6172beb21b42507", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading data files: 0%| | 0/2 [00:00.filter_corrupt_images at 0x7f48437e0dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n", + "04/17/2024 20:10:34 - WARNING - datasets.fingerprint - Parameter 'function'=.filter_corrupt_images at 0x7f48437e0dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. 
Subsequent hashing failures won't be showed.\n", + "Filter: 100%|███████████████████| 12276/12276 [00:00<00:00, 29087.75 examples/s]\n", + "Running tokenizer on train dataset: 100%|█| 12136/12136 [00:00<00:00, 21124.55 e\n", + " 0%| | 0/310 [00:00 0.95, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_fold3_threshold6.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "b554835e6cf04eb9b0ba600032a97241", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading data files: 0%| | 0/2 [00:00.filter_corrupt_images at 0x7fe8ad57cdc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n", + "04/17/2024 21:04:48 - WARNING - datasets.fingerprint - Parameter 'function'=.filter_corrupt_images at 0x7fe8ad57cdc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. 
Subsequent hashing failures won't be showed.\n", + "Filter: 100%|█████████████��█████| 10542/10542 [00:00<00:00, 28562.65 examples/s]\n", + "Running tokenizer on train dataset: 100%|█| 10420/10420 [00:00<00:00, 21241.45 e\n", + " 0%| | 0/270 [00:00 0.96, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_fold3_threshold7.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "0d313401328441408ea1138c9c20a9fe", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading data files: 0%| | 0/2 [00:00.filter_corrupt_images at 0x7f4e1b01cdc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n", + "04/17/2024 21:51:49 - WARNING - datasets.fingerprint - Parameter 'function'=.filter_corrupt_images at 0x7f4e1b01cdc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. 
Subsequent hashing failures won't be showed.\n", + "Filter: 100%|█████████████████████| 8644/8644 [00:00<00:00, 28711.30 examples/s]\n", + "Running tokenizer on train dataset: 100%|█| 8546/8546 [00:00<00:00, 21136.89 exa\n", + " 0%| | 0/220 [00:00 0.97, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_fold3_threshold8.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "07e4b9e62ab74cd9840b19cc4366f7f5", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading data files: 0%| | 0/2 [00:00.filter_corrupt_images at 0x7f539c220dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n", + "04/17/2024 22:30:16 - WARNING - datasets.fingerprint - Parameter 'function'=.filter_corrupt_images at 0x7f539c220dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. 
Subsequent hashing failures won't be showed.\n", + "Filter: 100%|█████████████████████| 6509/6509 [00:00<00:00, 28510.06 examples/s]\n", + "Running tokenizer on train dataset: 100%|█| 6434/6434 [00:00<00:00, 21116.37 exa\n", + " 0%| | 0/160 [00:00 0.98, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_fold3_threshold9.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "b00d47a587bb4172ae9c6a136227ebfd", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading data files: 0%| | 0/2 [00:00.filter_corrupt_images at 0x7f1af9e20dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n", + "04/17/2024 22:58:19 - WARNING - datasets.fingerprint - Parameter 'function'=.filter_corrupt_images at 0x7f1af9e20dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. 
Subsequent hashing failures won't be showed.\n", + "Filter: 100%|█████████████████████| 4145/4145 [00:00<00:00, 27460.08 examples/s]\n", + "Running tokenizer on train dataset: 100%|█| 4104/4104 [00:00<00:00, 21182.78 exa\n", + " 0%| | 0/100 [00:00 0.99, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_fold3_threshold10.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "39d2705fe798440b8a5ef596a2fa15b2", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading data files: 0%| | 0/2 [00:00.filter_corrupt_images at 0x7fe0d14bcdc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n", + "04/17/2024 23:16:10 - WARNING - datasets.fingerprint - Parameter 'function'=.filter_corrupt_images at 0x7fe0d14bcdc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. 
Subsequent hashing failures won't be showed.\n", + "Filter: 100%|█████████████████████| 1384/1384 [00:00<00:00, 25025.29 examples/s]\n", + "Running tokenizer on train dataset: 100%|█| 1369/1369 [00:00<00:00, 20654.61 exa\n", + " 0%| | 0/30 [00:00 0.90, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_fold4_threshold1.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "6f4a151362d146e08df5fd3919da8118", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading data files: 0%| | 0/2 [00:00.filter_corrupt_images at 0x7fb4d2cf8dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n", + "04/17/2024 23:21:38 - WARNING - datasets.fingerprint - Parameter 'function'=.filter_corrupt_images at 0x7fb4d2cf8dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. 
Subsequent hashing failures won't be showed.\n", + "Filter: 100%|███████████████████| 18411/18411 [00:00<00:00, 28710.80 examples/s]\n", + "Running tokenizer on train dataset: 100%|█| 18191/18191 [00:00<00:00, 21283.84 e\n", + " 0%| | 0/470 [00:00 0.91, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_fold4_threshold2.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "f92e4eb759394314bd8620b78191f7cf", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading data files: 0%| | 0/2 [00:00.filter_corrupt_images at 0x7faac3c70dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n", + "04/18/2024 00:43:53 - WARNING - datasets.fingerprint - Parameter 'function'=.filter_corrupt_images at 0x7faac3c70dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. 
Subsequent hashing failures won't be showed.\n", + "Filter: 100%|███████████████████| 16919/16919 [00:00<00:00, 29294.57 examples/s]\n", + "Running tokenizer on train dataset: 100%|█| 16720/16720 [00:00<00:00, 21354.41 e\n", + " 0%| | 0/430 [00:00 0.92, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_fold4_threshold3.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "a7cd324457d4430da0fa7caf432058ff", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading data files: 0%| | 0/2 [00:00.filter_corrupt_images at 0x7f9801398dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n", + "04/18/2024 01:59:04 - WARNING - datasets.fingerprint - Parameter 'function'=.filter_corrupt_images at 0x7f9801398dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. 
Subsequent hashing failures won't be showed.\n", + "Filter: 100%|███████████████████| 15445/15445 [00:00<00:00, 29273.29 examples/s]\n", + "Running tokenizer on train dataset: 100%|█| 15266/15266 [00:00<00:00, 21215.94 e\n", + " 0%| | 0/390 [00:00 0.93, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_fold4_threshold4.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "c2ca7462c8b84596af5273905bf5ec4a", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading data files: 0%| | 0/2 [00:00.filter_corrupt_images at 0x7fcd6064cee0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n", + "04/18/2024 03:07:15 - WARNING - datasets.fingerprint - Parameter 'function'=.filter_corrupt_images at 0x7fcd6064cee0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. 
Subsequent hashing failures won't be showed.\n", + "Filter: 100%|███████████████████| 13827/13827 [00:00<00:00, 29146.08 examples/s]\n", + "Running tokenizer on train dataset: 100%|█| 13667/13667 [00:00<00:00, 21224.77 e\n", + " 0%| | 0/350 [00:00 0.94, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_fold4_threshold5.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "7b03dabaa81e4ca89266b36908751a4c", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading data files: 0%| | 0/2 [00:00.filter_corrupt_images at 0x7f5372268dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n", + "04/18/2024 04:09:02 - WARNING - datasets.fingerprint - Parameter 'function'=.filter_corrupt_images at 0x7f5372268dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. 
Subsequent hashing failures won't be showed.\n", + "Filter: 100%|███████████████████| 12231/12231 [00:00<00:00, 28084.34 examples/s]\n", + "Running tokenizer on train dataset: 100%|█| 12086/12086 [00:00<00:00, 20686.81 e\n", + " 0%| | 0/310 [00:00 0.95, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_fold4_threshold6.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "8f100fadedbf414e90eefbe02355fa8a", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading data files: 0%| | 0/2 [00:00.filter_corrupt_images at 0x7f1fb9b9cdc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n", + "04/18/2024 05:03:53 - WARNING - datasets.fingerprint - Parameter 'function'=.filter_corrupt_images at 0x7f1fb9b9cdc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. 
Subsequent hashing failures won't be showed.\n", + "Filter: 100%|███████████████████| 10517/10517 [00:00<00:00, 27957.56 examples/s]\n", + "Running tokenizer on train dataset: 100%|█| 10391/10391 [00:00<00:00, 20666.83 e\n", + " 0%| | 0/270 [00:00 0.96, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_fold4_threshold7.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "d1f172fb13304819985fc1fbf1d89878", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading data files: 0%| | 0/2 [00:00.filter_corrupt_images at 0x7f9227460dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n", + "04/18/2024 05:51:41 - WARNING - datasets.fingerprint - Parameter 'function'=.filter_corrupt_images at 0x7f9227460dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. 
Subsequent hashing failures won't be showed.\n", + "Filter: 100%|█████████████████████| 8637/8637 [00:00<00:00, 28058.27 examples/s]\n", + "Running tokenizer on train dataset: 100%|█| 8538/8538 [00:00<00:00, 20734.63 exa\n", + " 0%| | 0/220 [00:00 0.97, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_fold4_threshold8.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "f2b6cef4dcc84103ba104d3af2cb5c0b", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading data files: 0%| | 0/2 [00:00.filter_corrupt_images at 0x7f2a408b8dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n", + "04/18/2024 06:30:52 - WARNING - datasets.fingerprint - Parameter 'function'=.filter_corrupt_images at 0x7f2a408b8dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. 
Subsequent hashing failures won't be showed.\n", + "Filter: 100%|█████████████████████| 6555/6555 [00:00<00:00, 28287.52 examples/s]\n", + "Running tokenizer on train dataset: 100%|█| 6477/6477 [00:00<00:00, 20603.65 exa\n", + " 0%| | 0/160 [00:00 0.98, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_fold4_threshold9.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "2fbcb2ea946c4d1999c8864a215233de", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading data files: 0%| | 0/2 [00:00.filter_corrupt_images at 0x7f1ebcbacdc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n", + "04/18/2024 06:59:25 - WARNING - datasets.fingerprint - Parameter 'function'=.filter_corrupt_images at 0x7f1ebcbacdc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. 
Subsequent hashing failures won't be showed.\n", + "Filter: 100%|█████████████████████| 4218/4218 [00:00<00:00, 27364.73 examples/s]\n", + "Running tokenizer on train dataset: 100%|█| 4179/4179 [00:00<00:00, 20736.71 exa\n", + " 0%| | 0/100 [00:00 0.99, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_fold4_threshold10.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "65b69f5895424364916dc0b17d34bebf", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading data files: 0%| | 0/2 [00:00.filter_corrupt_images at 0x7fc7c1f04dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n", + "04/18/2024 07:17:26 - WARNING - datasets.fingerprint - Parameter 'function'=.filter_corrupt_images at 0x7fc7c1f04dc0> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. 
Subsequent hashing failures won't be showed.\n", + "Filter: 100%|█████████████████████| 1435/1435 [00:00<00:00, 25075.73 examples/s]\n", + "Running tokenizer on train dataset: 100%|█| 1422/1422 [00:00<00:00, 20741.93 exa\n", + " 0%| | 0/30 [00:00 {threshold:.2f}, saving output to {output_folder}.\")\n", + "\n", + " # Load dataset\n", + " data_files = {'train': train_json, 'validation': test_json}\n", + " dataset = load_dataset(\"json\", data_files=data_files)\n", + " print(f\"First image: {dataset['validation'][0]['image']}, caption: '{dataset['validation'][0]['caption']}'\")\n", + "\n", + " !python huggingface_finetune_clip.py \\\n", + " --output_dir {output_folder} --model_name_or_path {repo_id} \\\n", + " --train_file {train_json} \\\n", + " --validation_file {test_json} \\\n", + " --image_column image \\\n", + " --overwrite_output_dir=True \\\n", + " --max_seq_length=77 \\\n", + " --num_train_epochs=10 \\\n", + " --save_total_limit=5 \\\n", + " --caption_column caption \\\n", + " --remove_unused_columns=False \\\n", + " --do_train \\\n", + " --logging_strategy=\"epoch\"\\\n", + " --per_device_train_batch_size=128 \\\n", + " --dataloader_drop_last=True\\\n", + " --learning_rate=\"1e-6\" --warmup_steps=\"0\" --weight_decay 0.1 \n", + " print(f\"--\\nDONE. If it worked, trained data should be in {output_folder}\\n\")" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "511f7306", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Evaluating with model from ./workspace/output/backup_ckpts/laion-finetuned_v5e7_epoch10_fold0_threshold1...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/transformers/pipelines/base.py:1123: UserWarning: You seem to be using the pipelines sequentially on GPU. 
In order to maximize efficiency please use a dataset\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy for model on fold 0 with threshold 0.91: 96.96%\n", + "\n", + "--\n", + "DONE\n", + "\n", + "Evaluating with model from ./workspace/output/backup_ckpts/laion-finetuned_v5e7_epoch10_fold0_threshold2...\n", + "Accuracy for model on fold 0 with threshold 0.92: 97.10%\n", + "\n", + "--\n", + "DONE\n", + "\n", + "Evaluating with model from ./workspace/output/backup_ckpts/laion-finetuned_v5e7_epoch10_fold0_threshold3...\n", + "Accuracy for model on fold 0 with threshold 0.93: 96.72%\n", + "\n", + "--\n", + "DONE\n", + "\n", + "Evaluating with model from ./workspace/output/backup_ckpts/laion-finetuned_v5e7_epoch10_fold0_threshold4...\n", + "Accuracy for model on fold 0 with threshold 0.94: 96.40%\n", + "\n", + "--\n", + "DONE\n", + "\n", + "Evaluating with model from ./workspace/output/backup_ckpts/laion-finetuned_v5e7_epoch10_fold0_threshold5...\n", + "Accuracy for model on fold 0 with threshold 0.95: 96.40%\n", + "\n", + "--\n", + "DONE\n", + "\n", + "Evaluating with model from ./workspace/output/backup_ckpts/laion-finetuned_v5e7_epoch10_fold0_threshold6...\n", + "Accuracy for model on fold 0 with threshold 0.96: 95.84%\n", + "\n", + "--\n", + "DONE\n", + "\n", + "Evaluating with model from ./workspace/output/backup_ckpts/laion-finetuned_v5e7_epoch10_fold0_threshold7...\n", + "Accuracy for model on fold 0 with threshold 0.97: 95.79%\n", + "\n", + "--\n", + "DONE\n", + "\n", + "Evaluating with model from ./workspace/output/backup_ckpts/laion-finetuned_v5e7_epoch10_fold0_threshold8...\n", + "Accuracy for model on fold 0 with threshold 0.98: 94.62%\n", + "\n", + "--\n", + "DONE\n", + "\n", + "Evaluating with model from ./workspace/output/backup_ckpts/laion-finetuned_v5e7_epoch10_fold0_threshold9...\n", + "Accuracy for model on fold 0 with threshold 0.99: 93.12%\n", + "\n", + "--\n", + "DONE\n", + "\n", + 
"Evaluating with model from ./workspace/output/backup_ckpts/laion-finetuned_v5e7_epoch10_fold1_threshold1...\n", + "Accuracy for model on fold 1 with threshold 0.91: 97.08%\n", + "\n", + "--\n", + "DONE\n", + "\n", + "Evaluating with model from ./workspace/output/backup_ckpts/laion-finetuned_v5e7_epoch10_fold1_threshold2...\n", + "Accuracy for model on fold 1 with threshold 0.92: 97.17%\n", + "\n", + "--\n", + "DONE\n", + "\n", + "Evaluating with model from ./workspace/output/backup_ckpts/laion-finetuned_v5e7_epoch10_fold1_threshold3...\n", + "Accuracy for model on fold 1 with threshold 0.93: 98.20%\n", + "\n", + "--\n", + "DONE\n", + "\n", + "Evaluating with model from ./workspace/output/backup_ckpts/laion-finetuned_v5e7_epoch10_fold1_threshold4...\n", + "Accuracy for model on fold 1 with threshold 0.94: 96.79%\n", + "\n", + "--\n", + "DONE\n", + "\n", + "Evaluating with model from ./workspace/output/backup_ckpts/laion-finetuned_v5e7_epoch10_fold1_threshold5...\n", + "Accuracy for model on fold 1 with threshold 0.95: 96.75%\n", + "\n", + "--\n", + "DONE\n", + "\n", + "Evaluating with model from ./workspace/output/backup_ckpts/laion-finetuned_v5e7_epoch10_fold1_threshold6...\n", + "Accuracy for model on fold 1 with threshold 0.96: 96.33%\n", + "\n", + "--\n", + "DONE\n", + "\n", + "Evaluating with model from ./workspace/output/backup_ckpts/laion-finetuned_v5e7_epoch10_fold1_threshold7...\n", + "Accuracy for model on fold 1 with threshold 0.97: 95.81%\n", + "\n", + "--\n", + "DONE\n", + "\n", + "Evaluating with model from ./workspace/output/backup_ckpts/laion-finetuned_v5e7_epoch10_fold1_threshold8...\n", + "Accuracy for model on fold 1 with threshold 0.98: 94.56%\n", + "\n", + "--\n", + "DONE\n", + "\n", + "Evaluating with model from ./workspace/output/backup_ckpts/laion-finetuned_v5e7_epoch10_fold1_threshold9...\n", + "Accuracy for model on fold 1 with threshold 0.99: 92.93%\n", + "\n", + "--\n", + "DONE\n", + "\n", + "All results saved to 
model_evaluation_results.csv\n", + "All detailed prediction results saved to detailed_predictions.csv\n" + ] + } + ], + "source": [ + "import os\n", + "import json\n", + "from transformers import pipeline, AutoTokenizer, AutoConfig, AutoModel, AutoImageProcessor\n", + "from PIL import Image, UnidentifiedImageError\n", + "import matplotlib.pyplot as plt\n", + "from collections import Counter\n", + "import numpy as np\n", + "import pandas as pd\n", + "\n", + "def chunks(lst, n):\n", + " \"\"\"Yield successive n-sized chunks from lst.\"\"\"\n", + " for i in range(0, len(lst), n):\n", + " yield lst[i:i + n]\n", + "\n", + "def calculate_category_accuracy(true_labels, predicted_labels):\n", + " \"\"\"Calculate the accuracy for each category and return it as a dictionary.\"\"\"\n", + " accuracies = {}\n", + " true_labels_counter = Counter(true_labels)\n", + " correct_predictions_counter = Counter([true for true, pred in zip(true_labels, predicted_labels) if true == pred])\n", + " \n", + " for label in true_labels_counter:\n", + " accuracy = (correct_predictions_counter[label] / true_labels_counter[label]) if label in correct_predictions_counter else 0\n", + " accuracies[label] = accuracy\n", + " \n", + " return accuracies\n", + "repo_id = \"laion/CLIP-ViT-B-32-laion2B-s34B-b79K\"\n", + "BATCH_SIZE = 128 # Adjust based on your available memory\n", + "results = [] # List to hold accuracy results\n", + "\n", + "# Loop for each fold and each threshold\n", + "for fold in range(2):\n", + " for idx, threshold in enumerate(np.arange(0.91, 1.00, 0.01)):\n", + " model_dir = f\"./workspace/output/backup_ckpts/laion-finetuned_v5e7_epoch10_fold{fold}_threshold{idx+1}\"\n", + " test_json = f'val_fold_{fold}.json' # Test JSON file for the current fold\n", + " print(f\"Evaluating with model from {model_dir}...\")\n", + "\n", + " # Load the JSON data for testing\n", + " with open(test_json, 'r') as f:\n", + " data = [json.loads(line) for line in f]\n", + "\n", + " # Extract image paths 
and labels\n", + " image_paths = [item['image'] for item in data]\n", + " labels = [item['caption'] for item in data]\n", + "\n", + " # Initialize model components\n", + " image_processor = AutoImageProcessor.from_pretrained(repo_id)\n", + " tokenizer = AutoTokenizer.from_pretrained(repo_id, config=AutoConfig.from_pretrained(repo_id))\n", + " model = AutoModel.from_pretrained(model_dir)\n", + " clip_pipeline = pipeline(model=model, task=\"zero-shot-image-classification\", tokenizer=tokenizer,\n", + " device=1, image_processor=image_processor, config=AutoConfig.from_pretrained(model_dir))\n", + "\n", + " all_predictions = []\n", + " all_true_labels = []\n", + "\n", + " # Process images in batches\n", + " for batch_paths, batch_labels in zip(chunks(image_paths, BATCH_SIZE), chunks(labels, BATCH_SIZE)):\n", + " batch_images = []\n", + " valid_labels = []\n", + " for path, label in zip(batch_paths, batch_labels):\n", + " try:\n", + " batch_images.append(Image.open(path))\n", + " valid_labels.append(label)\n", + " except (FileNotFoundError, UnidentifiedImageError):\n", + " continue # Skip images that cannot be opened\n", + "\n", + " # Get predictions for the batch of images\n", + " predictions = clip_pipeline(images=batch_images, candidate_labels=valid_labels)\n", + " predicted_labels = [pred[0]['label'] for pred in predictions] # Top prediction\n", + "\n", + " all_predictions.extend(predicted_labels)\n", + " all_true_labels.extend(valid_labels)\n", + "\n", + "\n", + " # Calculate accuracy\n", + " correct_predictions = sum([true == pred for true, pred in zip(all_true_labels, all_predictions)])\n", + " accuracy = correct_predictions / len(all_true_labels)\n", + " print(f\"Accuracy for model on fold {fold} with threshold {threshold:.2f}: {accuracy * 100:.2f}%\\n\")\n", + "\n", + " # Calculate the accuracy for each category\n", + " category_accuracies = calculate_category_accuracy(all_true_labels, all_predictions)\n", + " for category, acc in 
category_accuracies.items():\n", + " results.append({\n", + " 'Fold': fold,\n", + " 'Threshold': f\">{threshold:.2f}\",\n", + " 'Category': category,\n", + " 'Accuracy': acc\n", + " })\n", + " print(\"--\\nDONE\\n\")\n", + "\n", + "# Create DataFrame from results and save to CSV\n", + "results_df = pd.DataFrame(results)\n", + "results_df.to_csv('./result/ours_results.csv', index=False)\n", + "print(\"All results saved to ./result/ours_results.csv\")\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "17b5b940", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_random80_vtrain1...\n", + "Accuracy for model 1: 97.08%\n", + "\n", + "Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_random80_vtrain2...\n", + "Accuracy for model 2: 97.17%\n", + "\n", + "Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_random80_vtrain3...\n", + "Accuracy for model 3: 98.20%\n", + "\n", + "Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_random80_vtrain4...\n", + "Accuracy for model 4: 96.79%\n", + "\n", + "Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_random80_vtrain5...\n", + "Accuracy for model 5: 96.75%\n", + "\n", + "Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_random80_vtrain6...\n", + "Accuracy for model 6: 96.33%\n", + "\n", + "Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_random80_vtrain7...\n", + "Accuracy for model 7: 95.81%\n", + "\n", + "Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_random80_vtrain8...\n", + "Accuracy for model 8: 94.56%\n", + "\n", + "Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_random80_vtrain9...\n", + "Accuracy for model 9: 92.93%\n", + "\n" + ] + } + ], + "source": [ + "import
os\n", + "import json\n", + "from transformers import pipeline, AutoTokenizer, AutoConfig, AutoModel, AutoImageProcessor\n", + "from PIL import Image, UnidentifiedImageError\n", + "import matplotlib.pyplot as plt\n", + "from collections import Counter\n", + "import numpy as np\n", + "import pandas as pd\n", + "\n", + "def chunks(lst, n):\n", + " \"\"\"Yield successive n-sized chunks from lst.\"\"\"\n", + " for i in range(0, len(lst), n):\n", + " yield lst[i:i + n]\n", + "\n", + "def calculate_category_accuracy(true_labels, predicted_labels):\n", + " \"\"\"Calculate the accuracy for each category and return it as a dictionary.\"\"\"\n", + " accuracies = {}\n", + " true_labels_counter = Counter(true_labels)\n", + " correct_predictions_counter = Counter([true for true, pred in zip(true_labels, predicted_labels) if true == pred])\n", + " \n", + " for label in true_labels_counter:\n", + " accuracy = (correct_predictions_counter[label] / true_labels_counter[label]) if label in correct_predictions_counter else 0\n", + " accuracies[label] = accuracy\n", + " \n", + " return accuracies\n", + "\n", + "BATCH_SIZE = 128 # Adjust based on your available memory\n", + "results = []\n", + "\n", + "# Loop for each fold and each threshold\n", + "for fold in range(5):\n", + " for threshold in np.arange(0.90, 1.00, 0.01):\n", + " model_dir = f\"./workspace/output/laion-finetuned_v5e7_epoch10_fold{fold}_threshold{int(threshold*100)}\"\n", + " test_json = f'val_fold_{fold}.json' # Test JSON file for the current fold\n", + " print(f\"Evaluating with model from {model_dir}...\")\n", + "\n", + " # Load the JSON data for testing\n", + " with open(test_json, 'r') as f:\n", + " data = [json.loads(line) for line in f]\n", + "\n", + " # Extract image paths and labels\n", + " image_paths = [item['image'] for item in data]\n", + " labels = [item['caption'] for item in data]\n", + "\n", + " # Initialize model components\n", + " image_processor = AutoImageProcessor.from_pretrained(model_dir)\n", + " 
tokenizer = AutoTokenizer.from_pretrained(model_dir, config=AutoConfig.from_pretrained(model_dir))\n", + " model = AutoModel.from_pretrained(model_dir)\n", + " clip_pipeline = pipeline(model=model, task=\"zero-shot-image-classification\", tokenizer=tokenizer,\n", + " device=1, image_processor=image_processor, config=AutoConfig.from_pretrained(model_dir))\n", + "\n", + " all_predictions = []\n", + " all_true_labels = []\n", + "\n", + " # Process images in batches\n", + " for batch_paths, batch_labels in zip(chunks(image_paths, BATCH_SIZE), chunks(labels, BATCH_SIZE)):\n", + " batch_images = []\n", + " valid_labels = []\n", + " for path, label in zip(batch_paths, batch_labels):\n", + " try:\n", + " batch_images.append(Image.open(path))\n", + " valid_labels.append(label)\n", + " except (FileNotFoundError, UnidentifiedImageError):\n", + " continue # Skip images that cannot be opened\n", + "\n", + " # Get predictions for the batch of images\n", + " predictions = clip_pipeline(images=batch_images, candidate_labels=valid_labels)\n", + " predicted_labels = [pred[0]['label'] for pred in predictions] # Top prediction\n", + "\n", + " all_predictions.extend(predicted_labels)\n", + " all_true_labels.extend(valid_labels)\n", + "\n", + " # Calculate accuracy\n", + " correct_predictions = sum([true == pred for true, pred in zip(all_true_labels, all_predictions)])\n", + " accuracy = correct_predictions / len(all_true_labels)\n", + " print(f\"Accuracy for model on fold {fold} with threshold {threshold:.2f}: {accuracy * 100:.2f}%\\n\")\n", + "\n", + " # Calculate the accuracy for each category\n", + " category_accuracies = calculate_category_accuracy(all_true_labels, all_predictions)\n", + " for category, acc in category_accuracies.items():\n", + " results.append({\n", + " 'Fold': fold,\n", + " 'Threshold': f\">{threshold:.2f}\",\n", + " 'Category': category,\n", + " 'Accuracy': acc\n", + " })\n", + " print(\"--\\nDONE\\n\")\n", + "\n", + "# Create DataFrame from results and save to 
CSV\n", + "results_df = pd.DataFrame(results)\n", + "results_df.to_csv('./result/ours_results.csv', index=False)\n", + "print(\"All results saved to ./result/ours_results.csv\")\n", + "\n", + "# TODO: switch to the original test set and store every prediction in a DataFrame so a confusion matrix can be generated later" + ] + }, + { + "cell_type": "markdown", + "id": "ff6b001a", + "metadata": {}, + "source": [ + "# Read data directly from CSV, using samples with max_value == 0.95 as the test set" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "a72455d1", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import json\n", + "\n", + "def prepare_data_from_dataframe(df: pd.DataFrame, output_file: str) -> str:\n", + " \"\"\"Prepare data for finetuning by reading from a DataFrame.\"\"\"\n", + " data = []\n", + "\n", + " # Process data\n", + " for _, row in df.iterrows():\n", + " image_path = f\"../data/{row['max_key']}/{row['path'].split('/')[-1]}\"\n", + " data.append({\"image\": image_path, \"caption\": row['max_key']})\n", + "\n", + " # Save the data in JSON format\n", + " with open(output_file, \"w\") as f:\n", + " for item in data:\n", + " json.dump(item, f)\n", + " f.write(\"\\n\")\n", + "\n", + " return output_file\n", + "\n", + "\n", + "# Load the CSV file\n", + "df = pd.read_csv('labels.csv')\n", + "df = df[df['max_key'] != 'error']\n", + "df = df[df['max_key'] != 'a photo of other indoor space: not kitchen, not bathroom, not living room, not dining room, not foyer']\n", + "df = df[df['max_key'] != 'it is a artificial photo']\n", + "df = df[df['max_key'] != 'a photo of outdoor space']\n", + "\n", + "# Hold out samples with max_value == 0.95 as the test set; the remaining samples with max_value > 0.9 go to threshold_df\n", + "test_df = df[df['max_value'] == 0.95]\n", + "threshold_df = df[(df['max_value'] > 0.9) & (~df.index.isin(test_df.index))]\n", + "\n", + "\n", + "test_json = prepare_data_from_dataframe(test_df, 'val.json')\n" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "5102624a", + "metadata": { + "scrolled": true + }, + "outputs": [ + { +
"name": "stderr", + "output_type": "stream", + "text": [ + "<>:11: SyntaxWarning: \"is\" with a literal. Did you mean \"==\"?\n", + "<>:11: SyntaxWarning: \"is\" with a literal. Did you mean \"==\"?\n", + "/tmp/ipykernel_3654472/3070016425.py:11: SyntaxWarning: \"is\" with a literal. Did you mean \"==\"?\n", + " if threshold is 0.95:\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Finetuning laion/CLIP-ViT-B-32-laion2B-s34B-b79K for threshold > 0.91, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_vtrain1.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "3e501fe53c074a57b1aa32c25bdaf992", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading data files: 0%| | 0/2 [00:00.filter_corrupt_images at 0x7feef8dc4d30> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n", + "03/22/2024 11:54:49 - WARNING - datasets.fingerprint - Parameter 'function'=.filter_corrupt_images at 0x7feef8dc4d30> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. 
Subsequent hashing failures won't be showed.\n", + "Filter: 100%|███████████████████| 19011/19011 [00:00<00:00, 28447.70 examples/s]\n", + "Running tokenizer on train dataset: 100%|█| 18786/18786 [00:00<00:00, 20427.73 e\n", + " 0%| | 0/480 [00:00 0.92, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_vtrain2.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "79dc32649f2546aabe5e2aff055467fa", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading data files: 0%| | 0/2 [00:00.filter_corrupt_images at 0x7f6f17050d30> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n", + "03/22/2024 13:19:31 - WARNING - datasets.fingerprint - Parameter 'function'=.filter_corrupt_images at 0x7f6f17050d30> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. 
Subsequent hashing failures won't be showed.\n", + "Filter: 100%|███████████████████| 17132/17132 [00:00<00:00, 28605.44 examples/s]\n", + "Running tokenizer on train dataset: 100%|█| 16929/16929 [00:00<00:00, 21019.66 e\n", + " 0%| | 0/440 [00:00 0.93, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_vtrain3.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "a644eaf4ab2d404d9868baa49668d2d6", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading data files: 0%| | 0/2 [00:00.filter_corrupt_images at 0x7f245c0a4d30> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n", + "03/22/2024 14:36:54 - WARNING - datasets.fingerprint - Parameter 'function'=.filter_corrupt_images at 0x7f245c0a4d30> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. 
Subsequent hashing failures won't be showed.\n", + "Filter: 100%|███████████████████| 15134/15134 [00:00<00:00, 28682.00 examples/s]\n", + "Running tokenizer on train dataset: 100%|█| 14956/14956 [00:00<00:00, 20926.65 e\n", + " 0%| | 0/380 [00:00 0.94, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_vtrain4.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "a4d09a8ec1c04a0fb7c96183298b3033", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading data files: 0%| | 0/2 [00:00.filter_corrupt_images at 0x7fe886510d30> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n", + "03/22/2024 15:43:50 - WARNING - datasets.fingerprint - Parameter 'function'=.filter_corrupt_images at 0x7fe886510d30> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. 
Subsequent hashing failures won't be showed.\n", + "Filter: 100%|███████████████████| 13150/13150 [00:00<00:00, 28607.09 examples/s]\n", + "Running tokenizer on train dataset: 100%|█| 12995/12995 [00:00<00:00, 20773.55 e\n", + " 0%| | 0/330 [00:00 0.95, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_vtrain5.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "2681834cdcfc415583a4608cc9137d36", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading data files: 0%| | 0/2 [00:00.filter_corrupt_images at 0x7f11b826cd30> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n", + "03/22/2024 16:42:25 - WARNING - datasets.fingerprint - Parameter 'function'=.filter_corrupt_images at 0x7f11b826cd30> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. 
Subsequent hashing failures won't be showed.\n", + "Filter: 100%|███████████████████| 13150/13150 [00:00<00:00, 28708.52 examples/s]\n", + "Running tokenizer on train dataset: 100%|█| 12995/12995 [00:00<00:00, 20854.43 e\n", + " 0%| | 0/330 [00:00 0.96, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_vtrain6.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "34e2973f4e48490da57038a3fa3fe5eb", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading data files: 0%| | 0/2 [00:00.filter_corrupt_images at 0x7fb0cc32cd30> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n", + "03/22/2024 17:41:58 - WARNING - datasets.fingerprint - Parameter 'function'=.filter_corrupt_images at 0x7fb0cc32cd30> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. 
Subsequent hashing failures won't be showed.\n", + "Filter: 100%|███████████████████| 10782/10782 [00:00<00:00, 28008.46 examples/s]\n", + "Running tokenizer on train dataset: 100%|█| 10658/10658 [00:00<00:00, 20642.58 e\n", + " 0%| | 0/270 [00:00 0.97, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_vtrain7.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "0fa31178a7ea4a37a0a2ac2b95259224", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading data files: 0%| | 0/2 [00:00.filter_corrupt_images at 0x7fb3af730d30> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n", + "03/22/2024 18:30:28 - WARNING - datasets.fingerprint - Parameter 'function'=.filter_corrupt_images at 0x7fb3af730d30> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. 
Subsequent hashing failures won't be showed.\n", + "Filter: 100%|█████████████████████| 8144/8144 [00:00<00:00, 27990.81 examples/s]\n", + "Running tokenizer on train dataset: 100%|█| 8047/8047 [00:00<00:00, 20525.77 exa\n", + " 0%| | 0/200 [00:00 0.98, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_vtrain8.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "ad633b6ee9284399a1cf4e0388677a51", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading data files: 0%| | 0/2 [00:00.filter_corrupt_images at 0x7f3a009b0d30> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n", + "03/22/2024 19:06:43 - WARNING - datasets.fingerprint - Parameter 'function'=.filter_corrupt_images at 0x7f3a009b0d30> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. 
Subsequent hashing failures won't be showed.\n", + "Filter: 100%|█████████████████████| 5197/5197 [00:00<00:00, 27343.36 examples/s]\n", + "Running tokenizer on train dataset: 100%|█| 5150/5150 [00:00<00:00, 20052.73 exa\n", + " 0%| | 0/130 [00:00 0.99, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_vtrain9.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "1227eb23eba3473daa795fad84ad48d5", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading data files: 0%| | 0/2 [00:00.filter_corrupt_images at 0x7f0abc714d30> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n", + "03/22/2024 19:30:26 - WARNING - datasets.fingerprint - Parameter 'function'=.filter_corrupt_images at 0x7f0abc714d30> of the transform datasets.arrow_dataset.Dataset.filter@2.0.1 couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. 
Subsequent hashing failures won't be showed.\n", + "Filter: 100%|█████████████████████| 1731/1731 [00:00<00:00, 25087.04 examples/s]\n", + "Running tokenizer on train dataset: 100%|█| 1715/1715 [00:00<00:00, 20083.96 exa\n", + " 0%| | 0/40 [00:00 threshold]\n", + " threshold_df = threshold_df[threshold_df['max_value'] != 0.95]\n", + " train_json = f'train{i}.json'\n", + " prepare_data_from_dataframe(threshold_df, train_json)\n", + "\n", + " output_folder = f\"./workspace/output/laion-finetuned_v5e7_epoch10_vtrain{i}\"\n", + " print(f\"Finetuning {repo_id} for threshold > {threshold:.2f}, saving output to {output_folder}.\")\n", + " data_files = {'train': train_json, 'validation': test_json}\n", + " dataset = load_dataset(\"json\", data_files=data_files)\n", + " print(f\"first image: {dataset['validation'][0]['image']}, caption: '{dataset['validation'][0]['caption']}'\")\n", + "\n", + " !python huggingface_finetune_clip.py \\\n", + " --output_dir {output_folder} --model_name_or_path {repo_id} \\\n", + " --train_file {train_json} \\\n", + " --validation_file {test_json} \\\n", + " --image_column image \\\n", + " --overwrite_output_dir=True \\\n", + " --max_seq_length=77 \\\n", + " --num_train_epochs=10 \\\n", + " --save_total_limit=5 \\\n", + " --caption_column caption \\\n", + " --remove_unused_columns=False \\\n", + " --do_train \\\n", + " --logging_strategy=\"epoch\"\\\n", + " --per_device_train_batch_size=128 \\\n", + " --dataloader_drop_last=True\\\n", + " --learning_rate=\"1e-6\" --warmup_steps=\"0\" --weight_decay 0.1 \n", + " print(f\"--\\nDONE. 
If it worked, trained data should be in {output_folder}\\n\")" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "68fd9c3a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_vtrain1...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/haojin/anaconda3/envs/huggingface/lib/python3.10/site-packages/transformers/pipelines/base.py:1123: UserWarning: You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy for model 1: 96.96%\n", + "\n", + "Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_vtrain2...\n", + "Accuracy for model 2: 97.10%\n", + "\n", + "Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_vtrain3...\n", + "Accuracy for model 3: 96.72%\n", + "\n", + "Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_vtrain4...\n", + "Accuracy for model 4: 96.40%\n", + "\n", + "Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_vtrain5...\n", + "Accuracy for model 5: 96.40%\n", + "\n", + "Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_vtrain6...\n", + "Accuracy for model 6: 95.84%\n", + "\n", + "Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_vtrain7...\n", + "Accuracy for model 7: 95.79%\n", + "\n", + "Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_vtrain8...\n", + "Accuracy for model 8: 94.62%\n", + "\n", + "Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_vtrain9...\n", + "Accuracy for model 9: 93.12%\n", + "\n" + ] + } + ], + "source": [ + "# 95\n", + "import os\n", + "import json\n", + "from transformers import pipeline, AutoTokenizer, 
AutoConfig, AutoModel, AutoImageProcessor\n", + "from PIL import Image, UnidentifiedImageError\n", + "import matplotlib.pyplot as plt\n", + "\n", + "def chunks(lst, n):\n", + " \"\"\"Yield successive n-sized chunks from lst.\"\"\"\n", + " for i in range(0, len(lst), n):\n", + " yield lst[i:i + n]\n", + "\n", + "# Load the JSON data\n", + "with open('val.json', 'r') as f:\n", + " data = [json.loads(line) for line in f]\n", + "\n", + "# Extract image paths and labels\n", + "image_paths = [item['image'] for item in data]\n", + "labels = [item['caption'] for item in data]\n", + "\n", + "BATCH_SIZE = 128 # Adjust based on your available memory\n", + "repo_id = \"laion/CLIP-ViT-B-32-laion2B-s34B-b79K\"\n", + "\n", + "# Loop for different finetuned models\n", + "for i in range(1, 10): # Assuming you have 9 finetuned models\n", + " dir = f\"./workspace/output/laion-finetuned_v5e7_epoch10_vtrain{i}\"\n", + " print(f\"Evaluating with model from {dir}...\")\n", + "\n", + " image_processor = AutoImageProcessor.from_pretrained(repo_id)\n", + " tokenizer = AutoTokenizer.from_pretrained(repo_id, config=AutoConfig.from_pretrained(repo_id))\n", + " model = AutoModel.from_pretrained(dir)\n", + " clip_pipeline = pipeline(model=model, task=\"zero-shot-image-classification\", tokenizer=tokenizer,\n", + " device=1, image_processor=image_processor, config=AutoConfig.from_pretrained(dir))\n", + "\n", + " all_predictions = []\n", + " all_true_labels = []\n", + "\n", + " for batch_paths, batch_labels in zip(chunks(image_paths, BATCH_SIZE), chunks(labels, BATCH_SIZE)):\n", + " batch_images = []\n", + " valid_labels = []\n", + " for path, label in zip(batch_paths, batch_labels):\n", + " try:\n", + " batch_images.append(Image.open(path))\n", + " valid_labels.append(label)\n", + " except (FileNotFoundError, UnidentifiedImageError):\n", + " continue # Skip images that cannot be opened\n", + "\n", + " # Get predictions for the batch of images\n", + " predictions = 
clip_pipeline(images=batch_images, candidate_labels=valid_labels)\n", + " predicted_labels = [pred[0]['label'] for pred in predictions] # Top prediction\n", + "\n", + " all_predictions.extend(predicted_labels)\n", + " all_true_labels.extend(valid_labels)\n", + "\n", + " correct_predictions = sum([true == pred for true, pred in zip(all_true_labels, all_predictions)])\n", + " accuracy = correct_predictions / len(all_true_labels)\n", + " print(f\"Accuracy for model {i}: {accuracy * 100:.2f}%\\n\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "a363651a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy for model 9: 83.15%\n", + "\n" + ] + } + ], + "source": [ + "import os\n", + "import json\n", + "from transformers import pipeline, AutoTokenizer, AutoConfig, AutoModel, AutoImageProcessor\n", + "from PIL import Image, UnidentifiedImageError\n", + "import matplotlib.pyplot as plt\n", + "\n", + "def chunks(lst, n):\n", + " \"\"\"Yield successive n-sized chunks from lst.\"\"\"\n", + " for i in range(0, len(lst), n):\n", + " yield lst[i:i + n]\n", + "\n", + "# Load the JSON data\n", + "with open('val.json', 'r') as f:\n", + " data = [json.loads(line) for line in f]\n", + "\n", + "# Extract image paths and labels\n", + "image_paths = [item['image'] for item in data]\n", + "labels = [item['caption'] for item in data]\n", + "\n", + "BATCH_SIZE = 128 # Adjust based on your available memory\n", + "repo_id = \"laion/CLIP-ViT-B-32-laion2B-s34B-b79K\"\n", + "\n", + "\n", + "image_processor = AutoImageProcessor.from_pretrained(repo_id)\n", + "tokenizer = AutoTokenizer.from_pretrained(repo_id, config=AutoConfig.from_pretrained(repo_id))\n", + "model = AutoModel.from_pretrained(repo_id)\n", + "clip_pipeline = pipeline(model=model, task=\"zero-shot-image-classification\", tokenizer=tokenizer,\n", + " device=1, image_processor=image_processor, config=AutoConfig.from_pretrained(repo_id))\n", + "\n", + 
"all_predictions = []\n", + "all_true_labels = []\n", + "\n", + "for batch_paths, batch_labels in zip(chunks(image_paths, BATCH_SIZE), chunks(labels, BATCH_SIZE)):\n", + " batch_images = []\n", + " valid_labels = []\n", + " for path, label in zip(batch_paths, batch_labels):\n", + " try:\n", + " batch_images.append(Image.open(path))\n", + " valid_labels.append(label)\n", + " except (FileNotFoundError, UnidentifiedImageError):\n", + " continue # Skip images that cannot be opened\n", + "\n", + " # Get predictions for the batch of images\n", + " predictions = clip_pipeline(images=batch_images, candidate_labels=valid_labels)\n", + " predicted_labels = [pred[0]['label'] for pred in predictions] # Top prediction\n", + "\n", + " all_predictions.extend(predicted_labels)\n", + " all_true_labels.extend(valid_labels)\n", + "\n", + "correct_predictions = sum([true == pred for true, pred in zip(all_true_labels, all_predictions)])\n", + "accuracy = correct_predictions / len(all_true_labels)\n", + "print(f\"Accuracy for model org: {accuracy * 100:.2f}%\\n\")" + ] + }, + { + "cell_type": "markdown", + "id": "888c35be", + "metadata": {}, + "source": [ + "# test on five room task" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "id": "5c21aea1", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "ee89e02ab2b64a0d9b80920bd80c9d89", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading data files: 0%| | 0/2 [00:00 dict:\n", + " \"\"\"Collect images from a specified directory and group them by label.\"\"\"\n", + " images_per_label = defaultdict(list)\n", + " \n", + " subfolders = [d for d in os.listdir(directory) if os.path.isdir(os.path.join(directory, d))]\n", + " for subfolder in subfolders:\n", + " label = subfolder\n", + " subfolder_path = os.path.join(directory, subfolder)\n", + " for filename in os.listdir(subfolder_path):\n", + " if filename.endswith('.jpg') or 
filename.endswith('.png'):\n", + " images_per_label[label].append(os.path.join(subfolder_path, filename))\n", + " \n", + " return images_per_label\n", + "\n", + "def prepare_data_for_finetuning(train_dir: str, test_dir: str) -> tuple:\n", + " \"\"\"Prepare data for finetuning by reading images from specified train and test directories.\"\"\"\n", + " train_data = []\n", + " val_data = []\n", + "\n", + " # Collect training images\n", + " train_images_per_label = collect_images_from_directory(train_dir)\n", + " for label, images in train_images_per_label.items():\n", + " train_data.extend([{\"image\": img, \"caption\": label} for img in images])\n", + "\n", + " # Collect testing images\n", + " test_images_per_label = collect_images_from_directory(test_dir)\n", + " for label, images in test_images_per_label.items():\n", + " val_data.extend([{\"image\": img, \"caption\": label} for img in images])\n", + "\n", + " # Save the data in JSON format in the code directory\n", + " train_file = \"room_train.json\"\n", + " val_file = \"room_val.json\"\n", + " with open(train_file, \"w\") as f:\n", + " for item in train_data:\n", + " json.dump(item, f)\n", + " f.write(\"\\n\")\n", + " with open(val_file, \"w\") as f:\n", + " for item in val_data:\n", + " json.dump(item, f)\n", + " f.write(\"\\n\")\n", + " \n", + " return train_file, val_file\n", + "\n", + "# Usage:\n", + "train_json, test_json = prepare_data_for_finetuning(\"../room_5\", \"../room_5\")\n", + "data_files = {'train': train_json, 'validation': test_json}\n", + "\n", + "# test loading it back in\n", + "\n", + "dataset = load_dataset(\"json\", data_files=data_files)\n", + "print(f\"first image: {dataset['validation'][0]['image']}, caption: '{dataset['validation'][0]['caption']}'\")\n", + "\n", + "print(sys.executable)\n", + "!which pip3\n", + "dataset['validation']" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "e5ce841f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": 
"stream", + "text": [ + "Accuracy for model: 99.97%\n", + "\n" + ] + } + ], + "source": [ + "# test on original ckpt\n", + "import os\n", + "import json\n", + "from transformers import pipeline, AutoTokenizer, AutoConfig, AutoModel, AutoImageProcessor\n", + "from PIL import Image, UnidentifiedImageError\n", + "import matplotlib.pyplot as plt\n", + "\n", + "def chunks(lst, n):\n", + " \"\"\"Yield successive n-sized chunks from lst.\"\"\"\n", + " for i in range(0, len(lst), n):\n", + " yield lst[i:i + n]\n", + "\n", + "# Load the JSON data\n", + "with open('room_val.json', 'r') as f:\n", + " data = [json.loads(line) for line in f]\n", + "\n", + "# Extract image paths and labels\n", + "image_paths = [item['image'] for item in data]\n", + "labels = [item['caption'] for item in data]\n", + "\n", + "BATCH_SIZE = 128 # Adjust based on your available memory\n", + "repo_id = \"laion/CLIP-ViT-B-32-laion2B-s34B-b79K\"\n", + "\n", + "\n", + "image_processor = AutoImageProcessor.from_pretrained(repo_id)\n", + "tokenizer = AutoTokenizer.from_pretrained(repo_id, config=AutoConfig.from_pretrained(repo_id))\n", + "model = AutoModel.from_pretrained(repo_id)\n", + "clip_pipeline = pipeline(model=model, task=\"zero-shot-image-classification\", tokenizer=tokenizer,\n", + " device=1, image_processor=image_processor, config=AutoConfig.from_pretrained(repo_id))\n", + "\n", + "all_predictions = []\n", + "all_true_labels = []\n", + "\n", + "for batch_paths, batch_labels in zip(chunks(image_paths, BATCH_SIZE), chunks(labels, BATCH_SIZE)):\n", + " batch_images = []\n", + " valid_labels = []\n", + " for path, label in zip(batch_paths, batch_labels):\n", + " try:\n", + " batch_images.append(Image.open(path))\n", + " valid_labels.append(label)\n", + " except (FileNotFoundError, UnidentifiedImageError):\n", + " continue # Skip images that cannot be opened\n", + "\n", + " # Get predictions for the batch of images\n", + " predictions = clip_pipeline(images=batch_images, 
candidate_labels=valid_labels)\n", + " predicted_labels = [pred[0]['label'] for pred in predictions] # Top prediction\n", + "\n", + " all_predictions.extend(predicted_labels)\n", + " all_true_labels.extend(valid_labels)\n", + "\n", + "correct_predictions = sum([true == pred for true, pred in zip(all_true_labels, all_predictions)])\n", + "accuracy = correct_predictions / len(all_true_labels)\n", + "print(f\"Accuracy for model: {accuracy * 100:.2f}%\\n\")" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "id": "fb17bda0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_vtrain1...\n", + "Accuracy for model 1: 99.96%\n", + "\n", + "Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_vtrain2...\n", + "Accuracy for model 2: 99.96%\n", + "\n", + "Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_vtrain3...\n", + "Accuracy for model 3: 99.96%\n", + "\n", + "Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_vtrain4...\n", + "Accuracy for model 4: 99.96%\n", + "\n", + "Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_vtrain5...\n", + "Accuracy for model 5: 99.96%\n", + "\n", + "Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_vtrain6...\n", + "Accuracy for model 6: 99.96%\n", + "\n", + "Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_vtrain7...\n", + "Accuracy for model 7: 99.97%\n", + "\n", + "Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_vtrain8...\n", + "Accuracy for model 8: 99.97%\n", + "\n", + "Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_vtrain9...\n", + "Accuracy for model 9: 99.97%\n", + "\n" + ] + } + ], + "source": [ + "import os\n", + "import json\n", + "from transformers import pipeline, AutoTokenizer, AutoConfig, 
AutoModel, AutoImageProcessor\n", + "from PIL import Image, UnidentifiedImageError\n", + "import matplotlib.pyplot as plt\n", + "\n", + "def chunks(lst, n):\n", + " \"\"\"Yield successive n-sized chunks from lst.\"\"\"\n", + " for i in range(0, len(lst), n):\n", + " yield lst[i:i + n]\n", + "\n", + "# Load the JSON data\n", + "with open('room_val.json', 'r') as f:\n", + " data = [json.loads(line) for line in f]\n", + "\n", + "# Extract image paths and labels\n", + "image_paths = [item['image'] for item in data]\n", + "labels = [item['caption'] for item in data]\n", + "\n", + "BATCH_SIZE = 128 # Adjust based on your available memory\n", + "repo_id = \"laion/CLIP-ViT-B-32-laion2B-s34B-b79K\"\n", + "\n", + "# Loop for different finetuned models\n", + "for i in range(1, 10): # Assuming you have 9 finetuned models\n", + " dir = f\"./workspace/output/laion-finetuned_v5e7_epoch10_vtrain{i}\"\n", + " print(f\"Evaluating with model from {dir}...\")\n", + "\n", + " image_processor = AutoImageProcessor.from_pretrained(repo_id)\n", + " tokenizer = AutoTokenizer.from_pretrained(repo_id, config=AutoConfig.from_pretrained(repo_id))\n", + " model = AutoModel.from_pretrained(dir)\n", + " clip_pipeline = pipeline(model=model, task=\"zero-shot-image-classification\", tokenizer=tokenizer,\n", + " device=1, image_processor=image_processor, config=AutoConfig.from_pretrained(dir))\n", + "\n", + " all_predictions = []\n", + " all_true_labels = []\n", + "\n", + " for batch_paths, batch_labels in zip(chunks(image_paths, BATCH_SIZE), chunks(labels, BATCH_SIZE)):\n", + " batch_images = []\n", + " valid_labels = []\n", + " for path, label in zip(batch_paths, batch_labels):\n", + " try:\n", + " batch_images.append(Image.open(path))\n", + " valid_labels.append(label)\n", + " except (FileNotFoundError, UnidentifiedImageError):\n", + " continue # Skip images that cannot be opened\n", + "\n", + " # Get predictions for the batch of images\n", + " predictions = clip_pipeline(images=batch_images, 
candidate_labels=valid_labels)\n", + " predicted_labels = [pred[0]['label'] for pred in predictions] # Top prediction\n", + "\n", + " all_predictions.extend(predicted_labels)\n", + " all_true_labels.extend(valid_labels)\n", + "\n", + " correct_predictions = sum([true == pred for true, pred in zip(all_true_labels, all_predictions)])\n", + " accuracy = correct_predictions / len(all_true_labels)\n", + " print(f\"Accuracy for model {i}: {accuracy * 100:.2f}%\\n\")\n" + ] + }, + { + "cell_type": "markdown", + "id": "5952751e", + "metadata": {}, + "source": [ + "# test on orginal dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "0e4e3088", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "83c174061bcb48929db0ce2f479221a4", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading data files: 0%| | 0/2 [00:00 dict:\n", + " \"\"\"Collect images from a specified directory and group them by label.\"\"\"\n", + " images_per_label = defaultdict(list)\n", + " \n", + " subfolders = [d for d in os.listdir(directory) if os.path.isdir(os.path.join(directory, d))]\n", + " for subfolder in subfolders:\n", + " label = subfolder\n", + " subfolder_path = os.path.join(directory, subfolder)\n", + " for filename in os.listdir(subfolder_path):\n", + " if filename.endswith('.jpg') or filename.endswith('.png'):\n", + " images_per_label[label].append(os.path.join(subfolder_path, filename))\n", + " \n", + " return images_per_label\n", + "\n", + "def prepare_data_for_finetuning(train_dir: str, test_dir: str) -> tuple:\n", + " \"\"\"Prepare data for finetuning by reading images from specified train and test directories.\"\"\"\n", + " train_data = []\n", + " val_data = []\n", + "\n", + " # Collect training images\n", + " train_images_per_label = collect_images_from_directory(train_dir)\n", + " for label, images in train_images_per_label.items():\n", + " train_data.extend([{\"image\": 
img, \"caption\": label} for img in images])\n", + "\n", + " # Collect testing images\n", + " test_images_per_label = collect_images_from_directory(test_dir)\n", + " for label, images in test_images_per_label.items():\n", + " val_data.extend([{\"image\": img, \"caption\": label} for img in images])\n", + "\n", + " # Save the data in JSON format in the code directory\n", + " train_file = \"kb_train.json\"\n", + " val_file = \"kb_val.json\"\n", + " with open(train_file, \"w\") as f:\n", + " for item in train_data:\n", + " json.dump(item, f)\n", + " f.write(\"\\n\")\n", + " with open(val_file, \"w\") as f:\n", + " for item in val_data:\n", + " json.dump(item, f)\n", + " f.write(\"\\n\")\n", + " \n", + " return train_file, val_file\n", + "\n", + "# Usage:\n", + "train_json, test_json = prepare_data_for_finetuning(\"../class_4\", \"../class_4\")\n", + "data_files = {'train': train_json, 'validation': test_json}\n", + "\n", + "# test loading it back in\n", + "\n", + "dataset = load_dataset(\"json\", data_files=data_files)\n", + "print(f\"first image: {dataset['validation'][0]['image']}, caption: '{dataset['validation'][0]['caption']}'\")\n", + "\n", + "print(sys.executable)\n", + "!which pip3\n", + "dataset['validation']" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "115d5794", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration. Please open a PR/issue to update `preprocessor_config.json` to use `image_processor_type` instead of `feature_extractor_type`. 
This warning will be removed in v4.40.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy for model: 99.87%\n", + "\n" + ] + } + ], + "source": [ + "# test on original ckpt\n", + "import os\n", + "import json\n", + "from transformers import pipeline, AutoTokenizer, AutoConfig, AutoModel, AutoImageProcessor\n", + "from PIL import Image, UnidentifiedImageError\n", + "import matplotlib.pyplot as plt\n", + "\n", + "def chunks(lst, n):\n", + " \"\"\"Yield successive n-sized chunks from lst.\"\"\"\n", + " for i in range(0, len(lst), n):\n", + " yield lst[i:i + n]\n", + "\n", + "# Load the JSON data\n", + "with open('kb_val.json', 'r') as f:\n", + " data = [json.loads(line) for line in f]\n", + "\n", + "# Extract image paths and labels\n", + "image_paths = [item['image'] for item in data]\n", + "labels = [item['caption'] for item in data]\n", + "\n", + "BATCH_SIZE = 128 # Adjust based on your available memory\n", + "repo_id = \"laion/CLIP-ViT-B-32-laion2B-s34B-b79K\"\n", + "\n", + "\n", + "image_processor = AutoImageProcessor.from_pretrained(repo_id)\n", + "tokenizer = AutoTokenizer.from_pretrained(repo_id, config=AutoConfig.from_pretrained(repo_id))\n", + "model = AutoModel.from_pretrained(repo_id)\n", + "clip_pipeline = pipeline(model=model, task=\"zero-shot-image-classification\", tokenizer=tokenizer,\n", + " device=1, image_processor=image_processor, config=AutoConfig.from_pretrained(repo_id))\n", + "\n", + "all_predictions = []\n", + "all_true_labels = []\n", + "\n", + "for batch_paths, batch_labels in zip(chunks(image_paths, BATCH_SIZE), chunks(labels, BATCH_SIZE)):\n", + " batch_images = []\n", + " valid_labels = []\n", + " for path, label in zip(batch_paths, batch_labels):\n", + " try:\n", + " batch_images.append(Image.open(path))\n", + " valid_labels.append(label)\n", + " except (FileNotFoundError, UnidentifiedImageError):\n", + " continue # Skip images that cannot be opened\n", + "\n", + " # Get predictions for the batch of 
images\n", + " predictions = clip_pipeline(images=batch_images, candidate_labels=valid_labels)\n", + " predicted_labels = [pred[0]['label'] for pred in predictions] # Top prediction\n", + "\n", + " all_predictions.extend(predicted_labels)\n", + " all_true_labels.extend(valid_labels)\n", + "\n", + "correct_predictions = sum([true == pred for true, pred in zip(all_true_labels, all_predictions)])\n", + "accuracy = correct_predictions / len(all_true_labels)\n", + "print(f\"Accuracy for model: {accuracy * 100:.2f}%\\n\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "3050a6d1", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration. Please open a PR/issue to update `preprocessor_config.json` to use `image_processor_type` instead of `feature_extractor_type`. This warning will be removed in v4.40.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy for class 'a photo of standard bathroom': 100.00%\n", + "Accuracy for class 'a photo of standard kitchen': 100.00%\n", + "Accuracy for class 'a photo of contemporary bathroom': 99.15%\n", + "Accuracy for class 'a photo of contemporary kitchen': 100.00%\n", + "\n", + "Mean accuracy over all classes: 99.79%\n", + "\n" + ] + } + ], + "source": [ + "# accuracy per class\n", + "import os\n", + "import json\n", + "from transformers import pipeline, AutoTokenizer, AutoConfig, AutoModel, AutoImageProcessor\n", + "from PIL import Image, UnidentifiedImageError\n", + "import matplotlib.pyplot as plt\n", + "\n", + "def chunks(lst, n):\n", + " \"\"\"Yield successive n-sized chunks from lst.\"\"\"\n", + " for i in range(0, len(lst), n):\n", + " yield lst[i:i + n]\n", + "\n", + "# Load the JSON data\n", + "with open('kb_val.json', 'r') as f:\n", + " data = [json.loads(line) 
for line in f]\n", + "\n", + "# Extract image paths and labels\n", + "image_paths = [item['image'] for item in data]\n", + "labels = [item['caption'] for item in data]\n", + "\n", + "BATCH_SIZE = 128 # Adjust based on your available memory\n", + "repo_id = \"laion/CLIP-ViT-B-32-laion2B-s34B-b79K\"\n", + "\n", + "image_processor = AutoImageProcessor.from_pretrained(repo_id)\n", + "tokenizer = AutoTokenizer.from_pretrained(repo_id, config=AutoConfig.from_pretrained(repo_id))\n", + "model = AutoModel.from_pretrained(repo_id)\n", + "clip_pipeline = pipeline(model=model, task=\"zero-shot-image-classification\", tokenizer=tokenizer,\n", + " device=1, image_processor=image_processor, config=AutoConfig.from_pretrained(repo_id))\n", + "\n", + "all_predictions = []\n", + "all_true_labels = []\n", + "\n", + "class_accuracy = {} # Dictionary to track accuracy per class\n", + "\n", + "for batch_paths, batch_labels in zip(chunks(image_paths, BATCH_SIZE), chunks(labels, BATCH_SIZE)):\n", + " batch_images = []\n", + " valid_labels = []\n", + " for path, label in zip(batch_paths, batch_labels):\n", + " try:\n", + " batch_images.append(Image.open(path))\n", + " valid_labels.append(label)\n", + " except (FileNotFoundError, UnidentifiedImageError):\n", + " continue # Skip images that cannot be opened\n", + "\n", + " # Get predictions for the batch of images\n", + " predictions = clip_pipeline(images=batch_images, candidate_labels=valid_labels)\n", + " predicted_labels = [pred[0]['label'] for pred in predictions] # Top prediction\n", + "\n", + " all_predictions.extend(predicted_labels)\n", + " all_true_labels.extend(valid_labels)\n", + "\n", + " # Update class accuracy counts\n", + " for true_label, predicted_label in zip(valid_labels, predicted_labels):\n", + " if true_label not in class_accuracy:\n", + " class_accuracy[true_label] = {'correct': 0, 'total': 0}\n", + " class_accuracy[true_label]['total'] += 1\n", + " if true_label == predicted_label:\n", + " 
class_accuracy[true_label]['correct'] += 1\n", + "\n", + "# Print accuracy per class and calculate mean accuracy\n", + "mean_accuracy = 0\n", + "for class_label, counts in class_accuracy.items():\n", + " class_acc = counts['correct'] / counts['total']\n", + " mean_accuracy += class_acc\n", + " print(f\"Accuracy for class '{class_label}': {class_acc * 100:.2f}%\")\n", + "mean_accuracy /= len(class_accuracy)\n", + "print(f\"\\nMean accuracy over all classes: {mean_accuracy * 100:.2f}%\\n\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "96073841", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration. Please open a PR/issue to update `preprocessor_config.json` to use `image_processor_type` instead of `feature_extractor_type`. This warning will be removed in v4.40.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy for class 'a photo of standard bathroom': 100.00%\n", + "Accuracy for class 'a photo of standard kitchen': 100.00%\n", + "Accuracy for class 'a photo of contemporary bathroom': 99.15%\n", + "Accuracy for class 'a photo of contemporary kitchen': 100.00%\n", + "\n", + "Mean accuracy over all classes: 99.79%\n", + "\n" + ] + } + ], + "source": [ + "# accuracy per class\n", + "import os\n", + "import json\n", + "from transformers import pipeline, AutoTokenizer, AutoConfig, AutoModel, AutoImageProcessor\n", + "from PIL import Image, UnidentifiedImageError\n", + "import matplotlib.pyplot as plt\n", + "\n", + "def chunks(lst, n):\n", + " \"\"\"Yield successive n-sized chunks from lst.\"\"\"\n", + " for i in range(0, len(lst), n):\n", + " yield lst[i:i + n]\n", + "\n", + "# Load the JSON data\n", + "with open('kb_val.json', 'r') as f:\n", + " data = [json.loads(line) for line in f]\n", + "\n", 
+ "# Extract image paths and labels\n", + "image_paths = [item['image'] for item in data]\n", + "labels = [item['caption'] for item in data]\n", + "\n", + "BATCH_SIZE = 128 # Adjust based on your available memory\n", + "repo_id = \"laion/CLIP-ViT-B-32-laion2B-s34B-b79K\"\n", + "dir = \"./workspace/output/laion-finetuned_v5e7_epoch10_vtrain2\"\n", + "image_processor = AutoImageProcessor.from_pretrained(repo_id)\n", + "tokenizer = AutoTokenizer.from_pretrained(repo_id, config=AutoConfig.from_pretrained(repo_id))\n", + "model = AutoModel.from_pretrained(dir)\n", + "clip_pipeline = pipeline(model=model, task=\"zero-shot-image-classification\", tokenizer=tokenizer,\n", + " device=1, image_processor=image_processor, config=AutoConfig.from_pretrained(dir))\n", + "\n", + "all_predictions = []\n", + "all_true_labels = []\n", + "\n", + "class_accuracy = {} # Dictionary to track accuracy per class\n", + "\n", + "for batch_paths, batch_labels in zip(chunks(image_paths, BATCH_SIZE), chunks(labels, BATCH_SIZE)):\n", + " batch_images = []\n", + " valid_labels = []\n", + " for path, label in zip(batch_paths, batch_labels):\n", + " try:\n", + " batch_images.append(Image.open(path))\n", + " valid_labels.append(label)\n", + " except (FileNotFoundError, UnidentifiedImageError):\n", + " continue # Skip images that cannot be opened\n", + "\n", + " # Get predictions for the batch of images\n", + " predictions = clip_pipeline(images=batch_images, candidate_labels=valid_labels)\n", + " predicted_labels = [pred[0]['label'] for pred in predictions] # Top prediction\n", + "\n", + " all_predictions.extend(predicted_labels)\n", + " all_true_labels.extend(valid_labels)\n", + "\n", + " # Update class accuracy counts\n", + " for true_label, predicted_label in zip(valid_labels, predicted_labels):\n", + " if true_label not in class_accuracy:\n", + " class_accuracy[true_label] = {'correct': 0, 'total': 0}\n", + " class_accuracy[true_label]['total'] += 1\n", + " if true_label == predicted_label:\n", + 
" class_accuracy[true_label]['correct'] += 1\n", + "\n", + "# Print accuracy per class and calculate mean accuracy\n", + "mean_accuracy = 0\n", + "for class_label, counts in class_accuracy.items():\n", + " class_acc = counts['correct'] / counts['total']\n", + " mean_accuracy += class_acc\n", + " print(f\"Accuracy for class '{class_label}': {class_acc * 100:.2f}%\")\n", + "mean_accuracy /= len(class_accuracy)\n", + "print(f\"\\nMean accuracy over all classes: {mean_accuracy * 100:.2f}%\\n\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "a1e34089", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration. Please open a PR/issue to update `preprocessor_config.json` to use `image_processor_type` instead of `feature_extractor_type`. This warning will be removed in v4.40.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_vtrain1...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration. Please open a PR/issue to update `preprocessor_config.json` to use `image_processor_type` instead of `feature_extractor_type`. This warning will be removed in v4.40.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy for model 1: 99.73%\n", + "\n", + "Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_vtrain2...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Could not find image processor class in the image processor config or the model config. 
Loading based on pattern matching with the model's feature extractor configuration. Please open a PR/issue to update `preprocessor_config.json` to use `image_processor_type` instead of `feature_extractor_type`. This warning will be removed in v4.40.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy for model 2: 99.87%\n", + "\n", + "Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_vtrain3...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration. Please open a PR/issue to update `preprocessor_config.json` to use `image_processor_type` instead of `feature_extractor_type`. This warning will be removed in v4.40.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy for model 3: 99.87%\n", + "\n", + "Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_vtrain4...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration. Please open a PR/issue to update `preprocessor_config.json` to use `image_processor_type` instead of `feature_extractor_type`. This warning will be removed in v4.40.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy for model 4: 99.87%\n", + "\n", + "Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_vtrain5...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration. 
Please open a PR/issue to update `preprocessor_config.json` to use `image_processor_type` instead of `feature_extractor_type`. This warning will be removed in v4.40.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy for model 5: 99.87%\n", + "\n", + "Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_vtrain6...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration. Please open a PR/issue to update `preprocessor_config.json` to use `image_processor_type` instead of `feature_extractor_type`. This warning will be removed in v4.40.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy for model 6: 99.87%\n", + "\n", + "Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_vtrain7...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration. Please open a PR/issue to update `preprocessor_config.json` to use `image_processor_type` instead of `feature_extractor_type`. This warning will be removed in v4.40.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy for model 7: 99.87%\n", + "\n", + "Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_vtrain8...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration. Please open a PR/issue to update `preprocessor_config.json` to use `image_processor_type` instead of `feature_extractor_type`. 
# %% Evaluate each fine-tuned checkpoint on kb_val.json
import os
import json
from transformers import pipeline, AutoTokenizer, AutoConfig, AutoModel, AutoImageProcessor
from PIL import Image, UnidentifiedImageError
import matplotlib.pyplot as plt
import torch


def chunks(lst, n):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), n):
        yield lst[i:i + n]


# Load the JSON data (one JSON object per line)
with open('kb_val.json', 'r') as f:
    data = [json.loads(line) for line in f]

# Extract image paths and labels
image_paths = [item['image'] for item in data]
labels = [item['caption'] for item in data]

# Fixed candidate set for the whole evaluation.  Passing the ground-truth
# labels of the current batch instead would leak label information and encode
# one duplicate text prompt per image.
candidate_labels = sorted(set(labels))

BATCH_SIZE = 128  # Adjust based on your available memory
repo_id = "laion/CLIP-ViT-B-32-laion2B-s34B-b79K"

# Keep using GPU index 1 when it exists; otherwise fall back to GPU 0 and
# finally to CPU (-1).
if torch.cuda.device_count() > 1:
    device = 1
elif torch.cuda.is_available():
    device = 0
else:
    device = -1

# The processor and tokenizer come from the base repo and do not depend on the
# checkpoint, so load them once instead of once per model.
image_processor = AutoImageProcessor.from_pretrained(repo_id)
tokenizer = AutoTokenizer.from_pretrained(repo_id, config=AutoConfig.from_pretrained(repo_id))

# Loop for different finetuned models
for i in range(1, 10):  # Assuming you have 9 finetuned models
    model_dir = f"./workspace/output/laion-finetuned_v5e7_epoch10_vtrain{i}"
    print(f"Evaluating with model from {model_dir}...")

    model = AutoModel.from_pretrained(model_dir)
    clip_pipeline = pipeline(model=model, task="zero-shot-image-classification", tokenizer=tokenizer,
                             device=device, image_processor=image_processor,
                             config=AutoConfig.from_pretrained(model_dir))

    all_predictions = []
    all_true_labels = []

    for batch_paths, batch_labels in zip(chunks(image_paths, BATCH_SIZE), chunks(labels, BATCH_SIZE)):
        batch_images = []
        valid_labels = []
        for path, label in zip(batch_paths, batch_labels):
            try:
                # Decode inside the context manager so the file handle is
                # closed immediately.
                with Image.open(path) as img:
                    batch_images.append(img.convert("RGB"))
            except (FileNotFoundError, UnidentifiedImageError):
                continue  # Skip images that cannot be opened
            valid_labels.append(label)

        if not batch_images:
            continue  # No readable image in this chunk

        # Get predictions for the batch of images
        predictions = clip_pipeline(images=batch_images, candidate_labels=candidate_labels)
        predicted_labels = [pred[0]['label'] for pred in predictions]  # Top prediction

        all_predictions.extend(predicted_labels)
        all_true_labels.extend(valid_labels)

    correct_predictions = sum(true == pred for true, pred in zip(all_true_labels, all_predictions))
    # Guard against an empty evaluation set (no readable images).
    accuracy = correct_predictions / len(all_true_labels) if all_true_labels else 0
    print(f"Accuracy for model {i}: {accuracy * 100:.2f}%\n")

# %% [markdown]
# # quantization small model

# %% Restrict this process to the first GPU
import os

# Make only the first GPU visible (GPU 0).  An empty string would hide every
# GPU, and the 8-bit quantization in the next cell needs a CUDA device.  This
# only takes effect if CUDA has not been initialised in this process yet, so
# run it in a fresh kernel and before any other torch.cuda call.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import torch

# Check if CUDA (GPU support) is available
if torch.cuda.is_available():
    print(f"CUDA is available. Number of GPUs: {torch.cuda.device_count()}")

    # Loop through and print details of each GPU
    for i in range(torch.cuda.device_count()):
        print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
else:
    print("CUDA is not available. Only CPU will be used.")

# %% Build an 8-bit quantized copy of one fine-tuned checkpoint
from accelerate import init_empty_weights
from accelerate.utils import BnbQuantizationConfig, load_and_quantize_model
from huggingface_hub import snapshot_download
from transformers import CLIPProcessor, CLIPModel, AutoModel
from transformers import AutoTokenizer, AutoConfig, AutoModelForZeroShotImageClassification

bnb_quantization_config = BnbQuantizationConfig(load_in_8bit=True)

# Instantiate the architecture without allocating real weights; the tensors
# are filled in from the checkpoint by load_and_quantize_model below.
config = AutoConfig.from_pretrained("laion/CLIP-ViT-B-32-laion2B-s34B-b79K")
with init_empty_weights():
    empty_model = AutoModelForZeroShotImageClassification.from_config(config)

# Sanity check: the skeleton model reports the "meta" device at this point.
print(empty_model.device)

model_dir = "./workspace/output/laion-finetuned_v5e7_epoch10_vtrain2"
# Derive the weights path from model_dir so the two cannot drift apart.  The
# checkpoint is not additionally loaded in full precision: that copy was never
# used and only cost memory.
weights_location = f"{model_dir}/model.safetensors"
quantized_model = load_and_quantize_model(
    empty_model,
    weights_location=weights_location,
    bnb_quantization_config=bnb_quantization_config,
    device_map="auto",
)
# %% Evaluate the 8-bit quantized model on kb_val.json
import os
import csv
from transformers import CLIPProcessor, CLIPModel, pipeline, CLIPImageProcessor
# UnidentifiedImageError is caught below, so it has to be imported in this
# cell rather than relying on a previously executed one.
from PIL import Image, UnidentifiedImageError
from transformers import AutoTokenizer, AutoConfig, AutoModel, AutoImageProcessor
import matplotlib.pyplot as plt
import time
import json

# Load the JSON data (one JSON object per line)
with open('kb_val.json', 'r') as f:
    data = [json.loads(line) for line in f]

# Extract image paths and labels
image_paths = [item['image'] for item in data]
labels = [item['caption'] for item in data]

# Fixed candidate set for the whole evaluation.  Passing the ground-truth
# labels of the current batch instead would leak label information and encode
# one duplicate text prompt per image.
candidate_labels = sorted(set(labels))

repo_id = "laion/CLIP-ViT-B-32-laion2B-s34B-b79K"
model_dir = "./workspace/output/laion-finetuned_v5e7_epoch10_vtrain2"
image_processor = AutoImageProcessor.from_pretrained(repo_id)
tokenizer = AutoTokenizer.from_pretrained(repo_id,
                                          config=AutoConfig.from_pretrained(repo_id))

# `quantized_model` must already exist (built by the quantization cell).  It is
# an instantiated, already-placed model, so loading options such as
# `device_map` or `model_kwargs` are not passed: they only apply when `model`
# is a checkpoint name that the pipeline has to load itself.
model = quantized_model
clip_pipeline = pipeline(model=model, task="zero-shot-image-classification", tokenizer=tokenizer,
                         image_processor=image_processor,
                         config=AutoConfig.from_pretrained(model_dir))


def chunks(lst, n):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), n):
        yield lst[i:i + n]


BATCH_SIZE = 256  # Adjust based on your available memory

all_predictions = []
all_true_labels = []

for batch_paths, batch_labels in zip(chunks(image_paths, BATCH_SIZE), chunks(labels, BATCH_SIZE)):
    batch_images = []
    valid_labels = []
    for path, label in zip(batch_paths, batch_labels):
        try:
            # Decode inside the context manager so the file handle is closed
            # immediately.
            with Image.open(path) as img:
                batch_images.append(img.convert("RGB"))
        except (FileNotFoundError, UnidentifiedImageError):
            continue  # Skip images that cannot be opened
        valid_labels.append(label)

    if not batch_images:
        continue  # No readable image in this chunk

    # Get predictions for the batch of images
    predictions = clip_pipeline(images=batch_images, candidate_labels=candidate_labels)
    predicted_labels = [pred[0]['label'] for pred in predictions]  # Top prediction

    all_predictions.extend(predicted_labels)
    all_true_labels.extend(valid_labels)

correct_predictions = sum(true == pred for true, pred in zip(all_true_labels, all_predictions))
# Guard against an empty evaluation set (no readable images).
accuracy = correct_predictions / len(all_true_labels) if all_true_labels else 0
# Report the checkpoint by path; there is no model index in this cell, and a
# leftover global `i` from another cell would label the result incorrectly.
print(f"Accuracy for quantized model {model_dir}: {accuracy * 100:.2f}%\n")

# %% [markdown]
# ## random testing result

# %% [markdown]
# ### original model
MixtralConfig, MptConfig, MusicgenConfig, MvpConfig, OpenLlamaConfig, OpenAIGPTConfig, OPTConfig, PegasusConfig, PersimmonConfig, PhiConfig, PLBartConfig, ProphetNetConfig, QDQBertConfig, Qwen2Config, ReformerConfig, RemBertConfig, RobertaConfig, RobertaPreLayerNormConfig, RoCBertConfig, RoFormerConfig, RwkvConfig, Speech2Text2Config, TransfoXLConfig, TrOCRConfig, WhisperConfig, XGLMConfig, XLMConfig, XLMProphetNetConfig, XLMRobertaConfig, XLMRobertaXLConfig, XLNetConfig, XmodConfig.", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[26], line 8\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtransformers\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m AutoTokenizer, AutoConfig, AutoModel,AutoModelForZeroShotImageClassification\n\u001b[1;32m 7\u001b[0m tokenizer \u001b[38;5;241m=\u001b[39m AutoTokenizer\u001b[38;5;241m.\u001b[39mfrom_pretrained(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mlaion/CLIP-ViT-B-32-laion2B-s34B-b79K\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m----> 8\u001b[0m model \u001b[38;5;241m=\u001b[39m \u001b[43mAutoModelForCausalLM\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfrom_pretrained\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mlaion/CLIP-ViT-B-32-laion2B-s34B-b79K\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdevice_map\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mauto\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mload_in_4bit\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m)\u001b[49m\n", + "File 
\u001b[0;32m~/anaconda3/envs/huggingface/lib/python3.10/site-packages/transformers/models/auto/auto_factory.py:569\u001b[0m, in \u001b[0;36m_BaseAutoModelClass.from_pretrained\u001b[0;34m(cls, pretrained_model_name_or_path, *model_args, **kwargs)\u001b[0m\n\u001b[1;32m 565\u001b[0m model_class \u001b[38;5;241m=\u001b[39m _get_model_class(config, \u001b[38;5;28mcls\u001b[39m\u001b[38;5;241m.\u001b[39m_model_mapping)\n\u001b[1;32m 566\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m model_class\u001b[38;5;241m.\u001b[39mfrom_pretrained(\n\u001b[1;32m 567\u001b[0m pretrained_model_name_or_path, \u001b[38;5;241m*\u001b[39mmodel_args, config\u001b[38;5;241m=\u001b[39mconfig, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mhub_kwargs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs\n\u001b[1;32m 568\u001b[0m )\n\u001b[0;32m--> 569\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m 570\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mUnrecognized configuration class \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mconfig\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__class__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m for this kind of AutoModel: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mcls\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m.\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 571\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mModel type should be one of \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m, \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;241m.\u001b[39mjoin(c\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mfor\u001b[39;00m\u001b[38;5;250m \u001b[39mc\u001b[38;5;250m 
\u001b[39m\u001b[38;5;129;01min\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28mcls\u001b[39m\u001b[38;5;241m.\u001b[39m_model_mapping\u001b[38;5;241m.\u001b[39mkeys())\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 572\u001b[0m )\n", + "\u001b[0;31mValueError\u001b[0m: Unrecognized configuration class for this kind of AutoModel: AutoModelForCausalLM.\nModel type should be one of BartConfig, BertConfig, BertGenerationConfig, BigBirdConfig, BigBirdPegasusConfig, BioGptConfig, BlenderbotConfig, BlenderbotSmallConfig, BloomConfig, CamembertConfig, LlamaConfig, CodeGenConfig, CpmAntConfig, CTRLConfig, Data2VecTextConfig, ElectraConfig, ErnieConfig, FalconConfig, FuyuConfig, GitConfig, GPT2Config, GPT2Config, GPTBigCodeConfig, GPTNeoConfig, GPTNeoXConfig, GPTNeoXJapaneseConfig, GPTJConfig, LlamaConfig, MarianConfig, MBartConfig, MegaConfig, MegatronBertConfig, MistralConfig, MixtralConfig, MptConfig, MusicgenConfig, MvpConfig, OpenLlamaConfig, OpenAIGPTConfig, OPTConfig, PegasusConfig, PersimmonConfig, PhiConfig, PLBartConfig, ProphetNetConfig, QDQBertConfig, Qwen2Config, ReformerConfig, RemBertConfig, RobertaConfig, RobertaPreLayerNormConfig, RoCBertConfig, RoFormerConfig, RwkvConfig, Speech2Text2Config, TransfoXLConfig, TrOCRConfig, WhisperConfig, XGLMConfig, XLMConfig, XLMProphetNetConfig, XLMRobertaConfig, XLMRobertaXLConfig, XLNetConfig, XmodConfig." 
# %% Load the original (not fine-tuned) CLIP checkpoint in 4-bit
from accelerate import init_empty_weights
from accelerate.utils import BnbQuantizationConfig, load_and_quantize_model
from transformers import CLIPProcessor, CLIPModel, AutoModelForCausalLM
from transformers import AutoTokenizer, AutoConfig, AutoModel, AutoModelForZeroShotImageClassification
from transformers import BitsAndBytesConfig

# Kept for the cells that quantize fine-tuned checkpoints through accelerate.
bnb_quantization_config = BnbQuantizationConfig(load_in_8bit=True)

tokenizer = AutoTokenizer.from_pretrained("laion/CLIP-ViT-B-32-laion2B-s34B-b79K")
# CLIP is a dual-encoder model, not a causal language model:
# AutoModelForCausalLM has no mapping for its config class and raises
# ValueError.  The zero-shot image classification auto class is the one that
# resolves this checkpoint.  The 4-bit request is expressed through
# `quantization_config` rather than the bare `load_in_4bit` keyword.
model = AutoModelForZeroShotImageClassification.from_pretrained(
    "laion/CLIP-ViT-B-32-laion2B-s34B-b79K",
    device_map="auto",
    quantization_config=BitsAndBytesConfig(load_in_4bit=True),
)
AutoTokenizer\u001b[38;5;241m.\u001b[39mfrom_pretrained(repo_id)\n", + "File \u001b[0;32m~/anaconda3/envs/huggingface/lib/python3.10/site-packages/accelerate/utils/bnb.py:189\u001b[0m, in \u001b[0;36mload_and_quantize_model\u001b[0;34m(model, bnb_quantization_config, weights_location, device_map, no_split_module_classes, max_memory, offload_folder, offload_state_dict)\u001b[0m\n\u001b[1;32m 185\u001b[0m offload_state_dict \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[1;32m 187\u001b[0m offload \u001b[38;5;241m=\u001b[39m \u001b[38;5;28many\u001b[39m(x \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mlist\u001b[39m(device_map\u001b[38;5;241m.\u001b[39mvalues()) \u001b[38;5;28;01mfor\u001b[39;00m x \u001b[38;5;129;01min\u001b[39;00m [\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcpu\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdisk\u001b[39m\u001b[38;5;124m\"\u001b[39m])\n\u001b[0;32m--> 189\u001b[0m \u001b[43mload_checkpoint_in_model\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 190\u001b[0m \u001b[43m \u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 191\u001b[0m \u001b[43m \u001b[49m\u001b[43mweights_location\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 192\u001b[0m \u001b[43m \u001b[49m\u001b[43mdevice_map\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 193\u001b[0m \u001b[43m \u001b[49m\u001b[43mdtype\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mbnb_quantization_config\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtorch_dtype\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 194\u001b[0m \u001b[43m \u001b[49m\u001b[43moffload_folder\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moffload_folder\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 195\u001b[0m \u001b[43m \u001b[49m\u001b[43moffload_state_dict\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moffload_state_dict\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 196\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mkeep_in_fp32_modules\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mbnb_quantization_config\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mkeep_in_fp32_modules\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 197\u001b[0m \u001b[43m \u001b[49m\u001b[43moffload_8bit_bnb\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mload_in_8bit\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01mand\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43moffload\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 198\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 199\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m dispatch_model(model, device_map\u001b[38;5;241m=\u001b[39mdevice_map, offload_dir\u001b[38;5;241m=\u001b[39moffload_folder)\n", + "File \u001b[0;32m~/anaconda3/envs/huggingface/lib/python3.10/site-packages/accelerate/utils/modeling.py:1336\u001b[0m, in \u001b[0;36mload_checkpoint_in_model\u001b[0;34m(model, checkpoint, device_map, offload_folder, dtype, offload_state_dict, offload_buffers, keep_in_fp32_modules, offload_8bit_bnb)\u001b[0m\n\u001b[1;32m 1334\u001b[0m offload_weight(param, param_name, state_dict_folder, index\u001b[38;5;241m=\u001b[39mstate_dict_index)\n\u001b[1;32m 1335\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1336\u001b[0m \u001b[43mset_module_tensor_to_device\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1337\u001b[0m \u001b[43m \u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1338\u001b[0m \u001b[43m \u001b[49m\u001b[43mparam_name\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1339\u001b[0m \u001b[43m \u001b[49m\u001b[43mparam_device\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1340\u001b[0m \u001b[43m \u001b[49m\u001b[43mvalue\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mparam\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1341\u001b[0m \u001b[43m \u001b[49m\u001b[43mdtype\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mnew_dtype\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 
1342\u001b[0m \u001b[43m \u001b[49m\u001b[43mfp16_statistics\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfp16_statistics\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1343\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1345\u001b[0m \u001b[38;5;66;03m# Force Python to clean up.\u001b[39;00m\n\u001b[1;32m 1346\u001b[0m \u001b[38;5;28;01mdel\u001b[39;00m checkpoint\n", + "File \u001b[0;32m~/anaconda3/envs/huggingface/lib/python3.10/site-packages/accelerate/utils/modeling.py:344\u001b[0m, in \u001b[0;36mset_module_tensor_to_device\u001b[0;34m(module, tensor_name, device, value, dtype, fp16_statistics)\u001b[0m\n\u001b[1;32m 342\u001b[0m device_index \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39mdevice(device)\u001b[38;5;241m.\u001b[39mindex \u001b[38;5;28;01mif\u001b[39;00m torch\u001b[38;5;241m.\u001b[39mdevice(device)\u001b[38;5;241m.\u001b[39mtype \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcuda\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 343\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mgetattr\u001b[39m(module\u001b[38;5;241m.\u001b[39mweight, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mquant_state\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m) \u001b[38;5;129;01mand\u001b[39;00m device_index \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m--> 344\u001b[0m module\u001b[38;5;241m.\u001b[39mweight \u001b[38;5;241m=\u001b[39m \u001b[43mmodule\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mweight\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcuda\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdevice_index\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 345\u001b[0m \u001b[38;5;66;03m# clean pre and post foward hook\u001b[39;00m\n\u001b[1;32m 346\u001b[0m 
torch\u001b[38;5;241m.\u001b[39mcuda\u001b[38;5;241m.\u001b[39mempty_cache()\n", + "File \u001b[0;32m~/anaconda3/envs/huggingface/lib/python3.10/site-packages/bitsandbytes/nn/modules.py:168\u001b[0m, in \u001b[0;36mParams4bit.cuda\u001b[0;34m(self, device)\u001b[0m\n\u001b[1;32m 167\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mcuda\u001b[39m(\u001b[38;5;28mself\u001b[39m, device):\n\u001b[0;32m--> 168\u001b[0m w \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdata\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcontiguous\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mhalf\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcuda\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdevice\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 169\u001b[0m w_4bit, quant_state \u001b[38;5;241m=\u001b[39m bnb\u001b[38;5;241m.\u001b[39mfunctional\u001b[38;5;241m.\u001b[39mquantize_4bit(w, blocksize\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mblocksize, compress_statistics\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcompress_statistics, quant_type\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mquant_type)\n\u001b[1;32m 170\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdata \u001b[38;5;241m=\u001b[39m w_4bit\n", + "\u001b[0;31mNotImplementedError\u001b[0m: Cannot copy out of meta tensor; no data!" 
# %% Quantize and evaluate each random-split checkpoint on val_random.json
from accelerate import init_empty_weights
from accelerate.utils import BnbQuantizationConfig, load_and_quantize_model
from transformers import CLIPProcessor, CLIPModel, AutoTokenizer, AutoConfig, AutoModelForZeroShotImageClassification, pipeline, AutoImageProcessor
from PIL import Image, UnidentifiedImageError
import json

# Load the JSON data for evaluation (one JSON object per line)
with open('val_random.json', 'r') as f:
    data = [json.loads(line) for line in f]

# Extract image paths and labels
image_paths = [item['image'] for item in data]
labels = [item['caption'] for item in data]

# Fixed candidate set for the whole evaluation.  Passing the ground-truth
# labels of the current batch instead would leak label information and encode
# one duplicate text prompt per image.
candidate_labels = sorted(set(labels))

BATCH_SIZE = 256  # Adjust based on your available memory
NUM_MODELS = 9    # Checkpoints are numbered vtrain1 .. vtrain9


# Define a function to process images in chunks
def chunks(lst, n):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), n):
        yield lst[i:i + n]


# Base configuration
repo_id = "laion/CLIP-ViT-B-32-laion2B-s34B-b79K"

# Define the quantization settings in this cell so it does not depend on a
# global left behind by whichever other cell ran last.
bnb_quantization_config = BnbQuantizationConfig(load_in_8bit=True)

# None of these depend on the checkpoint, so load them once up front.
base_config = AutoConfig.from_pretrained(repo_id)
tokenizer = AutoTokenizer.from_pretrained(repo_id)
image_processor = AutoImageProcessor.from_pretrained(repo_id)

# Iterate over the fine-tuned models
for i in range(1, NUM_MODELS + 1):
    model_dir = f"./workspace/output/laion-finetuned_v5e7_epoch10_random80_vtrain{i}"
    print(f"Evaluating with model from {model_dir}...")

    # Build a weightless skeleton, then load and quantize the checkpoint into it
    with init_empty_weights():
        empty_model = AutoModelForZeroShotImageClassification.from_config(base_config)

    weights_location = f"{model_dir}/model.safetensors"
    quantized_model = load_and_quantize_model(
        empty_model,
        weights_location=weights_location,
        bnb_quantization_config=bnb_quantization_config,
        device_map="auto",
    )

    # Initialize the pipeline without the device argument: the model is already
    # quantized and placed.  Loading options such as `model_kwargs` are not
    # passed because they only apply when `model` is a checkpoint name.
    clip_pipeline = pipeline(
        model=quantized_model,
        task="zero-shot-image-classification",
        tokenizer=tokenizer,
        image_processor=image_processor,
        config=base_config,
    )

    all_predictions = []
    all_true_labels = []
    failed_batches = 0

    # Process images in chunks and evaluate
    for batch_paths, batch_labels in zip(chunks(image_paths, BATCH_SIZE), chunks(labels, BATCH_SIZE)):
        batch_images = []
        valid_labels = []
        for path, label in zip(batch_paths, batch_labels):
            try:
                # Open image and append to batch_images
                with Image.open(path) as img:
                    batch_images.append(img.convert("RGB"))
            except (FileNotFoundError, UnidentifiedImageError):
                # Skip images that cannot be opened
                continue
            valid_labels.append(label)

        if not batch_images:
            continue  # No readable image in this chunk

        # Get predictions for the batch of images.  Prediction stays
        # best-effort (one bad batch must not abort the whole sweep), but
        # failures are counted so a partial evaluation is visible.
        try:
            predictions = clip_pipeline(images=batch_images, candidate_labels=candidate_labels)
        except Exception as e:
            failed_batches += 1
            print(f"An error occurred during prediction: {e}")
            continue

        predicted_labels = [pred[0]['label'] for pred in predictions]  # Top prediction
        all_predictions.extend(predicted_labels)
        all_true_labels.extend(valid_labels)

    # Calculate the accuracy for the current model
    correct_predictions = sum(true == pred for true, pred in zip(all_true_labels, all_predictions))
    accuracy = correct_predictions / len(all_true_labels) if all_true_labels else 0
    if failed_batches:
        print(f"Warning: {failed_batches} batch(es) failed and were excluded from the accuracy.")
    print(f"Accuracy for model {i}: {accuracy * 100:.2f}%\n")

# %% [markdown]
# # Efficiency comparison
"output_type": "stream", + "text": [ + "Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration. Please open a PR/issue to update `preprocessor_config.json` to use `image_processor_type` instead of `feature_extractor_type`. This warning will be removed in v4.40.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy for model org: 73.67%\n", + "\n", + "Accuracy for each category:\n", + "a photo of standard bathroom: 0.92\n", + "a photo of standard kitchen: 0.98\n", + "a photo of contemporary bathroom: 0.52\n", + "a photo of contemporary kitchen: 0.39\n", + " Category Accuracy\n", + "0 a photo of standard bathroom 0.920690\n", + "1 a photo of standard kitchen 0.980892\n", + "2 a photo of contemporary bathroom 0.516949\n", + "3 a photo of contemporary kitchen 0.385027\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "STAGE:2024-04-22 17:39:10 514141:514141 ActivityProfilerController.cpp:311] Completed Stage: Warm Up\n", + "STAGE:2024-04-22 17:39:10 514141:514141 ActivityProfilerController.cpp:317] Completed Stage: Collection\n", + "STAGE:2024-04-22 17:39:10 514141:514141 ActivityProfilerController.cpp:321] Completed Stage: Post Processing\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "from collections import Counter\n", + "import os\n", + "import json\n", + "from transformers import pipeline, AutoTokenizer, AutoConfig, AutoModel, AutoImageProcessor\n", + "from PIL import Image, UnidentifiedImageError\n", + "import torch\n", + "import torch.profiler\n", + "\n", + "def chunks(lst, n):\n", + " \"\"\"Yield successive n-sized chunks from lst.\"\"\"\n", + " for i in range(0, len(lst), n):\n", + " yield lst[i:i + n]\n", + "\n", + "def calculate_category_accuracy(true_labels, predicted_labels):\n", + " \"\"\"Calculate the accuracy for each category and return it as a dictionary.\"\"\"\n", + 
" accuracies = {}\n", + " true_labels_counter = Counter(true_labels)\n", + " correct_predictions_counter = Counter([true for true, pred in zip(true_labels, predicted_labels) if true == pred])\n", + " \n", + " for label in true_labels_counter:\n", + " accuracy = (correct_predictions_counter[label] / true_labels_counter[label]) if label in correct_predictions_counter else 0\n", + " accuracies[label] = accuracy\n", + " \n", + " return accuracies\n", + "\n", + "# Load the JSON data\n", + "with open('kb_val.json', 'r') as f:\n", + " data = [json.loads(line) for line in f]\n", + "\n", + "# Extract image paths and labels\n", + "image_paths = [item['image'] for item in data]\n", + "labels = [item['caption'] for item in data]\n", + "\n", + "BATCH_SIZE = 1024 # Adjust based on your available memory\n", + "repo_id = \"laion/CLIP-ViT-B-32-laion2B-s34B-b79K\"\n", + "\n", + "image_processor = AutoImageProcessor.from_pretrained(repo_id)\n", + "tokenizer = AutoTokenizer.from_pretrained(repo_id, config=AutoConfig.from_pretrained(repo_id))\n", + "model = AutoModel.from_pretrained(repo_id).cuda()\n", + "clip_pipeline = pipeline(model=model, task=\"zero-shot-image-classification\", tokenizer=tokenizer,\n", + " device=1, image_processor=image_processor, config=AutoConfig.from_pretrained(repo_id))\n", + "\n", + "all_predictions = []\n", + "all_true_labels = []\n", + "\n", + "def trace_handler(profiler):\n", + " print(\"Trace handler called\")\n", + " try:\n", + " print(profiler.key_averages().table(sort_by=\"cpu_time_total\", row_limit=10))\n", + " profiler.export_chrome_trace(\"trace.json\")\n", + " except Exception as e:\n", + " print(f\"Error in trace handler: {str(e)}\")\n", + "# Start profiling\n", + "with torch.profiler.profile(\n", + " activities=[torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA],\n", + " schedule=torch.profiler.schedule(wait=1, warmup=1, active=3),\n", + " on_trace_ready=trace_handler,\n", + " record_shapes=True,\n", + " 
profile_memory=True,\n", + " with_stack=True\n", + ") as profiler:\n", + " for batch_paths, batch_labels in zip(chunks(image_paths, BATCH_SIZE), chunks(labels, BATCH_SIZE)):\n", + " batch_images = []\n", + " valid_labels = []\n", + " for path, label in zip(batch_paths, batch_labels):\n", + " try:\n", + " batch_images.append(Image.open(path).convert('RGB'))\n", + " valid_labels.append(label)\n", + " except (FileNotFoundError, UnidentifiedImageError):\n", + " print(f\"Skipping file: {path}, unable to open or not found.\")\n", + " continue # Skip images that cannot be opened\n", + "\n", + " if batch_images: # Ensure there are images to predict\n", + " predictions = clip_pipeline(images=batch_images, candidate_labels=valid_labels)\n", + " predicted_labels = [pred[0]['label'] for pred in predictions] # Top prediction\n", + "\n", + " all_predictions.extend(predicted_labels)\n", + " all_true_labels.extend(valid_labels)\n", + "\n", + " profiler.step() # Advance the profiler\n", + "\n", + "# Calculate and print overall accuracy\n", + "correct_predictions = sum([true == pred for true, pred in zip(all_true_labels, all_predictions)])\n", + "accuracy = correct_predictions / len(all_true_labels) if all_true_labels else 0\n", + "print(f\"Accuracy for model org: {accuracy * 100:.2f}%\\n\")\n", + "\n", + "# Calculate and print category accuracies\n", + "category_accuracies = calculate_category_accuracy(all_true_labels, all_predictions)\n", + "print(\"Accuracy for each category:\")\n", + "for category, accuracy in category_accuracies.items():\n", + " print(f\"{category}: {accuracy:.2f}\")\n", + "\n", + "# Convert the dictionary to a DataFrame\n", + "category_accuracy_df = pd.DataFrame(list(category_accuracies.items()), columns=['Category', 'Accuracy'])\n", + "\n", + "print(category_accuracy_df)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d1595577", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from collections import 
Counter\n", + "import os\n", + "import json\n", + "from transformers import pipeline, AutoTokenizer, AutoConfig, AutoModel, AutoImageProcessor\n", + "from PIL import Image, UnidentifiedImageError\n", + "import torch\n", + "import torch.profiler\n", + "\n", + "def chunks(lst, n):\n", + " \"\"\"Yield successive n-sized chunks from lst.\"\"\"\n", + " for i in range(0, len(lst), n):\n", + " yield lst[i:i + n]\n", + "\n", + "def calculate_category_accuracy(true_labels, predicted_labels):\n", + " \"\"\"Calculate the accuracy for each category and return it as a dictionary.\"\"\"\n", + " accuracies = {}\n", + " true_labels_counter = Counter(true_labels)\n", + " correct_predictions_counter = Counter([true for true, pred in zip(true_labels, predicted_labels) if true == pred])\n", + " \n", + " for label in true_labels_counter:\n", + " accuracy = (correct_predictions_counter[label] / true_labels_counter[label]) if label in correct_predictions_counter else 0\n", + " accuracies[label] = accuracy\n", + " \n", + " return accuracies\n", + "\n", + "# Load the JSON data\n", + "with open('kb_val.json', 'r') as f:\n", + " data = [json.loads(line) for line in f]\n", + "\n", + "# Extract image paths and labels\n", + "image_paths = [item['image'] for item in data]\n", + "labels = [item['caption'] for item in data]\n", + "\n", + "BATCH_SIZE = 1024 # Adjust based on your available memory\n", + "repo_id = \"laion/CLIP-ViT-B-32-laion2B-s34B-b79K\"\n", + "model_dir = f\"./workspace/output/backup_ckpts/laion-finetuned_v5e7_epoch10_fold1_threshold3\"\n", + "\n", + "image_processor = AutoImageProcessor.from_pretrained(repo_id)\n", + "tokenizer = AutoTokenizer.from_pretrained(repo_id, config=AutoConfig.from_pretrained(repo_id))\n", + "model = AutoModel.from_pretrained(model_dir).cuda()\n", + "clip_pipeline = pipeline(model=model, task=\"zero-shot-image-classification\", tokenizer=tokenizer,\n", + " device=1, image_processor=image_processor, config=AutoConfig.from_pretrained(model_dir))\n", + 
"\n", + "all_predictions = []\n", + "all_true_labels = []\n", + "\n", + "def trace_handler(profiler):\n", + " print(\"Trace handler called\")\n", + " try:\n", + " print(profiler.key_averages().table(sort_by=\"cpu_time_total\", row_limit=10))\n", + " profiler.export_chrome_trace(\"trace.json\")\n", + " except Exception as e:\n", + " print(f\"Error in trace handler: {str(e)}\")\n", + "# Start profiling\n", + "with torch.profiler.profile(\n", + " activities=[torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA],\n", + " schedule=torch.profiler.schedule(wait=1, warmup=1, active=3),\n", + " on_trace_ready=trace_handler,\n", + " record_shapes=True,\n", + " profile_memory=True,\n", + " with_stack=True\n", + ") as profiler:\n", + " for batch_paths, batch_labels in zip(chunks(image_paths, BATCH_SIZE), chunks(labels, BATCH_SIZE)):\n", + " batch_images = []\n", + " valid_labels = []\n", + " for path, label in zip(batch_paths, batch_labels):\n", + " try:\n", + " batch_images.append(Image.open(path).convert('RGB'))\n", + " valid_labels.append(label)\n", + " except (FileNotFoundError, UnidentifiedImageError):\n", + " print(f\"Skipping file: {path}, unable to open or not found.\")\n", + " continue # Skip images that cannot be opened\n", + "\n", + " if batch_images: # Ensure there are images to predict\n", + " predictions = clip_pipeline(images=batch_images, candidate_labels=valid_labels)\n", + " predicted_labels = [pred[0]['label'] for pred in predictions] # Top prediction\n", + "\n", + " all_predictions.extend(predicted_labels)\n", + " all_true_labels.extend(valid_labels)\n", + "\n", + " profiler.step() # Advance the profiler\n", + "\n", + "# Calculate and print overall accuracy\n", + "correct_predictions = sum([true == pred for true, pred in zip(all_true_labels, all_predictions)])\n", + "accuracy = correct_predictions / len(all_true_labels) if all_true_labels else 0\n", + "print(f\"Accuracy for model org: {accuracy * 100:.2f}%\\n\")\n", + "\n", + "# 
Calculate and print category accuracies\n", + "category_accuracies = calculate_category_accuracy(all_true_labels, all_predictions)\n", + "print(\"Accuracy for each category:\")\n", + "for category, accuracy in category_accuracies.items():\n", + " print(f\"{category}: {accuracy:.2f}\")\n", + "\n", + "# Convert the dictionary to a DataFrame\n", + "category_accuracy_df = pd.DataFrame(list(category_accuracies.items()), columns=['Category', 'Accuracy'])\n", + "\n", + "print(category_accuracy_df)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5f35e96a", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fe5592a4", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}