# random test 20%

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import json

def prepare_data_from_dataframe(df: pd.DataFrame, output_file: str) -> str:
    """Prepare data for finetuning by reading from a DataFrame."""
    data = []

    # Process data
    for _, row in df.iterrows():
        image_path = f"../data/{row['max_key']}/{row['path'].split('/')[-1]}"
        data.append({"image": image_path, "caption": row['max_key']})

    # Save the data in JSON format
    with open(output_file, "w") as f:
        for item in data:
            json.dump(item, f)
            f.write("\n")

    return output_file


# Load the CSV file
df = pd.read_csv('labels.csv')
# Filter out specific categories
df = df[df['max_key'] != 'error']
df = df[df['max_key'] != 'a photo of other indoor space: not kitchen, not bathroom, not living room, not dining room, not foyer']
df = df[df['max_key'] != 'it is a artificial photo']
df = df[df['max_key'] != 'a photo of outdoor space']
# Filter samples with max_value > 0.9
threshold_df = df[df['max_value'] > 0.9]

# Split data into train and test sets
train_df, test_df = train_test_split(threshold_df, test_size=0.2, random_state=42)

# Now use the train_df to prepare your training data
train_all_json = prepare_data_from_dataframe(train_df, 'train_random.json')
# And test_df to prepare your testing data
test_json = prepare_data_from_dataframe(test_df, 'val_random.json')

# The function prepare_data_from_dataframe remains unchanged


In [1]:
# test from org ckpt
import pandas as pd
from collections import Counter
import os
import json
from transformers import pipeline, AutoTokenizer, AutoConfig, AutoModel, AutoImageProcessor
from PIL import Image, UnidentifiedImageError
import matplotlib.pyplot as plt

def chunks(lst, n):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), n):
        yield lst[i:i + n]

def calculate_category_accuracy(true_labels, predicted_labels):
    """Calculate the accuracy for each category and return it as a dictionary."""
    accuracies = {}
    true_labels_counter = Counter(true_labels)
    correct_predictions_counter = Counter([true for true, pred in zip(true_labels, predicted_labels) if true == pred])
    
    for label in true_labels_counter:
        accuracy = (correct_predictions_counter[label] / true_labels_counter[label]) if label in correct_predictions_counter else 0
        accuracies[label] = accuracy
    
    return accuracies

# Load the JSON data
with open('val_random.json', 'r') as f:
    data = [json.loads(line) for line in f]

# Extract image paths and labels
image_paths = [item['image'] for item in data]
labels = [item['caption'] for item in data]

BATCH_SIZE = 128  # Adjust based on your available memory
repo_id = "laion/CLIP-ViT-B-32-laion2B-s34B-b79K"


image_processor = AutoImageProcessor.from_pretrained(repo_id)
tokenizer = AutoTokenizer.from_pretrained(repo_id, config=AutoConfig.from_pretrained(repo_id))
model = AutoModel.from_pretrained(repo_id)
clip_pipeline = pipeline(model=model, task="zero-shot-image-classification", tokenizer=tokenizer,
                         device=1, image_processor=image_processor, config=AutoConfig.from_pretrained(repo_id))

all_predictions = []
all_true_labels = []

for batch_paths, batch_labels in zip(chunks(image_paths, BATCH_SIZE), chunks(labels, BATCH_SIZE)):
    batch_images = []
    valid_labels = []
    for path, label in zip(batch_paths, batch_labels):
        try:
            batch_images.append(Image.open(path))
            valid_labels.append(label)
        except (FileNotFoundError, UnidentifiedImageError):
            continue  # Skip images that cannot be opened

    # Get predictions for the batch of images
    predictions = clip_pipeline(images=batch_images, candidate_labels=valid_labels)
    predicted_labels = [pred[0]['label'] for pred in predictions]  # Top prediction

    all_predictions.extend(predicted_labels)
    all_true_labels.extend(valid_labels)

correct_predictions = sum([true == pred for true, pred in zip(all_true_labels, all_predictions)])
accuracy = correct_predictions / len(all_true_labels)
print(f"Accuracy for model org: {accuracy * 100:.2f}%\n")

# Calculate the accuracy for each category
category_accuracies = calculate_category_accuracy(all_true_labels, all_predictions)

# Print the accuracy for each category
print("Accuracy for each category:")
for category, accuracy in category_accuracies.items():
    print(f"{category}: {accuracy:.2f}")

# Convert the dictionary to a DataFrame
category_accuracy_df = pd.DataFrame(list(category_accuracies.items()), columns=['Category', 'Accuracy'])

# Save the DataFrame to a CSV file
category_accuracy_df.to_csv('category_accuracy.csv', index=False)


KeyboardInterrupt



In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
unique_labels = list(set(all_true_labels))
# Compute the confusion matrix
cm = confusion_matrix(all_true_labels, all_predictions, labels=unique_labels)

# Display the confusion matrix
fig, ax = plt.subplots(figsize=(10, 10))  # Adjust the size as needed
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=unique_labels)
disp.plot(ax=ax)

# Rotate the x-axis labels to display them vertically
plt.xticks(rotation=90)

plt.show()

In [2]:
import pandas as pd
from sklearn.model_selection import KFold
import json

def prepare_data_from_dataframe(df, output_file):
    """Prepare data for fine-tuning by reading from a DataFrame."""
    data = []
    # Process data
    for _, row in df.iterrows():
        image_path = f"../data/{row['max_key']}/{row['path'].split('/')[-1]}"
        data.append({"image": image_path, "caption": row['max_key']})
    # Save the data in JSON format
    with open(output_file, "w") as f:
        for item in data:
            json.dump(item, f)
            f.write("\n")

def perform_k_fold(df, n_splits):
    """Perform K-fold split and data preparation, including filtering by confidence thresholds."""
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    fold_data_info = []
    fold_counter = 0

    for train_index, test_index in kf.split(df):
        train_df, test_df = df.iloc[train_index], df.iloc[test_index]
        test_json = f'val_fold_{fold_counter}.json'
        prepare_data_from_dataframe(test_df, test_json)
        
        # Creating JSON files for each confidence threshold from 0.91 to 0.99
        for threshold in range(90, 100):
            threshold_df = train_df[train_df['max_value'] > threshold / 100.0]
            threshold_train_json = f'train_fold_{fold_counter}_thr_{threshold}.json'
            prepare_data_from_dataframe(threshold_df, threshold_train_json)
            fold_data_info.append((threshold_train_json, test_json))
        
        fold_counter += 1

    return fold_data_info

# Load and preprocess the DataFrame
df = pd.read_csv('labels.csv')
df = df[df['max_key'] != 'error']
df = df[df['max_key'] != 'a photo of other indoor space: not kitchen, not bathroom, not living room, not dining room, not foyer']
df = df[df['max_key'] != 'it is a artificial photo']
df = df[df['max_key'] != 'a photo of outdoor space']
df = df[df['max_value'] > 0.9]

# Perform the 5-fold split and data preparation including threshold filtering
fold_files = perform_k_fold(df, 5)

# Print out the file names for each fold and threshold
for train_file, test_file in fold_files:
    print(f"Train file for this fold and threshold: {train_file}")
    print(f"Test file for this fold: {test_file}")

Train file for this fold: train_fold_0.json
Test file for this fold: val_fold_0.json
Train file for this fold: train_fold_1.json
Test file for this fold: val_fold_1.json
Train file for this fold: train_fold_2.json
Test file for this fold: val_fold_2.json
Train file for this fold: train_fold_3.json
Test file for this fold: val_fold_3.json
Train file for this fold: train_fold_4.json
Test file for this fold: val_fold_4.json


In [11]:
# test from org ckpt
import pandas as pd
import numpy as np
from collections import Counter
import json
from transformers import pipeline, AutoTokenizer, AutoConfig, AutoModel, AutoImageProcessor
from PIL import Image, UnidentifiedImageError

def chunks(lst, n):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), n):
        yield lst[i:i + n]

def calculate_category_accuracy(true_labels, predicted_labels):
    """Calculate the accuracy for each category and return it as a dictionary."""
    accuracies = {}
    true_labels_counter = Counter(true_labels)
    correct_predictions_counter = Counter([true for true, pred in zip(true_labels, predicted_labels) if true == pred])
    
    for label in true_labels_counter:
        accuracy = (correct_predictions_counter[label] / true_labels_counter[label]) if label in correct_predictions_counter else 0
        accuracies[label] = accuracy
    
    return accuracies

# Load model components
repo_id = "laion/CLIP-ViT-B-32-laion2B-s34B-b79K"
image_processor = AutoImageProcessor.from_pretrained(repo_id)
tokenizer = AutoTokenizer.from_pretrained(repo_id, config=AutoConfig.from_pretrained(repo_id))
model = AutoModel.from_pretrained(repo_id)
clip_pipeline = pipeline(model=model, task="zero-shot-image-classification", tokenizer=tokenizer,
                         device=1, image_processor=image_processor, config=AutoConfig.from_pretrained(repo_id))

# DataFrame to store all accuracies
all_accuracies = []

# Loop over each fold and test
for fold in range(2):
    # Load the JSON data for the current fold
    with open(f'val_fold_{fold}.json', 'r') as f:
        data = [json.loads(line) for line in f]

    # Extract image paths and labels for the current fold
    image_paths = [item['image'] for item in data]
    labels = [item['caption'] for item in data]

    BATCH_SIZE = 128  # Adjust based on your available memory
    all_predictions = []
    all_true_labels = []

    for batch_paths, batch_labels in zip(chunks(image_paths, BATCH_SIZE), chunks(labels, BATCH_SIZE)):
        batch_images = []
        valid_labels = []
        for path, label in zip(batch_paths, batch_labels):
            try:
                batch_images.append(Image.open(path))
                valid_labels.append(label)
            except (FileNotFoundError, UnidentifiedImageError):
                continue  # Skip images that cannot be opened

        # Get predictions for the batch of images
        predictions = clip_pipeline(images=batch_images, candidate_labels=valid_labels)
        predicted_labels = [pred[0]['label'] for pred in predictions]  # Top prediction

        all_predictions.extend(predicted_labels)
        all_true_labels.extend(valid_labels)

    correct_predictions = sum([true == pred for true, pred in zip(all_true_labels, all_predictions)])
    accuracy = correct_predictions / len(all_true_labels)
    print(f"Accuracy for model on fold {fold}: {accuracy * 100:.2f}%\n")

    # Calculate the accuracy for each category for the current fold
    category_accuracies = calculate_category_accuracy(all_true_labels, all_predictions)

    # Store the accuracies in a list of dictionaries
    for category, acc in category_accuracies.items():
        all_accuracies.append({'Fold': fold, 'Category': category, 'Accuracy': acc})
    # Add overall accuracy for the current fold to the list
    all_accuracies.append({'Fold': fold, 'Category': 'Overall', 'Accuracy': accuracy})

# Convert the list of dictionaries to a DataFrame
all_accuracies_df = pd.DataFrame(all_accuracies)

# Save the DataFrame to a CSV file
all_accuracies_df.to_csv('./result/org_results.csv', index=False)

# Print out the final DataFrame
print(all_accuracies_df)




Accuracy for model on fold 0: 83.15%

Accuracy for model on fold 1: 84.30%

    Fold                             Category  Accuracy
0      0        a photo of contemporary foyer  0.804124
1      0          a photo of standard kitchen  1.000000
2      0     a photo of contemporary bathroom  0.667910
3      0            a photo of standard foyer  0.868794
4      0         a photo of standard bathroom  0.994197
5      0  a photo of contemporary dining room  0.952632
6      0      a photo of standard living room  0.986486
7      0  a photo of contemporary living room  0.534050
8      0      a photo of contemporary kitchen  0.689655
9      0      a photo of standard dining room  1.000000
10     0                              Overall  0.831540
11     1         a photo of standard bathroom  0.988539
12     1  a photo of contemporary dining room  0.957871
13     1  a photo of contemporary living room  0.597701
14     1            a photo of standard foyer  0.845896
15     1      a photo of con

In [5]:
import numpy as np
import transformers
from datasets import load_dataset
import pathlib
from typing import Generator
from collections import defaultdict
import sys
import json
import os

# Assuming prepare_data_from_dataframe is defined elsewhere in your project

# Fine-tune base model setup
repo_id = "laion/CLIP-ViT-B-32-laion2B-s34B-b79K"
transformers.utils.logging.set_verbosity_error()

# Main training loop
for fold in range(5):  # Five folds
    for i, threshold in enumerate(np.arange(0.90, 1.00, 0.01), start=1):
        # Filename setup for training data at current threshold and fold
        train_json = f'train_fold_{fold}_thr_{int(threshold*100)}.json'
        test_json = f'val_fold_{fold}.json'  # Validation data for current fold

        # Output directory for the trained model
        output_folder = f"./workspace/output/laion-finetuned_v5e7_epoch10_fold{fold}_threshold{i}"
        print(f"Finetuning {repo_id} for fold {fold}, threshold > {threshold:.2f}, saving output to {output_folder}.")

        # Load dataset
        data_files = {'train': train_json, 'validation': test_json}
        dataset = load_dataset("json", data_files=data_files)
        print(f"First image: {dataset['validation'][0]['image']}, caption: '{dataset['validation'][0]['caption']}'")

        !python huggingface_finetune_clip.py \
            --output_dir {output_folder} --model_name_or_path {repo_id} \
            --train_file {train_json} \
            --validation_file {test_json} \
            --image_column image \
            --overwrite_output_dir=True \
            --max_seq_length=77 \
            --num_train_epochs=10 \
            --save_total_limit=5 \
            --caption_column caption \
            --remove_unused_columns=False \
            --do_train \
            --logging_strategy="epoch"\
            --per_device_train_batch_size=128 \
            --dataloader_drop_last=True\
            --learning_rate="1e-6" --warmup_steps="0" --weight_decay 0.1 
        print(f"--\nDONE. If it worked, trained data should be in {output_folder}\n")

Finetuning laion/CLIP-ViT-B-32-laion2B-s34B-b79K for fold 0, threshold > 0.90, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_fold0_threshold1.


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

First image: ../data/a photo of standard living room/IMG-C5471443_15.jpg, caption: 'a photo of standard living room'
Filter: 100%|███████████████████| 18410/18410 [00:00<00:00, 27724.21 examples/s]
Running tokenizer on train dataset: 100%|█| 18185/18185 [00:00<00:00, 19921.85 e
 does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)
  return forward_call(*args, **kwargs)
{'loss': 3.1128, 'learning_rate': 9e-07, 'epoch': 1.0}                          
{'loss': 2.9296, 'learning_rate': 8e-07, 'epoch': 2.0}                          
{'loss': 2.8839, 'learning_rate': 7e-07, 'epoch': 3.0}                          
{'loss': 2.8626, 'learning_rate': 6e-07, 'epoch': 4.0}                          
{'loss': 2.8445, 'learning_rate': 5e-07, 'epoch': 5.0}                          
{'loss': 2.8329, 'learning_rate': 4e-07, 'epoch': 6.0}                          
{'loss': 2.8247, 'learning_rate': 3e-07, 'epoch': 7.0}                          
{'los

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

First image: ../data/a photo of standard living room/IMG-C5471443_15.jpg, caption: 'a photo of standard living room'
Filter: 100%|███████████████████| 16981/16981 [00:00<00:00, 27929.24 examples/s]
Running tokenizer on train dataset: 100%|█| 16772/16772 [00:00<00:00, 19787.47 e
 does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)
  return forward_call(*args, **kwargs)
{'loss': 3.124, 'learning_rate': 9e-07, 'epoch': 1.0}                           
{'loss': 2.9336, 'learning_rate': 8e-07, 'epoch': 2.0}                          
{'loss': 2.8901, 'learning_rate': 7e-07, 'epoch': 3.0}                          
{'loss': 2.8663, 'learning_rate': 6e-07, 'epoch': 4.0}                          
{'loss': 2.8506, 'learning_rate': 5e-07, 'epoch': 5.0}                          
{'loss': 2.8384, 'learning_rate': 4e-07, 'epoch': 6.0}                          
{'loss': 2.8319, 'learning_rate': 3e-07, 'epoch': 7.0}                          
{'los

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

First image: ../data/a photo of standard living room/IMG-C5471443_15.jpg, caption: 'a photo of standard living room'
Filter: 100%|███████████████████| 15451/15451 [00:00<00:00, 28962.66 examples/s]
Running tokenizer on train dataset: 100%|█| 15258/15258 [00:00<00:00, 21247.67 e
 does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)
  return forward_call(*args, **kwargs)
{'loss': 3.1249, 'learning_rate': 9e-07, 'epoch': 1.0}                          
{'loss': 2.9372, 'learning_rate': 8e-07, 'epoch': 2.0}                          
{'loss': 2.8912, 'learning_rate': 7e-07, 'epoch': 3.0}                          
{'loss': 2.8686, 'learning_rate': 6e-07, 'epoch': 4.0}                          
{'loss': 2.8503, 'learning_rate': 5e-07, 'epoch': 5.0}                          
{'loss': 2.8397, 'learning_rate': 4e-07, 'epoch': 6.0}                          
{'loss': 2.8334, 'learning_rate': 3e-07, 'epoch': 7.0}                          
{'los

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

First image: ../data/a photo of standard living room/IMG-C5471443_15.jpg, caption: 'a photo of standard living room'
Filter: 100%|███████████████████| 13850/13850 [00:00<00:00, 29143.26 examples/s]
Running tokenizer on train dataset: 100%|█| 13679/13679 [00:00<00:00, 21199.23 e
 does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)
  return forward_call(*args, **kwargs)
{'loss': 3.1247, 'learning_rate': 9e-07, 'epoch': 1.0}                          
{'loss': 2.9428, 'learning_rate': 8e-07, 'epoch': 2.0}                          
{'loss': 2.8945, 'learning_rate': 7e-07, 'epoch': 3.0}                          
{'loss': 2.8685, 'learning_rate': 6e-07, 'epoch': 4.0}                          
{'loss': 2.8509, 'learning_rate': 5e-07, 'epoch': 5.0}                          
{'loss': 2.8402, 'learning_rate': 4e-07, 'epoch': 6.0}                          
{'loss': 2.836, 'learning_rate': 3e-07, 'epoch': 7.0}                           
{'los

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

First image: ../data/a photo of standard living room/IMG-C5471443_15.jpg, caption: 'a photo of standard living room'
Filter: 100%|███████████████████| 12273/12273 [00:00<00:00, 29537.34 examples/s]
Running tokenizer on train dataset: 100%|█| 12122/12122 [00:00<00:00, 21387.33 e
 does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)
  return forward_call(*args, **kwargs)
{'loss': 3.143, 'learning_rate': 9e-07, 'epoch': 1.0}                           
{'loss': 2.9466, 'learning_rate': 8e-07, 'epoch': 2.0}                          
{'loss': 2.9004, 'learning_rate': 7e-07, 'epoch': 3.0}                          
{'loss': 2.8732, 'learning_rate': 6e-07, 'epoch': 4.0}                          
{'loss': 2.8586, 'learning_rate': 5e-07, 'epoch': 5.0}                          
{'loss': 2.8484, 'learning_rate': 4e-07, 'epoch': 6.0}                          
{'loss': 2.841, 'learning_rate': 3e-07, 'epoch': 7.0}                           
{'los

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

First image: ../data/a photo of standard living room/IMG-C5471443_15.jpg, caption: 'a photo of standard living room'
Filter: 100%|███████████████████| 10547/10547 [00:00<00:00, 29168.34 examples/s]
Running tokenizer on train dataset: 100%|█| 10415/10415 [00:00<00:00, 21648.30 e
 does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)
  return forward_call(*args, **kwargs)
{'loss': 3.1485, 'learning_rate': 9e-07, 'epoch': 1.0}                          
{'loss': 2.9515, 'learning_rate': 8e-07, 'epoch': 2.0}                          
{'loss': 2.9065, 'learning_rate': 7e-07, 'epoch': 3.0}                          
{'loss': 2.8756, 'learning_rate': 6e-07, 'epoch': 4.0}                          
{'loss': 2.8588, 'learning_rate': 5e-07, 'epoch': 5.0}                          
{'loss': 2.8529, 'learning_rate': 4e-07, 'epoch': 6.0}                          
{'loss': 2.8444, 'learning_rate': 3e-07, 'epoch': 7.0}                          
{'los

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

First image: ../data/a photo of standard living room/IMG-C5471443_15.jpg, caption: 'a photo of standard living room'
Filter: 100%|█████████████████████| 8641/8641 [00:00<00:00, 28738.00 examples/s]
Running tokenizer on train dataset: 100%|█| 8534/8534 [00:00<00:00, 21546.58 exa
 does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)
  return forward_call(*args, **kwargs)
{'loss': 3.1592, 'learning_rate': 9e-07, 'epoch': 1.0}                          
{'loss': 2.9657, 'learning_rate': 8e-07, 'epoch': 2.0}                          
{'loss': 2.9132, 'learning_rate': 7e-07, 'epoch': 3.0}                          
{'loss': 2.8893, 'learning_rate': 6e-07, 'epoch': 4.0}                          
{'loss': 2.8663, 'learning_rate': 5e-07, 'epoch': 5.0}                          
{'loss': 2.8574, 'learning_rate': 4e-07, 'epoch': 6.0}                          
{'loss': 2.8519, 'learning_rate': 3e-07, 'epoch': 7.0}                          
{'los

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

First image: ../data/a photo of standard living room/IMG-C5471443_15.jpg, caption: 'a photo of standard living room'
Filter: 100%|█████████████████████| 6532/6532 [00:00<00:00, 28798.38 examples/s]
Running tokenizer on train dataset: 100%|█| 6445/6445 [00:00<00:00, 21433.02 exa
 does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)
  return forward_call(*args, **kwargs)
{'loss': 3.1842, 'learning_rate': 9e-07, 'epoch': 1.0}                          
{'loss': 2.9829, 'learning_rate': 8e-07, 'epoch': 2.0}                          
{'loss': 2.9208, 'learning_rate': 7e-07, 'epoch': 3.0}                          
{'loss': 2.9018, 'learning_rate': 6e-07, 'epoch': 4.0}                          
{'loss': 2.8811, 'learning_rate': 5e-07, 'epoch': 5.0}                          
{'loss': 2.8685, 'learning_rate': 4e-07, 'epoch': 6.0}                          
{'loss': 2.8624, 'learning_rate': 3e-07, 'epoch': 7.0}                          
{'los

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

First image: ../data/a photo of standard living room/IMG-C5471443_15.jpg, caption: 'a photo of standard living room'
Filter: 100%|█████████████████████| 4144/4144 [00:00<00:00, 27507.68 examples/s]
Running tokenizer on train dataset: 100%|█| 4101/4101 [00:00<00:00, 21094.17 exa
 does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)
  return forward_call(*args, **kwargs)
{'loss': 3.2523, 'learning_rate': 9e-07, 'epoch': 1.0}                          
{'loss': 3.037, 'learning_rate': 8e-07, 'epoch': 2.0}                           
{'loss': 2.971, 'learning_rate': 7e-07, 'epoch': 3.0}                           
{'loss': 2.9345, 'learning_rate': 6e-07, 'epoch': 4.0}                          
{'loss': 2.9073, 'learning_rate': 5e-07, 'epoch': 5.0}                          
{'loss': 2.8994, 'learning_rate': 4e-07, 'epoch': 6.0}                          
{'loss': 2.8832, 'learning_rate': 3e-07, 'epoch': 7.0}                          
{'los

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

First image: ../data/a photo of standard living room/IMG-C5471443_15.jpg, caption: 'a photo of standard living room'
Filter: 100%|█████████████████████| 1382/1382 [00:00<00:00, 24913.73 examples/s]
Running tokenizer on train dataset: 100%|█| 1366/1366 [00:00<00:00, 19082.86 exa
 does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)
  return forward_call(*args, **kwargs)
{'loss': 3.3162, 'learning_rate': 9e-07, 'epoch': 1.0}                          
{'loss': 3.1668, 'learning_rate': 8e-07, 'epoch': 2.0}                          
{'loss': 3.0819, 'learning_rate': 7e-07, 'epoch': 3.0}                          
{'loss': 3.0353, 'learning_rate': 6e-07, 'epoch': 4.0}                          
{'loss': 2.9945, 'learning_rate': 5e-07, 'epoch': 5.0}                          
{'loss': 2.9998, 'learning_rate': 4e-07, 'epoch': 6.0}                          
{'loss': 2.9528, 'learning_rate': 3e-07, 'epoch': 7.0}                          
{'los

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

First image: ../data/a photo of contemporary bathroom/IMG-C5471425_6.jpg, caption: 'a photo of contemporary bathroom'
Filter: 100%|███████████████████| 18410/18410 [00:00<00:00, 29787.21 examples/s]
Running tokenizer on train dataset: 100%|█| 18195/18195 [00:00<00:00, 21378.62 e
 does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)
  return forward_call(*args, **kwargs)
{'loss': 3.1111, 'learning_rate': 9e-07, 'epoch': 1.0}                          
{'loss': 2.9297, 'learning_rate': 8e-07, 'epoch': 2.0}                          
{'loss': 2.8829, 'learning_rate': 7e-07, 'epoch': 3.0}                          
{'loss': 2.8577, 'learning_rate': 6e-07, 'epoch': 4.0}                          
{'loss': 2.842, 'learning_rate': 5e-07, 'epoch': 5.0}                           
{'loss': 2.8301, 'learning_rate': 4e-07, 'epoch': 6.0}                          
{'loss': 2.8235, 'learning_rate': 3e-07, 'epoch': 7.0}                          
{'lo

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

First image: ../data/a photo of contemporary bathroom/IMG-C5471425_6.jpg, caption: 'a photo of contemporary bathroom'
Filter: 100%|███████████████████| 16909/16909 [00:00<00:00, 29673.49 examples/s]
Running tokenizer on train dataset: 100%|█| 16715/16715 [00:00<00:00, 21582.93 e
 does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)
  return forward_call(*args, **kwargs)
{'loss': 3.1235, 'learning_rate': 9e-07, 'epoch': 1.0}                          
{'loss': 2.9345, 'learning_rate': 8e-07, 'epoch': 2.0}                          
{'loss': 2.8845, 'learning_rate': 7e-07, 'epoch': 3.0}                          
{'loss': 2.86, 'learning_rate': 6e-07, 'epoch': 4.0}                            
{'loss': 2.843, 'learning_rate': 5e-07, 'epoch': 5.0}                           
{'loss': 2.8351, 'learning_rate': 4e-07, 'epoch': 6.0}                          
{'loss': 2.8282, 'learning_rate': 3e-07, 'epoch': 7.0}                          
{'lo

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

First image: ../data/a photo of contemporary bathroom/IMG-C5471425_6.jpg, caption: 'a photo of contemporary bathroom'
Filter: 100%|███████████████████| 15411/15411 [00:00<00:00, 29610.96 examples/s]
Running tokenizer on train dataset: 100%|█| 15237/15237 [00:00<00:00, 21476.83 e
 does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)
  return forward_call(*args, **kwargs)
{'loss': 3.1232, 'learning_rate': 9e-07, 'epoch': 1.0}                          
{'loss': 2.9345, 'learning_rate': 8e-07, 'epoch': 2.0}                          
{'loss': 2.8848, 'learning_rate': 7e-07, 'epoch': 3.0}                          
{'loss': 2.8607, 'learning_rate': 6e-07, 'epoch': 4.0}                          
{'loss': 2.8491, 'learning_rate': 5e-07, 'epoch': 5.0}                          
{'loss': 2.8364, 'learning_rate': 4e-07, 'epoch': 6.0}                          
{'loss': 2.828, 'learning_rate': 3e-07, 'epoch': 7.0}                           
{'lo

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

First image: ../data/a photo of contemporary bathroom/IMG-C5471425_6.jpg, caption: 'a photo of contemporary bathroom'
Filter: 100%|███████████████████| 13823/13823 [00:00<00:00, 29426.80 examples/s]
Running tokenizer on train dataset: 100%|█| 13667/13667 [00:00<00:00, 21484.43 e
 does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)
  return forward_call(*args, **kwargs)
{'loss': 3.1317, 'learning_rate': 9e-07, 'epoch': 1.0}                          
{'loss': 2.9362, 'learning_rate': 8e-07, 'epoch': 2.0}                          
{'loss': 2.8892, 'learning_rate': 7e-07, 'epoch': 3.0}                          
{'loss': 2.8612, 'learning_rate': 6e-07, 'epoch': 4.0}                          
{'loss': 2.8485, 'learning_rate': 5e-07, 'epoch': 5.0}                          
{'loss': 2.8351, 'learning_rate': 4e-07, 'epoch': 6.0}                          
{'loss': 2.8294, 'learning_rate': 3e-07, 'epoch': 7.0}                          
{'lo

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

First image: ../data/a photo of contemporary bathroom/IMG-C5471425_6.jpg, caption: 'a photo of contemporary bathroom'
Filter: 100%|███████████████████| 12236/12236 [00:00<00:00, 29435.37 examples/s]
Running tokenizer on train dataset: 100%|█| 12097/12097 [00:00<00:00, 21528.46 e
 does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)
  return forward_call(*args, **kwargs)
{'loss': 3.1422, 'learning_rate': 9e-07, 'epoch': 1.0}                          
{'loss': 2.9471, 'learning_rate': 8e-07, 'epoch': 2.0}                          
{'loss': 2.8954, 'learning_rate': 7e-07, 'epoch': 3.0}                          
{'loss': 2.8699, 'learning_rate': 6e-07, 'epoch': 4.0}                          
{'loss': 2.8552, 'learning_rate': 5e-07, 'epoch': 5.0}                          
{'loss': 2.8445, 'learning_rate': 4e-07, 'epoch': 6.0}                          
{'loss': 2.839, 'learning_rate': 3e-07, 'epoch': 7.0}                           
{'lo

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

First image: ../data/a photo of contemporary bathroom/IMG-C5471425_6.jpg, caption: 'a photo of contemporary bathroom'
Filter: 100%|███████████████████| 10506/10506 [00:00<00:00, 29337.22 examples/s]
Running tokenizer on train dataset: 100%|█| 10385/10385 [00:00<00:00, 21456.00 e
 does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)
  return forward_call(*args, **kwargs)
{'loss': 3.144, 'learning_rate': 9e-07, 'epoch': 1.0}                           
{'loss': 2.9529, 'learning_rate': 8e-07, 'epoch': 2.0}                          
{'loss': 2.9006, 'learning_rate': 7e-07, 'epoch': 3.0}                          
{'loss': 2.8747, 'learning_rate': 6e-07, 'epoch': 4.0}                          
{'loss': 2.8614, 'learning_rate': 5e-07, 'epoch': 5.0}                          
{'loss': 2.845, 'learning_rate': 4e-07, 'epoch': 6.0}                           
{'loss': 2.8385, 'learning_rate': 3e-07, 'epoch': 7.0}                          
{'lo

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

First image: ../data/a photo of contemporary bathroom/IMG-C5471425_6.jpg, caption: 'a photo of contemporary bathroom'
Filter: 100%|█████████████████████| 8620/8620 [00:00<00:00, 29092.75 examples/s]
Running tokenizer on train dataset: 100%|█| 8522/8522 [00:00<00:00, 21526.17 exa
 does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)
  return forward_call(*args, **kwargs)
{'loss': 3.1682, 'learning_rate': 9e-07, 'epoch': 1.0}                          
{'loss': 2.9581, 'learning_rate': 8e-07, 'epoch': 2.0}                          
{'loss': 2.9064, 'learning_rate': 7e-07, 'epoch': 3.0}                          
{'loss': 2.88, 'learning_rate': 6e-07, 'epoch': 4.0}                            
{'loss': 2.8666, 'learning_rate': 5e-07, 'epoch': 5.0}                          
{'loss': 2.8547, 'learning_rate': 4e-07, 'epoch': 6.0}                          
{'loss': 2.8433, 'learning_rate': 3e-07, 'epoch': 7.0}                          
{'lo

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

First image: ../data/a photo of contemporary bathroom/IMG-C5471425_6.jpg, caption: 'a photo of contemporary bathroom'
Filter: 100%|█████████████████████| 6483/6483 [00:00<00:00, 28558.51 examples/s]
Running tokenizer on train dataset: 100%|█| 6407/6407 [00:00<00:00, 21230.77 exa
 does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)
  return forward_call(*args, **kwargs)
{'loss': 3.1928, 'learning_rate': 9e-07, 'epoch': 1.0}                          
{'loss': 2.988, 'learning_rate': 8e-07, 'epoch': 2.0}                           
{'loss': 2.9295, 'learning_rate': 7e-07, 'epoch': 3.0}                          
{'loss': 2.8963, 'learning_rate': 6e-07, 'epoch': 4.0}                          
{'loss': 2.8795, 'learning_rate': 5e-07, 'epoch': 5.0}                          
{'loss': 2.8679, 'learning_rate': 4e-07, 'epoch': 6.0}                          
{'loss': 2.8546, 'learning_rate': 3e-07, 'epoch': 7.0}                          
{'lo

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

First image: ../data/a photo of contemporary bathroom/IMG-C5471425_6.jpg, caption: 'a photo of contemporary bathroom'
Filter: 100%|█████████████████████| 4130/4130 [00:00<00:00, 27951.33 examples/s]
Running tokenizer on train dataset: 100%|█| 4098/4098 [00:00<00:00, 21248.01 exa
 does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)
  return forward_call(*args, **kwargs)
{'loss': 3.2369, 'learning_rate': 9e-07, 'epoch': 1.0}                          
{'loss': 3.0299, 'learning_rate': 8e-07, 'epoch': 2.0}                          
{'loss': 2.9588, 'learning_rate': 7e-07, 'epoch': 3.0}                          
{'loss': 2.9176, 'learning_rate': 6e-07, 'epoch': 4.0}                          
{'loss': 2.9002, 'learning_rate': 5e-07, 'epoch': 5.0}                          
{'loss': 2.8889, 'learning_rate': 4e-07, 'epoch': 6.0}                          
{'loss': 2.8884, 'learning_rate': 3e-07, 'epoch': 7.0}                          
{'lo

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

First image: ../data/a photo of contemporary bathroom/IMG-C5471425_6.jpg, caption: 'a photo of contemporary bathroom'
Filter: 100%|█████████████████████| 1366/1366 [00:00<00:00, 24988.85 examples/s]
Running tokenizer on train dataset: 100%|█| 1357/1357 [00:00<00:00, 20474.15 exa
 does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)
  return forward_call(*args, **kwargs)
{'loss': 3.2915, 'learning_rate': 9e-07, 'epoch': 1.0}                          
{'loss': 3.1362, 'learning_rate': 8e-07, 'epoch': 2.0}                          
{'loss': 3.0763, 'learning_rate': 7e-07, 'epoch': 3.0}                          
{'loss': 2.9976, 'learning_rate': 6e-07, 'epoch': 4.0}                          
{'loss': 2.973, 'learning_rate': 5e-07, 'epoch': 5.0}                           
{'loss': 2.9411, 'learning_rate': 4e-07, 'epoch': 6.0}                          
{'loss': 2.9475, 'learning_rate': 3e-07, 'epoch': 7.0}                          
{'lo

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

First image: ../data/a photo of standard bathroom/IMG-C5471456_18.jpg, caption: 'a photo of standard bathroom'
Filter: 100%|███████████████████| 18410/18410 [00:00<00:00, 29189.51 examples/s]
Running tokenizer on train dataset: 100%|█| 18200/18200 [00:00<00:00, 21406.86 e
 does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)
  return forward_call(*args, **kwargs)
{'loss': 3.1179, 'learning_rate': 9e-07, 'epoch': 1.0}                          
{'loss': 2.9326, 'learning_rate': 8e-07, 'epoch': 2.0}                          
{'loss': 2.8848, 'learning_rate': 7e-07, 'epoch': 3.0}                          
{'loss': 2.8607, 'learning_rate': 6e-07, 'epoch': 4.0}                          
{'loss': 2.8437, 'learning_rate': 5e-07, 'epoch': 5.0}                          
{'loss': 2.8292, 'learning_rate': 4e-07, 'epoch': 6.0}                          
{'loss': 2.8239, 'learning_rate': 3e-07, 'epoch': 7.0}                          
{'loss': 2.

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

First image: ../data/a photo of standard bathroom/IMG-C5471456_18.jpg, caption: 'a photo of standard bathroom'
Filter: 100%|███████████████████| 16939/16939 [00:00<00:00, 29678.72 examples/s]
Running tokenizer on train dataset: 100%|█| 16749/16749 [00:00<00:00, 21520.55 e
 does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)
  return forward_call(*args, **kwargs)
{'loss': 3.1216, 'learning_rate': 9e-07, 'epoch': 1.0}                          
{'loss': 2.9348, 'learning_rate': 8e-07, 'epoch': 2.0}                          
{'loss': 2.889, 'learning_rate': 7e-07, 'epoch': 3.0}                           
{'loss': 2.8623, 'learning_rate': 6e-07, 'epoch': 4.0}                          
{'loss': 2.8479, 'learning_rate': 5e-07, 'epoch': 5.0}                          
{'loss': 2.8334, 'learning_rate': 4e-07, 'epoch': 6.0}                          
{'loss': 2.8251, 'learning_rate': 3e-07, 'epoch': 7.0}                          
{'loss': 2.

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

First image: ../data/a photo of standard bathroom/IMG-C5471456_18.jpg, caption: 'a photo of standard bathroom'
Filter: 100%|███████████████████| 15439/15439 [00:00<00:00, 28950.30 examples/s]
Running tokenizer on train dataset: 100%|█| 15263/15263 [00:00<00:00, 21450.38 e
 does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)
  return forward_call(*args, **kwargs)
{'loss': 3.1272, 'learning_rate': 9e-07, 'epoch': 1.0}                          
{'loss': 2.936, 'learning_rate': 8e-07, 'epoch': 2.0}                           
{'loss': 2.8881, 'learning_rate': 7e-07, 'epoch': 3.0}                          
{'loss': 2.8618, 'learning_rate': 6e-07, 'epoch': 4.0}                          
{'loss': 2.8478, 'learning_rate': 5e-07, 'epoch': 5.0}                          
{'loss': 2.8358, 'learning_rate': 4e-07, 'epoch': 6.0}                          
{'loss': 2.8293, 'learning_rate': 3e-07, 'epoch': 7.0}                          
{'loss': 2.

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

First image: ../data/a photo of standard bathroom/IMG-C5471456_18.jpg, caption: 'a photo of standard bathroom'
Filter: 100%|███████████████████| 13827/13827 [00:00<00:00, 28941.77 examples/s]
Running tokenizer on train dataset: 100%|█| 13671/13671 [00:00<00:00, 21331.84 e
 does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)
  return forward_call(*args, **kwargs)
{'loss': 3.1357, 'learning_rate': 9e-07, 'epoch': 1.0}                          
{'loss': 2.9388, 'learning_rate': 8e-07, 'epoch': 2.0}                          
{'loss': 2.8912, 'learning_rate': 7e-07, 'epoch': 3.0}                          
{'loss': 2.864, 'learning_rate': 6e-07, 'epoch': 4.0}                           
{'loss': 2.8456, 'learning_rate': 5e-07, 'epoch': 5.0}                          
{'loss': 2.8359, 'learning_rate': 4e-07, 'epoch': 6.0}                          
{'loss': 2.8279, 'learning_rate': 3e-07, 'epoch': 7.0}                          
{'loss': 2.

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

First image: ../data/a photo of standard bathroom/IMG-C5471456_18.jpg, caption: 'a photo of standard bathroom'
Filter: 100%|███████████████████| 12224/12224 [00:00<00:00, 28843.71 examples/s]
Running tokenizer on train dataset: 100%|█| 12087/12087 [00:00<00:00, 20386.48 e
 does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)
  return forward_call(*args, **kwargs)
{'loss': 3.1495, 'learning_rate': 9e-07, 'epoch': 1.0}                          
{'loss': 2.9512, 'learning_rate': 8e-07, 'epoch': 2.0}                          
{'loss': 2.8971, 'learning_rate': 7e-07, 'epoch': 3.0}                          
{'loss': 2.872, 'learning_rate': 6e-07, 'epoch': 4.0}                           
{'loss': 2.8551, 'learning_rate': 5e-07, 'epoch': 5.0}                          
{'loss': 2.8439, 'learning_rate': 4e-07, 'epoch': 6.0}                          
{'loss': 2.8354, 'learning_rate': 3e-07, 'epoch': 7.0}                          
{'loss': 2.

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

First image: ../data/a photo of standard bathroom/IMG-C5471456_18.jpg, caption: 'a photo of standard bathroom'
Filter: 100%|███████████████████| 10488/10488 [00:00<00:00, 28834.60 examples/s]
Running tokenizer on train dataset: 100%|█| 10369/10369 [00:00<00:00, 21240.58 e
 does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)
  return forward_call(*args, **kwargs)
{'loss': 3.155, 'learning_rate': 9e-07, 'epoch': 1.0}                           
{'loss': 2.9565, 'learning_rate': 8e-07, 'epoch': 2.0}                          
{'loss': 2.9022, 'learning_rate': 7e-07, 'epoch': 3.0}                          
{'loss': 2.877, 'learning_rate': 6e-07, 'epoch': 4.0}                           
{'loss': 2.8574, 'learning_rate': 5e-07, 'epoch': 5.0}                          
{'loss': 2.8497, 'learning_rate': 4e-07, 'epoch': 6.0}                          
{'loss': 2.8423, 'learning_rate': 3e-07, 'epoch': 7.0}                          
{'loss': 2.

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

First image: ../data/a photo of standard bathroom/IMG-C5471456_18.jpg, caption: 'a photo of standard bathroom'
Filter: 100%|█████████████████████| 8586/8586 [00:00<00:00, 27862.04 examples/s]
Running tokenizer on train dataset: 100%|█| 8492/8492 [00:00<00:00, 20458.35 exa
 does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)
  return forward_call(*args, **kwargs)
{'loss': 3.1604, 'learning_rate': 9e-07, 'epoch': 1.0}                          
{'loss': 2.9659, 'learning_rate': 8e-07, 'epoch': 2.0}                          
{'loss': 2.9069, 'learning_rate': 7e-07, 'epoch': 3.0}                          
{'loss': 2.8841, 'learning_rate': 6e-07, 'epoch': 4.0}                          
{'loss': 2.8648, 'learning_rate': 5e-07, 'epoch': 5.0}                          
{'loss': 2.8528, 'learning_rate': 4e-07, 'epoch': 6.0}                          
{'loss': 2.843, 'learning_rate': 3e-07, 'epoch': 7.0}                           
{'loss': 2.

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

First image: ../data/a photo of standard bathroom/IMG-C5471456_18.jpg, caption: 'a photo of standard bathroom'
Filter: 100%|█████████████████████| 6497/6497 [00:00<00:00, 27474.72 examples/s]
Running tokenizer on train dataset: 100%|█| 6425/6425 [00:00<00:00, 19865.38 exa
 does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)
  return forward_call(*args, **kwargs)
{'loss': 3.186, 'learning_rate': 9e-07, 'epoch': 1.0}                           
{'loss': 2.981, 'learning_rate': 8e-07, 'epoch': 2.0}                           
{'loss': 2.9262, 'learning_rate': 7e-07, 'epoch': 3.0}                          
{'loss': 2.8907, 'learning_rate': 6e-07, 'epoch': 4.0}                          
{'loss': 2.8815, 'learning_rate': 5e-07, 'epoch': 5.0}                          
{'loss': 2.8675, 'learning_rate': 4e-07, 'epoch': 6.0}                          
{'loss': 2.8573, 'learning_rate': 3e-07, 'epoch': 7.0}                          
{'loss': 2.

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

First image: ../data/a photo of standard bathroom/IMG-C5471456_18.jpg, caption: 'a photo of standard bathroom'
Filter: 100%|█████████████████████| 4151/4151 [00:00<00:00, 25735.84 examples/s]
Running tokenizer on train dataset: 100%|█| 4118/4118 [00:00<00:00, 18134.09 exa
 does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)
  return forward_call(*args, **kwargs)
{'loss': 3.2318, 'learning_rate': 9e-07, 'epoch': 1.0}                          
{'loss': 3.0349, 'learning_rate': 8e-07, 'epoch': 2.0}                          
{'loss': 2.9728, 'learning_rate': 7e-07, 'epoch': 3.0}                          
{'loss': 2.9344, 'learning_rate': 6e-07, 'epoch': 4.0}                          
{'loss': 2.9138, 'learning_rate': 5e-07, 'epoch': 5.0}                          
{'loss': 2.8937, 'learning_rate': 4e-07, 'epoch': 6.0}                          
{'loss': 2.8847, 'learning_rate': 3e-07, 'epoch': 7.0}                          
{'loss': 2.

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

First image: ../data/a photo of standard bathroom/IMG-C5471456_18.jpg, caption: 'a photo of standard bathroom'
Filter: 100%|█████████████████████| 1357/1357 [00:00<00:00, 25011.41 examples/s]
Running tokenizer on train dataset: 100%|█| 1346/1346 [00:00<00:00, 19520.80 exa
 does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)
  return forward_call(*args, **kwargs)
{'loss': 3.3296, 'learning_rate': 9e-07, 'epoch': 1.0}                          
{'loss': 3.1602, 'learning_rate': 8e-07, 'epoch': 2.0}                          
{'loss': 3.0947, 'learning_rate': 7e-07, 'epoch': 3.0}                          
{'loss': 3.0176, 'learning_rate': 6e-07, 'epoch': 4.0}                          
{'loss': 2.9972, 'learning_rate': 5e-07, 'epoch': 5.0}                          
{'loss': 2.9861, 'learning_rate': 4e-07, 'epoch': 6.0}                          
{'loss': 2.9694, 'learning_rate': 3e-07, 'epoch': 7.0}                          
{'loss': 2.

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

First image: ../data/a photo of contemporary foyer/IMG-C5471443_13.jpg, caption: 'a photo of contemporary foyer'
Filter: 100%|███████████████████| 18411/18411 [00:00<00:00, 27826.86 examples/s]
Running tokenizer on train dataset: 100%|█| 18189/18189 [00:00<00:00, 20605.62 e
 does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)
  return forward_call(*args, **kwargs)
{'loss': 3.1125, 'learning_rate': 9e-07, 'epoch': 1.0}                          
{'loss': 2.9313, 'learning_rate': 8e-07, 'epoch': 2.0}                          
{'loss': 2.8839, 'learning_rate': 7e-07, 'epoch': 3.0}                          
{'loss': 2.8626, 'learning_rate': 6e-07, 'epoch': 4.0}                          
{'loss': 2.8437, 'learning_rate': 5e-07, 'epoch': 5.0}                          
{'loss': 2.8323, 'learning_rate': 4e-07, 'epoch': 6.0}                          
{'loss': 2.8251, 'learning_rate': 3e-07, 'epoch': 7.0}                          
{'loss': 

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

First image: ../data/a photo of contemporary foyer/IMG-C5471443_13.jpg, caption: 'a photo of contemporary foyer'
Filter: 100%|███████████████████| 16936/16936 [00:00<00:00, 28639.94 examples/s]
Running tokenizer on train dataset: 100%|█| 16736/16736 [00:00<00:00, 20420.36 e
 does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)
  return forward_call(*args, **kwargs)
{'loss': 3.1217, 'learning_rate': 9e-07, 'epoch': 1.0}                          
{'loss': 2.935, 'learning_rate': 8e-07, 'epoch': 2.0}                           
{'loss': 2.8864, 'learning_rate': 7e-07, 'epoch': 3.0}                          
{'loss': 2.8654, 'learning_rate': 6e-07, 'epoch': 4.0}                          
{'loss': 2.8527, 'learning_rate': 5e-07, 'epoch': 5.0}                          
{'loss': 2.8378, 'learning_rate': 4e-07, 'epoch': 6.0}                          
{'loss': 2.8302, 'learning_rate': 3e-07, 'epoch': 7.0}                          
{'loss': 

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

First image: ../data/a photo of contemporary foyer/IMG-C5471443_13.jpg, caption: 'a photo of contemporary foyer'
Filter: 100%|███████████████████| 15422/15422 [00:00<00:00, 29274.55 examples/s]
Running tokenizer on train dataset: 100%|█| 15240/15240 [00:00<00:00, 20987.70 e
 does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)
  return forward_call(*args, **kwargs)
{'loss': 3.1276, 'learning_rate': 9e-07, 'epoch': 1.0}                          
{'loss': 2.9378, 'learning_rate': 8e-07, 'epoch': 2.0}                          
{'loss': 2.8892, 'learning_rate': 7e-07, 'epoch': 3.0}                          
{'loss': 2.8646, 'learning_rate': 6e-07, 'epoch': 4.0}                          
{'loss': 2.8497, 'learning_rate': 5e-07, 'epoch': 5.0}                          
{'loss': 2.8379, 'learning_rate': 4e-07, 'epoch': 6.0}                          
{'loss': 2.8334, 'learning_rate': 3e-07, 'epoch': 7.0}                          
{'loss': 

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

First image: ../data/a photo of contemporary foyer/IMG-C5471443_13.jpg, caption: 'a photo of contemporary foyer'
Filter: 100%|███████████████████| 13849/13849 [00:00<00:00, 28925.33 examples/s]
Running tokenizer on train dataset: 100%|█| 13688/13688 [00:00<00:00, 21104.87 e
 does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)
  return forward_call(*args, **kwargs)
{'loss': 3.1344, 'learning_rate': 9e-07, 'epoch': 1.0}                          
{'loss': 2.9381, 'learning_rate': 8e-07, 'epoch': 2.0}                          
{'loss': 2.8884, 'learning_rate': 7e-07, 'epoch': 3.0}                          
{'loss': 2.8674, 'learning_rate': 6e-07, 'epoch': 4.0}                          
{'loss': 2.8484, 'learning_rate': 5e-07, 'epoch': 5.0}                          
{'loss': 2.8362, 'learning_rate': 4e-07, 'epoch': 6.0}                          
{'loss': 2.8305, 'learning_rate': 3e-07, 'epoch': 7.0}                          
{'loss': 

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

First image: ../data/a photo of contemporary foyer/IMG-C5471443_13.jpg, caption: 'a photo of contemporary foyer'
Filter: 100%|███████████████████| 12276/12276 [00:00<00:00, 29087.75 examples/s]
Running tokenizer on train dataset: 100%|█| 12136/12136 [00:00<00:00, 21124.55 e
 does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)
  return forward_call(*args, **kwargs)
{'loss': 3.1427, 'learning_rate': 9e-07, 'epoch': 1.0}                          
{'loss': 2.954, 'learning_rate': 8e-07, 'epoch': 2.0}                           
{'loss': 2.906, 'learning_rate': 7e-07, 'epoch': 3.0}                           
{'loss': 2.8737, 'learning_rate': 6e-07, 'epoch': 4.0}                          
{'loss': 2.8611, 'learning_rate': 5e-07, 'epoch': 5.0}                          
{'loss': 2.8473, 'learning_rate': 4e-07, 'epoch': 6.0}                          
{'loss': 2.845, 'learning_rate': 3e-07, 'epoch': 7.0}                           
{'loss': 

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

First image: ../data/a photo of contemporary foyer/IMG-C5471443_13.jpg, caption: 'a photo of contemporary foyer'
Filter: 100%|███████████████████| 10542/10542 [00:00<00:00, 28562.65 examples/s]
Running tokenizer on train dataset: 100%|█| 10420/10420 [00:00<00:00, 21241.45 e
 does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)
  return forward_call(*args, **kwargs)
{'loss': 3.1515, 'learning_rate': 9e-07, 'epoch': 1.0}                          
{'loss': 2.958, 'learning_rate': 8e-07, 'epoch': 2.0}                           
{'loss': 2.9053, 'learning_rate': 7e-07, 'epoch': 3.0}                          
{'loss': 2.8771, 'learning_rate': 6e-07, 'epoch': 4.0}                          
{'loss': 2.8623, 'learning_rate': 5e-07, 'epoch': 5.0}                          
{'loss': 2.8556, 'learning_rate': 4e-07, 'epoch': 6.0}                          
{'loss': 2.8463, 'learning_rate': 3e-07, 'epoch': 7.0}                          
{'loss': 

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

First image: ../data/a photo of contemporary foyer/IMG-C5471443_13.jpg, caption: 'a photo of contemporary foyer'
Filter: 100%|█████████████████████| 8644/8644 [00:00<00:00, 28711.30 examples/s]
Running tokenizer on train dataset: 100%|█| 8546/8546 [00:00<00:00, 21136.89 exa
 does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)
  return forward_call(*args, **kwargs)
{'loss': 3.1591, 'learning_rate': 9e-07, 'epoch': 1.0}                          
{'loss': 2.9654, 'learning_rate': 8e-07, 'epoch': 2.0}                          
{'loss': 2.9102, 'learning_rate': 7e-07, 'epoch': 3.0}                          
{'loss': 2.8858, 'learning_rate': 6e-07, 'epoch': 4.0}                          
{'loss': 2.8679, 'learning_rate': 5e-07, 'epoch': 5.0}                          
{'loss': 2.8606, 'learning_rate': 4e-07, 'epoch': 6.0}                          
{'loss': 2.8502, 'learning_rate': 3e-07, 'epoch': 7.0}                          
{'loss': 

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

First image: ../data/a photo of contemporary foyer/IMG-C5471443_13.jpg, caption: 'a photo of contemporary foyer'
Filter: 100%|█████████████████████| 6509/6509 [00:00<00:00, 28510.06 examples/s]
Running tokenizer on train dataset: 100%|█| 6434/6434 [00:00<00:00, 21116.37 exa
 does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)
  return forward_call(*args, **kwargs)
{'loss': 3.1949, 'learning_rate': 9e-07, 'epoch': 1.0}                          
{'loss': 2.9889, 'learning_rate': 8e-07, 'epoch': 2.0}                          
{'loss': 2.9326, 'learning_rate': 7e-07, 'epoch': 3.0}                          
{'loss': 2.9016, 'learning_rate': 6e-07, 'epoch': 4.0}                          
{'loss': 2.8828, 'learning_rate': 5e-07, 'epoch': 5.0}                          
{'loss': 2.8768, 'learning_rate': 4e-07, 'epoch': 6.0}                          
{'loss': 2.8647, 'learning_rate': 3e-07, 'epoch': 7.0}                          
{'loss': 

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

First image: ../data/a photo of contemporary foyer/IMG-C5471443_13.jpg, caption: 'a photo of contemporary foyer'
Filter: 100%|█████████████████████| 4145/4145 [00:00<00:00, 27460.08 examples/s]
Running tokenizer on train dataset: 100%|█| 4104/4104 [00:00<00:00, 21182.78 exa
 does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)
  return forward_call(*args, **kwargs)
{'loss': 3.2377, 'learning_rate': 9e-07, 'epoch': 1.0}                          
{'loss': 3.03, 'learning_rate': 8e-07, 'epoch': 2.0}                            
{'loss': 2.9677, 'learning_rate': 7e-07, 'epoch': 3.0}                          
{'loss': 2.923, 'learning_rate': 6e-07, 'epoch': 4.0}                           
{'loss': 2.8979, 'learning_rate': 5e-07, 'epoch': 5.0}                          
{'loss': 2.8889, 'learning_rate': 4e-07, 'epoch': 6.0}                          
{'loss': 2.8798, 'learning_rate': 3e-07, 'epoch': 7.0}                          
{'loss': 

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

First image: ../data/a photo of contemporary foyer/IMG-C5471443_13.jpg, caption: 'a photo of contemporary foyer'
Filter: 100%|█████████████████████| 1384/1384 [00:00<00:00, 25025.29 examples/s]
Running tokenizer on train dataset: 100%|█| 1369/1369 [00:00<00:00, 20654.61 exa
 does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)
  return forward_call(*args, **kwargs)
{'loss': 3.3216, 'learning_rate': 9e-07, 'epoch': 1.0}                          
{'loss': 3.164, 'learning_rate': 8e-07, 'epoch': 2.0}                           
{'loss': 3.0684, 'learning_rate': 7e-07, 'epoch': 3.0}                          
{'loss': 3.0369, 'learning_rate': 6e-07, 'epoch': 4.0}                          
{'loss': 2.9836, 'learning_rate': 5e-07, 'epoch': 5.0}                          
{'loss': 2.975, 'learning_rate': 4e-07, 'epoch': 6.0}                           
{'loss': 2.9718, 'learning_rate': 3e-07, 'epoch': 7.0}                          
{'loss': 

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

First image: ../data/a photo of standard bathroom/IMG-C5471456_22.jpg, caption: 'a photo of standard bathroom'
Filter: 100%|███████████████████| 18411/18411 [00:00<00:00, 28710.80 examples/s]
Running tokenizer on train dataset: 100%|█| 18191/18191 [00:00<00:00, 21283.84 e
 does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)
  return forward_call(*args, **kwargs)
{'loss': 3.1142, 'learning_rate': 9e-07, 'epoch': 1.0}                          
{'loss': 2.9305, 'learning_rate': 8e-07, 'epoch': 2.0}                          
{'loss': 2.8865, 'learning_rate': 7e-07, 'epoch': 3.0}                          
{'loss': 2.8604, 'learning_rate': 6e-07, 'epoch': 4.0}                          
{'loss': 2.8448, 'learning_rate': 5e-07, 'epoch': 5.0}                          
{'loss': 2.834, 'learning_rate': 4e-07, 'epoch': 6.0}                           
{'loss': 2.8264, 'learning_rate': 3e-07, 'epoch': 7.0}                          
{'loss': 2.

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

First image: ../data/a photo of standard bathroom/IMG-C5471456_22.jpg, caption: 'a photo of standard bathroom'
Filter: 100%|███████████████████| 16919/16919 [00:00<00:00, 29294.57 examples/s]
Running tokenizer on train dataset: 100%|█| 16720/16720 [00:00<00:00, 21354.41 e
 does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)
  return forward_call(*args, **kwargs)
{'loss': 3.1168, 'learning_rate': 9e-07, 'epoch': 1.0}                          
{'loss': 2.935, 'learning_rate': 8e-07, 'epoch': 2.0}                           
{'loss': 2.8881, 'learning_rate': 7e-07, 'epoch': 3.0}                          
{'loss': 2.8599, 'learning_rate': 6e-07, 'epoch': 4.0}                          
{'loss': 2.8448, 'learning_rate': 5e-07, 'epoch': 5.0}                          
{'loss': 2.8351, 'learning_rate': 4e-07, 'epoch': 6.0}                          
{'loss': 2.825, 'learning_rate': 3e-07, 'epoch': 7.0}                           
{'loss': 2.

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

First image: ../data/a photo of standard bathroom/IMG-C5471456_22.jpg, caption: 'a photo of standard bathroom'
Filter: 100%|███████████████████| 15445/15445 [00:00<00:00, 29273.29 examples/s]
Running tokenizer on train dataset: 100%|█| 15266/15266 [00:00<00:00, 21215.94 e
 does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)
  return forward_call(*args, **kwargs)
{'loss': 3.1221, 'learning_rate': 9e-07, 'epoch': 1.0}                          
{'loss': 2.932, 'learning_rate': 8e-07, 'epoch': 2.0}                           
{'loss': 2.8869, 'learning_rate': 7e-07, 'epoch': 3.0}                          
{'loss': 2.8632, 'learning_rate': 6e-07, 'epoch': 4.0}                          
{'loss': 2.8437, 'learning_rate': 5e-07, 'epoch': 5.0}                          
{'loss': 2.8344, 'learning_rate': 4e-07, 'epoch': 6.0}                          
{'loss': 2.8296, 'learning_rate': 3e-07, 'epoch': 7.0}                          
{'loss': 2.

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

First image: ../data/a photo of standard bathroom/IMG-C5471456_22.jpg, caption: 'a photo of standard bathroom'
Filter: 100%|███████████████████| 13827/13827 [00:00<00:00, 29146.08 examples/s]
Running tokenizer on train dataset: 100%|█| 13667/13667 [00:00<00:00, 21224.77 e
 does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)
  return forward_call(*args, **kwargs)
{'loss': 3.1325, 'learning_rate': 9e-07, 'epoch': 1.0}                          
{'loss': 2.9345, 'learning_rate': 8e-07, 'epoch': 2.0}                          
{'loss': 2.8899, 'learning_rate': 7e-07, 'epoch': 3.0}                          
{'loss': 2.8631, 'learning_rate': 6e-07, 'epoch': 4.0}                          
{'loss': 2.8451, 'learning_rate': 5e-07, 'epoch': 5.0}                          
{'loss': 2.8378, 'learning_rate': 4e-07, 'epoch': 6.0}                          
{'loss': 2.8306, 'learning_rate': 3e-07, 'epoch': 7.0}                          
{'loss': 2.

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

First image: ../data/a photo of standard bathroom/IMG-C5471456_22.jpg, caption: 'a photo of standard bathroom'
Filter: 100%|███████████████████| 12231/12231 [00:00<00:00, 28084.34 examples/s]
Running tokenizer on train dataset: 100%|█| 12086/12086 [00:00<00:00, 20686.81 e
 does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)
  return forward_call(*args, **kwargs)
{'loss': 3.1374, 'learning_rate': 9e-07, 'epoch': 1.0}                          
{'loss': 2.9417, 'learning_rate': 8e-07, 'epoch': 2.0}                          
{'loss': 2.895, 'learning_rate': 7e-07, 'epoch': 3.0}                           
{'loss': 2.8706, 'learning_rate': 6e-07, 'epoch': 4.0}                          
{'loss': 2.8572, 'learning_rate': 5e-07, 'epoch': 5.0}                          
{'loss': 2.8445, 'learning_rate': 4e-07, 'epoch': 6.0}                          
{'loss': 2.8376, 'learning_rate': 3e-07, 'epoch': 7.0}                          
{'loss': 2.

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

First image: ../data/a photo of standard bathroom/IMG-C5471456_22.jpg, caption: 'a photo of standard bathroom'
Filter: 100%|███████████████████| 10517/10517 [00:00<00:00, 27957.56 examples/s]
Running tokenizer on train dataset: 100%|█| 10391/10391 [00:00<00:00, 20666.83 e
 does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)
  return forward_call(*args, **kwargs)
{'loss': 3.1452, 'learning_rate': 9e-07, 'epoch': 1.0}                          
{'loss': 2.9516, 'learning_rate': 8e-07, 'epoch': 2.0}                          
{'loss': 2.9043, 'learning_rate': 7e-07, 'epoch': 3.0}                          
{'loss': 2.8771, 'learning_rate': 6e-07, 'epoch': 4.0}                          
{'loss': 2.858, 'learning_rate': 5e-07, 'epoch': 5.0}                           
{'loss': 2.8475, 'learning_rate': 4e-07, 'epoch': 6.0}                          
{'loss': 2.8414, 'learning_rate': 3e-07, 'epoch': 7.0}                          
{'loss': 2.

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

First image: ../data/a photo of standard bathroom/IMG-C5471456_22.jpg, caption: 'a photo of standard bathroom'
Filter: 100%|█████████████████████| 8637/8637 [00:00<00:00, 28058.27 examples/s]
Running tokenizer on train dataset: 100%|█| 8538/8538 [00:00<00:00, 20734.63 exa
 does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)
  return forward_call(*args, **kwargs)
{'loss': 3.1632, 'learning_rate': 9e-07, 'epoch': 1.0}                          
{'loss': 2.9621, 'learning_rate': 8e-07, 'epoch': 2.0}                          
{'loss': 2.9084, 'learning_rate': 7e-07, 'epoch': 3.0}                          
{'loss': 2.8802, 'learning_rate': 6e-07, 'epoch': 4.0}                          
{'loss': 2.8643, 'learning_rate': 5e-07, 'epoch': 5.0}                          
{'loss': 2.8537, 'learning_rate': 4e-07, 'epoch': 6.0}                          
{'loss': 2.8499, 'learning_rate': 3e-07, 'epoch': 7.0}                          
{'loss': 2.

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

First image: ../data/a photo of standard bathroom/IMG-C5471456_22.jpg, caption: 'a photo of standard bathroom'
Filter: 100%|█████████████████████| 6555/6555 [00:00<00:00, 28287.52 examples/s]
Running tokenizer on train dataset: 100%|█| 6477/6477 [00:00<00:00, 20603.65 exa
 does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)
  return forward_call(*args, **kwargs)
{'loss': 3.1816, 'learning_rate': 9e-07, 'epoch': 1.0}                          
{'loss': 2.9807, 'learning_rate': 8e-07, 'epoch': 2.0}                          
{'loss': 2.9273, 'learning_rate': 7e-07, 'epoch': 3.0}                          
{'loss': 2.9004, 'learning_rate': 6e-07, 'epoch': 4.0}                          
{'loss': 2.8779, 'learning_rate': 5e-07, 'epoch': 5.0}                          
{'loss': 2.8659, 'learning_rate': 4e-07, 'epoch': 6.0}                          
{'loss': 2.8583, 'learning_rate': 3e-07, 'epoch': 7.0}                          
{'loss': 2.

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

First image: ../data/a photo of standard bathroom/IMG-C5471456_22.jpg, caption: 'a photo of standard bathroom'
Filter: 100%|█████████████████████| 4218/4218 [00:00<00:00, 27364.73 examples/s]
Running tokenizer on train dataset: 100%|█| 4179/4179 [00:00<00:00, 20736.71 exa
 does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)
  return forward_call(*args, **kwargs)
{'loss': 3.2364, 'learning_rate': 9e-07, 'epoch': 1.0}                          
{'loss': 3.0313, 'learning_rate': 8e-07, 'epoch': 2.0}                          
{'loss': 2.9637, 'learning_rate': 7e-07, 'epoch': 3.0}                          
{'loss': 2.9227, 'learning_rate': 6e-07, 'epoch': 4.0}                          
{'loss': 2.8959, 'learning_rate': 5e-07, 'epoch': 5.0}                          
{'loss': 2.8852, 'learning_rate': 4e-07, 'epoch': 6.0}                          
{'loss': 2.8723, 'learning_rate': 3e-07, 'epoch': 7.0}                          
{'loss': 2.

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

First image: ../data/a photo of standard bathroom/IMG-C5471456_22.jpg, caption: 'a photo of standard bathroom'
Filter: 100%|█████████████████████| 1435/1435 [00:00<00:00, 25075.73 examples/s]
Running tokenizer on train dataset: 100%|█| 1422/1422 [00:00<00:00, 20741.93 exa
 does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)
  return forward_call(*args, **kwargs)
{'loss': 3.3239, 'learning_rate': 9e-07, 'epoch': 1.0}                          
{'loss': 3.1556, 'learning_rate': 8e-07, 'epoch': 2.0}                          
{'loss': 3.0773, 'learning_rate': 7e-07, 'epoch': 3.0}                          
{'loss': 3.0223, 'learning_rate': 6e-07, 'epoch': 4.0}                          
{'loss': 3.0051, 'learning_rate': 5e-07, 'epoch': 5.0}                          
{'loss': 2.9641, 'learning_rate': 4e-07, 'epoch': 6.0}                          
{'loss': 2.9518, 'learning_rate': 3e-07, 'epoch': 7.0}                          
{'loss': 2.

In [13]:
import os
import json
from transformers import pipeline, AutoTokenizer, AutoConfig, AutoModel, AutoImageProcessor
from PIL import Image, UnidentifiedImageError
import matplotlib.pyplot as plt
from collections import Counter
import numpy as np
import pandas as pd

def chunks(lst, n):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), n):
        yield lst[i:i + n]

def calculate_category_accuracy(true_labels, predicted_labels):
    """Calculate the accuracy for each category and return it as a dictionary."""
    accuracies = {}
    true_labels_counter = Counter(true_labels)
    correct_predictions_counter = Counter([true for true, pred in zip(true_labels, predicted_labels) if true == pred])
    
    for label in true_labels_counter:
        accuracy = (correct_predictions_counter[label] / true_labels_counter[label]) if label in correct_predictions_counter else 0
        accuracies[label] = accuracy
    
    return accuracies
repo_id = "laion/CLIP-ViT-B-32-laion2B-s34B-b79K"
BATCH_SIZE = 128  # Adjust based on your available memory
results = []  # List to hold accuracy results

# Loop for each fold and each threshold
for fold in range(2):
    for idx, threshold in enumerate(np.arange(0.91, 1.00, 0.01)):
        model_dir = f"./workspace/output/backup_ckpts/laion-finetuned_v5e7_epoch10_fold{fold}_threshold{idx+1}"
        test_json = f'val_fold_{fold}.json'  # Test JSON file for the current fold
        print(f"Evaluating with model from {model_dir}...")

        # Load the JSON data for testing
        with open(test_json, 'r') as f:
            data = [json.loads(line) for line in f]

        # Extract image paths and labels
        image_paths = [item['image'] for item in data]
        labels = [item['caption'] for item in data]

        # Initialize model components
        image_processor = AutoImageProcessor.from_pretrained(repo_id)
        tokenizer = AutoTokenizer.from_pretrained(repo_id, config=AutoConfig.from_pretrained(repo_id))
        model = AutoModel.from_pretrained(model_dir)
        clip_pipeline = pipeline(model=model, task="zero-shot-image-classification", tokenizer=tokenizer,
                                 device=1, image_processor=image_processor, config=AutoConfig.from_pretrained(model_dir))

        all_predictions = []
        all_true_labels = []

        # Process images in batches
        for batch_paths, batch_labels in zip(chunks(image_paths, BATCH_SIZE), chunks(labels, BATCH_SIZE)):
            batch_images = []
            valid_labels = []
            for path, label in zip(batch_paths, batch_labels):
                try:
                    batch_images.append(Image.open(path))
                    valid_labels.append(label)
                except (FileNotFoundError, UnidentifiedImageError):
                    continue  # Skip images that cannot be opened

            # Get predictions for the batch of images
            predictions = clip_pipeline(images=batch_images, candidate_labels=valid_labels)
            predicted_labels = [pred[0]['label'] for pred in predictions]  # Top prediction

            all_predictions.extend(predicted_labels)
            all_true_labels.extend(valid_labels)


        # Calculate accuracy
        correct_predictions = sum([true == pred for true, pred in zip(all_true_labels, all_predictions)])
        accuracy = correct_predictions / len(all_true_labels)
        print(f"Accuracy for model on fold {fold} with threshold {threshold:.2f}: {accuracy * 100:.2f}%\n")

        # Calculate the accuracy for each category
        category_accuracies = calculate_category_accuracy(all_true_labels, all_predictions)
        for category, acc in category_accuracies.items():
            results.append({
                'Fold': fold,
                'Threshold': f">{threshold:.2f}",
                'Category': category,
                'Accuracy': acc
            })
        print("--\nDONE\n")

# Create DataFrame from results and save to CSV
results_df = pd.DataFrame(results)
results_df.to_csv('./result/ours_results.csv', index=False)
print("All results saved to model_evaluation_results.csv")



Evaluating with model from ./workspace/output/backup_ckpts/laion-finetuned_v5e7_epoch10_fold0_threshold1...




Accuracy for model on fold 0 with threshold 0.91: 96.96%

--
DONE

Evaluating with model from ./workspace/output/backup_ckpts/laion-finetuned_v5e7_epoch10_fold0_threshold2...
Accuracy for model on fold 0 with threshold 0.92: 97.10%

--
DONE

Evaluating with model from ./workspace/output/backup_ckpts/laion-finetuned_v5e7_epoch10_fold0_threshold3...
Accuracy for model on fold 0 with threshold 0.93: 96.72%

--
DONE

Evaluating with model from ./workspace/output/backup_ckpts/laion-finetuned_v5e7_epoch10_fold0_threshold4...
Accuracy for model on fold 0 with threshold 0.94: 96.40%

--
DONE

Evaluating with model from ./workspace/output/backup_ckpts/laion-finetuned_v5e7_epoch10_fold0_threshold5...
Accuracy for model on fold 0 with threshold 0.95: 96.40%

--
DONE

Evaluating with model from ./workspace/output/backup_ckpts/laion-finetuned_v5e7_epoch10_fold0_threshold6...
Accuracy for model on fold 0 with threshold 0.96: 95.84%

--
DONE

Evaluating with model from ./workspace/output/backup_ckpts

In [9]:
import os
import json
from transformers import pipeline, AutoTokenizer, AutoConfig, AutoModel, AutoImageProcessor
from PIL import Image, UnidentifiedImageError
import matplotlib.pyplot as plt
from collections import Counter
import numpy as np
import pandas as pd

def chunks(lst, n):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), n):
        yield lst[i:i + n]

def calculate_category_accuracy(true_labels, predicted_labels):
    """Calculate the accuracy for each category and return it as a dictionary."""
    accuracies = {}
    true_labels_counter = Counter(true_labels)
    correct_predictions_counter = Counter([true for true, pred in zip(true_labels, predicted_labels) if true == pred])
    
    for label in true_labels_counter:
        accuracy = (correct_predictions_counter[label] / true_labels_counter[label]) if label in correct_predictions_counter else 0
        accuracies[label] = accuracy
    
    return accuracies

BATCH_SIZE = 128  # Adjust based on your available memory
results = []

# Loop for each fold and each threshold
for fold in range(5):
    for threshold in np.arange(0.90, 1.00, 0.01):
        model_dir = f"./workspace/output/laion-finetuned_v5e7_epoch10_fold{fold}_threshold{int(threshold*100)}"
        test_json = f'val_fold_{fold}.json'  # Test JSON file for the current fold
        print(f"Evaluating with model from {model_dir}...")

        # Load the JSON data for testing
        with open(test_json, 'r') as f:
            data = [json.loads(line) for line in f]

        # Extract image paths and labels
        image_paths = [item['image'] for item in data]
        labels = [item['caption'] for item in data]

        # Initialize model components
        image_processor = AutoImageProcessor.from_pretrained(model_dir)
        tokenizer = AutoTokenizer.from_pretrained(model_dir, config=AutoConfig.from_pretrained(model_dir))
        model = AutoModel.from_pretrained(model_dir)
        clip_pipeline = pipeline(model=model, task="zero-shot-image-classification", tokenizer=tokenizer,
                                 device=1, image_processor=image_processor, config=AutoConfig.from_pretrained(model_dir))

        all_predictions = []
        all_true_labels = []

        # Process images in batches
        for batch_paths, batch_labels in zip(chunks(image_paths, BATCH_SIZE), chunks(labels, BATCH_SIZE)):
            batch_images = []
            valid_labels = []
            for path, label in zip(batch_paths, batch_labels):
                try:
                    batch_images.append(Image.open(path))
                    valid_labels.append(label)
                except (FileNotFoundError, UnidentifiedImageError):
                    continue  # Skip images that cannot be opened

            # Get predictions for the batch of images
            predictions = clip_pipeline(images=batch_images, candidate_labels=valid_labels)
            predicted_labels = [pred[0]['label'] for pred in predictions]  # Top prediction

            all_predictions.extend(predicted_labels)
            all_true_labels.extend(valid_labels)

        # Calculate accuracy
        correct_predictions = sum([true == pred for true, pred in zip(all_true_labels, all_predictions)])
        accuracy = correct_predictions / len(all_true_labels)
        print(f"Accuracy for model on fold {fold} with threshold {threshold:.2f}: {accuracy * 100:.2f}%\n")

        # Calculate the accuracy for each category
        category_accuracies = calculate_category_accuracy(all_true_labels, all_predictions)
        for category, acc in category_accuracies.items():
            results.append({
                'Fold': fold,
                'Threshold': f">{threshold:.2f}",
                'Category': category,
                'Accuracy': acc
            })
        print("--\nDONE\n")

# Create DataFrame from results and save to CSV
results_df = pd.DataFrame(results)
results_df.to_csv('./result/ours_results.csv', index=False)
print("All results saved to model_evaluation_results.csv")

# change to org test and add all predict details into df, future may generate con-mat

Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_random80_vtrain1...
Accuracy for model 1: 97.08%

Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_random80_vtrain2...
Accuracy for model 2: 97.17%

Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_random80_vtrain3...
Accuracy for model 3: 98.20%

Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_random80_vtrain4...
Accuracy for model 4: 96.79%

Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_random80_vtrain5...
Accuracy for model 5: 96.75%

Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_random80_vtrain6...
Accuracy for model 6: 96.33%

Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_random80_vtrain7...
Accuracy for model 7: 95.81%

Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_random80_vtrain8...
Accuracy for model 8: 94.56%



# read data directly from csv 0.95 as test

In [22]:
import pandas as pd
import json

def prepare_data_from_dataframe(df: pd.DataFrame, output_file: str) -> str:
    """Prepare data for finetuning by reading from a DataFrame."""
    data = []

    # Process data
    for _, row in df.iterrows():
        image_path = f"../data/{row['max_key']}/{row['path'].split('/')[-1]}"
        data.append({"image": image_path, "caption": row['max_key']})

    # Save the data in JSON format
    with open(output_file, "w") as f:
        for item in data:
            json.dump(item, f)
            f.write("\n")

    return output_file


# Load the CSV file
df = pd.read_csv('labels.csv')
df = df[df['max_key'] != 'error']
df = df[df['max_key'] != 'a photo of other indoor space: not kitchen, not bathroom, not living room, not dining room, not foyer']
df = df[df['max_key'] != 'it is a artificial photo']
df = df[df['max_key'] != 'a photo of outdoor space']

# Filter samples with max_value > 0.9
test_df = df[df['max_value'] == 0.95]
threshold_df = df[(df['max_value'] > 0.9) & (~df.index.isin(test_df.index))]


test_json = prepare_data_from_dataframe(test_df, 'val.json')


In [23]:
import numpy as np
# Finetune base model
repo_id = "laion/CLIP-ViT-B-32-laion2B-s34B-b79K"
batch_size = 64
num_train_epochs = 100

transformers.utils.logging.set_verbosity_error()

# Loop for different thresholds
for i, threshold in enumerate(np.arange(0.91, 1.0, 0.01), start=1):
    if threshold is 0.95:
        continue
    threshold_df = df[df['max_value'] > threshold]
    threshold_df = threshold_df[threshold_df['max_value'] != 0.95]
    train_json = f'train{i}.json'
    prepare_data_from_dataframe(threshold_df, train_json)

    output_folder = f"./workspace/output/laion-finetuned_v5e7_epoch10_vtrain{i}"
    print(f"Finetuning {repo_id} for threshold > {threshold:.2f}, saving output to {output_folder}.")
    data_files = {'train': train_json, 'validation': test_json}
    dataset = load_dataset("json", data_files=data_files)
    print(f"first image: {dataset['validation'][0]['image']}, caption: '{dataset['validation'][0]['caption']}'")

    !python huggingface_finetune_clip.py \
        --output_dir {output_folder} --model_name_or_path {repo_id} \
        --train_file {train_json} \
        --validation_file {test_json} \
        --image_column image \
        --overwrite_output_dir=True \
        --max_seq_length=77 \
        --num_train_epochs=10 \
        --save_total_limit=5 \
        --caption_column caption \
        --remove_unused_columns=False \
        --do_train \
        --logging_strategy="epoch"\
        --per_device_train_batch_size=128 \
        --dataloader_drop_last=True\
        --learning_rate="1e-6" --warmup_steps="0" --weight_decay 0.1 
    print(f"--\nDONE. If it worked, trained data should be in {output_folder}\n")

  if threshold is 0.95:


Finetuning laion/CLIP-ViT-B-32-laion2B-s34B-b79K for threshold > 0.91, saving output to ./workspace/output/laion-finetuned_v5e7_epoch10_vtrain1.


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

first image: ../data/a photo of contemporary foyer/IMG-C5471460_10.jpg, caption: 'a photo of contemporary foyer'
Filter: 100%|███████████████████| 19011/19011 [00:00<00:00, 28447.70 examples/s]
Running tokenizer on train dataset: 100%|█| 18786/18786 [00:00<00:00, 20427.73 e
 does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)
  return forward_call(*args, **kwargs)
{'loss': 3.097, 'learning_rate': 9e-07, 'epoch': 1.0}                           
{'loss': 2.9239, 'learning_rate': 8e-07, 'epoch': 2.0}                          
{'loss': 2.8788, 'learning_rate': 7e-07, 'epoch': 3.0}                          
{'loss': 2.857, 'learning_rate': 6e-07, 'epoch': 4.0}                           
{'loss': 2.8415, 'learning_rate': 5e-07, 'epoch': 5.0}                          
{'loss': 2.8333, 'learning_rate': 4e-07, 'epoch': 6.0}                          
{'loss': 2.8244, 'learning_rate': 3e-07, 'epoch': 7.0}                          
{'loss': 

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

first image: ../data/a photo of contemporary foyer/IMG-C5471460_10.jpg, caption: 'a photo of contemporary foyer'
Filter: 100%|███████████████████| 17132/17132 [00:00<00:00, 28605.44 examples/s]
Running tokenizer on train dataset: 100%|█| 16929/16929 [00:00<00:00, 21019.66 e
 does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)
  return forward_call(*args, **kwargs)
{'loss': 3.1073, 'learning_rate': 9e-07, 'epoch': 1.0}                          
{'loss': 2.9212, 'learning_rate': 8e-07, 'epoch': 2.0}                          
{'loss': 2.8777, 'learning_rate': 7e-07, 'epoch': 3.0}                          
{'loss': 2.8543, 'learning_rate': 6e-07, 'epoch': 4.0}                          
{'loss': 2.8421, 'learning_rate': 5e-07, 'epoch': 5.0}                          
{'loss': 2.8321, 'learning_rate': 4e-07, 'epoch': 6.0}                          
{'loss': 2.8247, 'learning_rate': 3e-07, 'epoch': 7.0}                          
{'loss': 

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

first image: ../data/a photo of contemporary foyer/IMG-C5471460_10.jpg, caption: 'a photo of contemporary foyer'
Filter: 100%|███████████████████| 15134/15134 [00:00<00:00, 28682.00 examples/s]
Running tokenizer on train dataset: 100%|█| 14956/14956 [00:00<00:00, 20926.65 e
 does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)
  return forward_call(*args, **kwargs)
{'loss': 3.104, 'learning_rate': 9e-07, 'epoch': 1.0}                           
{'loss': 2.9244, 'learning_rate': 8e-07, 'epoch': 2.0}                          
{'loss': 2.8833, 'learning_rate': 7e-07, 'epoch': 3.0}                          
{'loss': 2.8568, 'learning_rate': 6e-07, 'epoch': 4.0}                          
{'loss': 2.8425, 'learning_rate': 5e-07, 'epoch': 5.0}                          
{'loss': 2.8339, 'learning_rate': 4e-07, 'epoch': 6.0}                          
{'loss': 2.828, 'learning_rate': 3e-07, 'epoch': 7.0}                           
{'loss': 

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

first image: ../data/a photo of contemporary foyer/IMG-C5471460_10.jpg, caption: 'a photo of contemporary foyer'
Filter: 100%|███████████████████| 13150/13150 [00:00<00:00, 28607.09 examples/s]
Running tokenizer on train dataset: 100%|█| 12995/12995 [00:00<00:00, 20773.55 e
 does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)
  return forward_call(*args, **kwargs)
{'loss': 3.1277, 'learning_rate': 9e-07, 'epoch': 1.0}                          
{'loss': 2.9397, 'learning_rate': 8e-07, 'epoch': 2.0}                          
{'loss': 2.8939, 'learning_rate': 7e-07, 'epoch': 3.0}                          
{'loss': 2.8678, 'learning_rate': 6e-07, 'epoch': 4.0}                          
{'loss': 2.8549, 'learning_rate': 5e-07, 'epoch': 5.0}                          
{'loss': 2.8444, 'learning_rate': 4e-07, 'epoch': 6.0}                          
{'loss': 2.8377, 'learning_rate': 3e-07, 'epoch': 7.0}                          
{'loss': 

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

first image: ../data/a photo of contemporary foyer/IMG-C5471460_10.jpg, caption: 'a photo of contemporary foyer'
Filter: 100%|███████████████████| 13150/13150 [00:00<00:00, 28708.52 examples/s]
Running tokenizer on train dataset: 100%|█| 12995/12995 [00:00<00:00, 20854.43 e
 does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)
  return forward_call(*args, **kwargs)
{'loss': 3.1277, 'learning_rate': 9e-07, 'epoch': 1.0}                          
{'loss': 2.9397, 'learning_rate': 8e-07, 'epoch': 2.0}                          
{'loss': 2.8939, 'learning_rate': 7e-07, 'epoch': 3.0}                          
{'loss': 2.8678, 'learning_rate': 6e-07, 'epoch': 4.0}                          
{'loss': 2.8549, 'learning_rate': 5e-07, 'epoch': 5.0}                          
{'loss': 2.8444, 'learning_rate': 4e-07, 'epoch': 6.0}                          
{'loss': 2.8377, 'learning_rate': 3e-07, 'epoch': 7.0}                          
{'loss': 

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

first image: ../data/a photo of contemporary foyer/IMG-C5471460_10.jpg, caption: 'a photo of contemporary foyer'
Filter: 100%|███████████████████| 10782/10782 [00:00<00:00, 28008.46 examples/s]
Running tokenizer on train dataset: 100%|█| 10658/10658 [00:00<00:00, 20642.58 e
 does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)
  return forward_call(*args, **kwargs)
{'loss': 3.1348, 'learning_rate': 9e-07, 'epoch': 1.0}                          
{'loss': 2.9474, 'learning_rate': 8e-07, 'epoch': 2.0}                          
{'loss': 2.896, 'learning_rate': 7e-07, 'epoch': 3.0}                           
{'loss': 2.8752, 'learning_rate': 6e-07, 'epoch': 4.0}                          
{'loss': 2.86, 'learning_rate': 5e-07, 'epoch': 5.0}                            
{'loss': 2.8464, 'learning_rate': 4e-07, 'epoch': 6.0}                          
{'loss': 2.84, 'learning_rate': 3e-07, 'epoch': 7.0}                            
{'loss': 

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

first image: ../data/a photo of contemporary foyer/IMG-C5471460_10.jpg, caption: 'a photo of contemporary foyer'
Filter: 100%|█████████████████████| 8144/8144 [00:00<00:00, 27990.81 examples/s]
Running tokenizer on train dataset: 100%|█| 8047/8047 [00:00<00:00, 20525.77 exa
 does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)
  return forward_call(*args, **kwargs)
{'loss': 3.1559, 'learning_rate': 9e-07, 'epoch': 1.0}                          
{'loss': 2.9624, 'learning_rate': 8e-07, 'epoch': 2.0}                          
{'loss': 2.9137, 'learning_rate': 7e-07, 'epoch': 3.0}                          
{'loss': 2.887, 'learning_rate': 6e-07, 'epoch': 4.0}                           
{'loss': 2.8703, 'learning_rate': 5e-07, 'epoch': 5.0}                          
{'loss': 2.8645, 'learning_rate': 4e-07, 'epoch': 6.0}                          
{'loss': 2.8537, 'learning_rate': 3e-07, 'epoch': 7.0}                          
{'loss': 

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

first image: ../data/a photo of contemporary foyer/IMG-C5471460_10.jpg, caption: 'a photo of contemporary foyer'
Filter: 100%|█████████████████████| 5197/5197 [00:00<00:00, 27343.36 examples/s]
Running tokenizer on train dataset: 100%|█| 5150/5150 [00:00<00:00, 20052.73 exa
 does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)
  return forward_call(*args, **kwargs)
{'loss': 3.1931, 'learning_rate': 9e-07, 'epoch': 1.0}                          
{'loss': 2.9951, 'learning_rate': 8e-07, 'epoch': 2.0}                          
{'loss': 2.9409, 'learning_rate': 7e-07, 'epoch': 3.0}                          
{'loss': 2.9079, 'learning_rate': 6e-07, 'epoch': 4.0}                          
{'loss': 2.8858, 'learning_rate': 5e-07, 'epoch': 5.0}                          
{'loss': 2.8681, 'learning_rate': 4e-07, 'epoch': 6.0}                          
{'loss': 2.8644, 'learning_rate': 3e-07, 'epoch': 7.0}                          
{'loss': 

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

first image: ../data/a photo of contemporary foyer/IMG-C5471460_10.jpg, caption: 'a photo of contemporary foyer'
Filter: 100%|█████████████████████| 1731/1731 [00:00<00:00, 25087.04 examples/s]
Running tokenizer on train dataset: 100%|█| 1715/1715 [00:00<00:00, 20083.96 exa
 does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)
  return forward_call(*args, **kwargs)
{'loss': 3.2934, 'learning_rate': 9e-07, 'epoch': 1.0}                          
{'loss': 3.093, 'learning_rate': 8e-07, 'epoch': 2.0}                           
{'loss': 3.0367, 'learning_rate': 7e-07, 'epoch': 3.0}                          
{'loss': 2.9903, 'learning_rate': 6e-07, 'epoch': 4.0}                          
{'loss': 2.9514, 'learning_rate': 5e-07, 'epoch': 5.0}                          
{'loss': 2.9515, 'learning_rate': 4e-07, 'epoch': 6.0}                          
{'loss': 2.9336, 'learning_rate': 3e-07, 'epoch': 7.0}                          
{'loss': 

In [24]:
# 95
import os
import json
from transformers import pipeline, AutoTokenizer, AutoConfig, AutoModel, AutoImageProcessor
from PIL import Image, UnidentifiedImageError
import matplotlib.pyplot as plt

def chunks(lst, n):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), n):
        yield lst[i:i + n]

# Load the JSON data
with open('val.json', 'r') as f:
    data = [json.loads(line) for line in f]

# Extract image paths and labels
image_paths = [item['image'] for item in data]
labels = [item['caption'] for item in data]

BATCH_SIZE = 128  # Adjust based on your available memory
repo_id = "laion/CLIP-ViT-B-32-laion2B-s34B-b79K"

# Loop for different finetuned models
for i in range(1, 10):  # Assuming you have 9 finetuned models
    dir = f"./workspace/output/laion-finetuned_v5e7_epoch10_vtrain{i}"
    print(f"Evaluating with model from {dir}...")

    image_processor = AutoImageProcessor.from_pretrained(repo_id)
    tokenizer = AutoTokenizer.from_pretrained(repo_id, config=AutoConfig.from_pretrained(repo_id))
    model = AutoModel.from_pretrained(dir)
    clip_pipeline = pipeline(model=model, task="zero-shot-image-classification", tokenizer=tokenizer,
                             device=1, image_processor=image_processor, config=AutoConfig.from_pretrained(dir))

    all_predictions = []
    all_true_labels = []

    for batch_paths, batch_labels in zip(chunks(image_paths, BATCH_SIZE), chunks(labels, BATCH_SIZE)):
        batch_images = []
        valid_labels = []
        for path, label in zip(batch_paths, batch_labels):
            try:
                batch_images.append(Image.open(path))
                valid_labels.append(label)
            except (FileNotFoundError, UnidentifiedImageError):
                continue  # Skip images that cannot be opened

        # Get predictions for the batch of images
        predictions = clip_pipeline(images=batch_images, candidate_labels=valid_labels)
        predicted_labels = [pred[0]['label'] for pred in predictions]  # Top prediction

        all_predictions.extend(predicted_labels)
        all_true_labels.extend(valid_labels)

    correct_predictions = sum([true == pred for true, pred in zip(all_true_labels, all_predictions)])
    accuracy = correct_predictions / len(all_true_labels)
    print(f"Accuracy for model {i}: {accuracy * 100:.2f}%\n")


Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_vtrain1...




Accuracy for model 1: 96.96%

Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_vtrain2...
Accuracy for model 2: 97.10%

Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_vtrain3...
Accuracy for model 3: 96.72%

Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_vtrain4...
Accuracy for model 4: 96.40%

Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_vtrain5...
Accuracy for model 5: 96.40%

Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_vtrain6...
Accuracy for model 6: 95.84%

Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_vtrain7...
Accuracy for model 7: 95.79%

Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_vtrain8...
Accuracy for model 8: 94.62%

Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_vtrain9...
Accuracy for model 9: 93.12%



In [25]:
import os
import json
from transformers import pipeline, AutoTokenizer, AutoConfig, AutoModel, AutoImageProcessor
from PIL import Image, UnidentifiedImageError
import matplotlib.pyplot as plt

def chunks(lst, n):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), n):
        yield lst[i:i + n]

# Load the JSON data
with open('val.json', 'r') as f:
    data = [json.loads(line) for line in f]

# Extract image paths and labels
image_paths = [item['image'] for item in data]
labels = [item['caption'] for item in data]

BATCH_SIZE = 128  # Adjust based on your available memory
repo_id = "laion/CLIP-ViT-B-32-laion2B-s34B-b79K"


image_processor = AutoImageProcessor.from_pretrained(repo_id)
tokenizer = AutoTokenizer.from_pretrained(repo_id, config=AutoConfig.from_pretrained(repo_id))
model = AutoModel.from_pretrained(repo_id)
clip_pipeline = pipeline(model=model, task="zero-shot-image-classification", tokenizer=tokenizer,
                         device=1, image_processor=image_processor, config=AutoConfig.from_pretrained(repo_id))

all_predictions = []
all_true_labels = []

for batch_paths, batch_labels in zip(chunks(image_paths, BATCH_SIZE), chunks(labels, BATCH_SIZE)):
    batch_images = []
    valid_labels = []
    for path, label in zip(batch_paths, batch_labels):
        try:
            batch_images.append(Image.open(path))
            valid_labels.append(label)
        except (FileNotFoundError, UnidentifiedImageError):
            continue  # Skip images that cannot be opened

    # Get predictions for the batch of images
    predictions = clip_pipeline(images=batch_images, candidate_labels=valid_labels)
    predicted_labels = [pred[0]['label'] for pred in predictions]  # Top prediction

    all_predictions.extend(predicted_labels)
    all_true_labels.extend(valid_labels)

correct_predictions = sum([true == pred for true, pred in zip(all_true_labels, all_predictions)])
accuracy = correct_predictions / len(all_true_labels)
print(f"Accuracy for model org: {accuracy * 100:.2f}%\n")

Accuracy for model 9: 83.15%



# test on five room task

In [44]:
import os
import json
import os
import pathlib
from typing import Generator
from collections import defaultdict
import datasets
from datasets import load_dataset
import sys

def collect_images_from_directory(directory: str) -> dict:
    """Collect images from a specified directory and group them by label."""
    images_per_label = defaultdict(list)
    
    subfolders = [d for d in os.listdir(directory) if os.path.isdir(os.path.join(directory, d))]
    for subfolder in subfolders:
        label = subfolder
        subfolder_path = os.path.join(directory, subfolder)
        for filename in os.listdir(subfolder_path):
            if filename.endswith('.jpg') or filename.endswith('.png'):
                images_per_label[label].append(os.path.join(subfolder_path, filename))
                
    return images_per_label

def prepare_data_for_finetuning(train_dir: str, test_dir: str) -> tuple:
    """Prepare data for finetuning by reading images from specified train and test directories."""
    train_data = []
    val_data = []

    # Collect training images
    train_images_per_label = collect_images_from_directory(train_dir)
    for label, images in train_images_per_label.items():
        train_data.extend([{"image": img, "caption": label} for img in images])

    # Collect testing images
    test_images_per_label = collect_images_from_directory(test_dir)
    for label, images in test_images_per_label.items():
        val_data.extend([{"image": img, "caption": label} for img in images])

    # Save the data in JSON format in the code directory
    train_file = "room_train.json"
    val_file = "room_val.json"
    with open(train_file, "w") as f:
        for item in train_data:
            json.dump(item, f)
            f.write("\n")
    with open(val_file, "w") as f:
        for item in val_data:
            json.dump(item, f)
            f.write("\n")
    
    return train_file, val_file

# Usage:
train_json, test_json = prepare_data_for_finetuning("../room_5", "../room_5")
data_files = {'train': train_json, 'validation': test_json}

# test loading it back in

dataset = load_dataset("json", data_files=data_files)
print(f"first image: {dataset['validation'][0]['image']}, caption: '{dataset['validation'][0]['caption']}'")

print(sys.executable)
!which pip3
dataset['validation']

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

first image: ../room_5/a photo of kitchen/gsun_1c5c2085ba8b7d4a176739aab0998c8d.jpg, caption: 'a photo of kitchen'
/home/haojin/anaconda3/envs/huggingface/bin/python
/home/haojin/anaconda3/envs/huggingface/bin/pip3


Dataset({
    features: ['image', 'caption'],
    num_rows: 102625
})

In [45]:
# test on original ckpt
import os
import json
from transformers import pipeline, AutoTokenizer, AutoConfig, AutoModel, AutoImageProcessor
from PIL import Image, UnidentifiedImageError
import matplotlib.pyplot as plt

def chunks(lst, n):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), n):
        yield lst[i:i + n]

# Load the JSON data
with open('room_val.json', 'r') as f:
    data = [json.loads(line) for line in f]

# Extract image paths and labels
image_paths = [item['image'] for item in data]
labels = [item['caption'] for item in data]

BATCH_SIZE = 128  # Adjust based on your available memory
repo_id = "laion/CLIP-ViT-B-32-laion2B-s34B-b79K"


image_processor = AutoImageProcessor.from_pretrained(repo_id)
tokenizer = AutoTokenizer.from_pretrained(repo_id, config=AutoConfig.from_pretrained(repo_id))
model = AutoModel.from_pretrained(repo_id)
clip_pipeline = pipeline(model=model, task="zero-shot-image-classification", tokenizer=tokenizer,
                         device=1, image_processor=image_processor, config=AutoConfig.from_pretrained(repo_id))

all_predictions = []
all_true_labels = []

for batch_paths, batch_labels in zip(chunks(image_paths, BATCH_SIZE), chunks(labels, BATCH_SIZE)):
    batch_images = []
    valid_labels = []
    for path, label in zip(batch_paths, batch_labels):
        try:
            batch_images.append(Image.open(path))
            valid_labels.append(label)
        except (FileNotFoundError, UnidentifiedImageError):
            continue  # Skip images that cannot be opened

    # Get predictions for the batch of images
    predictions = clip_pipeline(images=batch_images, candidate_labels=valid_labels)
    predicted_labels = [pred[0]['label'] for pred in predictions]  # Top prediction

    all_predictions.extend(predicted_labels)
    all_true_labels.extend(valid_labels)

correct_predictions = sum([true == pred for true, pred in zip(all_true_labels, all_predictions)])
accuracy = correct_predictions / len(all_true_labels)
print(f"Accuracy for model: {accuracy * 100:.2f}%\n")

Accuracy for model: 99.97%



In [46]:
import os
import json
from transformers import pipeline, AutoTokenizer, AutoConfig, AutoModel, AutoImageProcessor
from PIL import Image, UnidentifiedImageError
import matplotlib.pyplot as plt

def chunks(lst, n):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), n):
        yield lst[i:i + n]

# Load the JSON data
with open('room_val.json', 'r') as f:
    data = [json.loads(line) for line in f]

# Extract image paths and labels
image_paths = [item['image'] for item in data]
labels = [item['caption'] for item in data]

BATCH_SIZE = 128  # Adjust based on your available memory
repo_id = "laion/CLIP-ViT-B-32-laion2B-s34B-b79K"

# Loop for different finetuned models
for i in range(1, 10):  # Assuming you have 9 finetuned models
    dir = f"./workspace/output/laion-finetuned_v5e7_epoch10_vtrain{i}"
    print(f"Evaluating with model from {dir}...")

    image_processor = AutoImageProcessor.from_pretrained(repo_id)
    tokenizer = AutoTokenizer.from_pretrained(repo_id, config=AutoConfig.from_pretrained(repo_id))
    model = AutoModel.from_pretrained(dir)
    clip_pipeline = pipeline(model=model, task="zero-shot-image-classification", tokenizer=tokenizer,
                             device=1, image_processor=image_processor, config=AutoConfig.from_pretrained(dir))

    all_predictions = []
    all_true_labels = []

    for batch_paths, batch_labels in zip(chunks(image_paths, BATCH_SIZE), chunks(labels, BATCH_SIZE)):
        batch_images = []
        valid_labels = []
        for path, label in zip(batch_paths, batch_labels):
            try:
                batch_images.append(Image.open(path))
                valid_labels.append(label)
            except (FileNotFoundError, UnidentifiedImageError):
                continue  # Skip images that cannot be opened

        # Get predictions for the batch of images
        predictions = clip_pipeline(images=batch_images, candidate_labels=valid_labels)
        predicted_labels = [pred[0]['label'] for pred in predictions]  # Top prediction

        all_predictions.extend(predicted_labels)
        all_true_labels.extend(valid_labels)

    correct_predictions = sum([true == pred for true, pred in zip(all_true_labels, all_predictions)])
    accuracy = correct_predictions / len(all_true_labels)
    print(f"Accuracy for model {i}: {accuracy * 100:.2f}%\n")


Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_vtrain1...
Accuracy for model 1: 99.96%

Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_vtrain2...
Accuracy for model 2: 99.96%

Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_vtrain3...
Accuracy for model 3: 99.96%

Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_vtrain4...
Accuracy for model 4: 99.96%

Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_vtrain5...
Accuracy for model 5: 99.96%

Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_vtrain6...
Accuracy for model 6: 99.96%

Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_vtrain7...
Accuracy for model 7: 99.97%

Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_vtrain8...
Accuracy for model 8: 99.97%

Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch

# test on orginal dataset

In [2]:
import os
import json
import os
import pathlib
from typing import Generator
from collections import defaultdict
import datasets
from datasets import load_dataset
import sys

def collect_images_from_directory(directory: str) -> dict:
    """Collect images from a specified directory and group them by label."""
    images_per_label = defaultdict(list)
    
    subfolders = [d for d in os.listdir(directory) if os.path.isdir(os.path.join(directory, d))]
    for subfolder in subfolders:
        label = subfolder
        subfolder_path = os.path.join(directory, subfolder)
        for filename in os.listdir(subfolder_path):
            if filename.endswith('.jpg') or filename.endswith('.png'):
                images_per_label[label].append(os.path.join(subfolder_path, filename))
                
    return images_per_label

def prepare_data_for_finetuning(train_dir: str, test_dir: str) -> tuple:
    """Prepare data for finetuning by reading images from specified train and test directories."""
    train_data = []
    val_data = []

    # Collect training images
    train_images_per_label = collect_images_from_directory(train_dir)
    for label, images in train_images_per_label.items():
        train_data.extend([{"image": img, "caption": label} for img in images])

    # Collect testing images
    test_images_per_label = collect_images_from_directory(test_dir)
    for label, images in test_images_per_label.items():
        val_data.extend([{"image": img, "caption": label} for img in images])

    # Save the data in JSON format in the code directory
    train_file = "kb_train.json"
    val_file = "kb_val.json"
    with open(train_file, "w") as f:
        for item in train_data:
            json.dump(item, f)
            f.write("\n")
    with open(val_file, "w") as f:
        for item in val_data:
            json.dump(item, f)
            f.write("\n")
    
    return train_file, val_file

# Usage:
train_json, test_json = prepare_data_for_finetuning("../class_4", "../class_4")
data_files = {'train': train_json, 'validation': test_json}

# test loading it back in

dataset = load_dataset("json", data_files=data_files)
print(f"first image: {dataset['validation'][0]['image']}, caption: '{dataset['validation'][0]['caption']}'")

print(sys.executable)
!which pip3
dataset['validation']

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

first image: ../class_4/a photo of standard bathroom/IMG-C5472473_15.jpg, caption: 'a photo of standard bathroom'
/home/haojin/anaconda3/envs/huggingface/bin/python
/home/haojin/anaconda3/envs/huggingface/bin/pip3


Dataset({
    features: ['image', 'caption'],
    num_rows: 752
})

In [3]:
# test on original ckpt
import os
import json
from transformers import pipeline, AutoTokenizer, AutoConfig, AutoModel, AutoImageProcessor
from PIL import Image, UnidentifiedImageError
import matplotlib.pyplot as plt

def chunks(lst, n):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), n):
        yield lst[i:i + n]

# Load the JSON data
with open('kb_val.json', 'r') as f:
    data = [json.loads(line) for line in f]

# Extract image paths and labels
image_paths = [item['image'] for item in data]
labels = [item['caption'] for item in data]

BATCH_SIZE = 128  # Adjust based on your available memory
repo_id = "laion/CLIP-ViT-B-32-laion2B-s34B-b79K"


image_processor = AutoImageProcessor.from_pretrained(repo_id)
tokenizer = AutoTokenizer.from_pretrained(repo_id, config=AutoConfig.from_pretrained(repo_id))
model = AutoModel.from_pretrained(repo_id)
clip_pipeline = pipeline(model=model, task="zero-shot-image-classification", tokenizer=tokenizer,
                         device=1, image_processor=image_processor, config=AutoConfig.from_pretrained(repo_id))

all_predictions = []
all_true_labels = []

for batch_paths, batch_labels in zip(chunks(image_paths, BATCH_SIZE), chunks(labels, BATCH_SIZE)):
    batch_images = []
    valid_labels = []
    for path, label in zip(batch_paths, batch_labels):
        try:
            batch_images.append(Image.open(path))
            valid_labels.append(label)
        except (FileNotFoundError, UnidentifiedImageError):
            continue  # Skip images that cannot be opened

    # Get predictions for the batch of images
    predictions = clip_pipeline(images=batch_images, candidate_labels=valid_labels)
    predicted_labels = [pred[0]['label'] for pred in predictions]  # Top prediction

    all_predictions.extend(predicted_labels)
    all_true_labels.extend(valid_labels)

correct_predictions = sum([true == pred for true, pred in zip(all_true_labels, all_predictions)])
accuracy = correct_predictions / len(all_true_labels)
print(f"Accuracy for model: {accuracy * 100:.2f}%\n")



Accuracy for model: 99.87%



In [4]:
# accuracy per class
import os
import json
from transformers import pipeline, AutoTokenizer, AutoConfig, AutoModel, AutoImageProcessor
from PIL import Image, UnidentifiedImageError
import matplotlib.pyplot as plt

def chunks(lst, n):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), n):
        yield lst[i:i + n]

# Load the JSON data
with open('kb_val.json', 'r') as f:
    data = [json.loads(line) for line in f]

# Extract image paths and labels
image_paths = [item['image'] for item in data]
labels = [item['caption'] for item in data]

BATCH_SIZE = 128  # Adjust based on your available memory
repo_id = "laion/CLIP-ViT-B-32-laion2B-s34B-b79K"

image_processor = AutoImageProcessor.from_pretrained(repo_id)
tokenizer = AutoTokenizer.from_pretrained(repo_id, config=AutoConfig.from_pretrained(repo_id))
model = AutoModel.from_pretrained(repo_id)
clip_pipeline = pipeline(model=model, task="zero-shot-image-classification", tokenizer=tokenizer,
                         device=1, image_processor=image_processor, config=AutoConfig.from_pretrained(repo_id))

all_predictions = []
all_true_labels = []

class_accuracy = {}  # Dictionary to track accuracy per class

for batch_paths, batch_labels in zip(chunks(image_paths, BATCH_SIZE), chunks(labels, BATCH_SIZE)):
    batch_images = []
    valid_labels = []
    for path, label in zip(batch_paths, batch_labels):
        try:
            batch_images.append(Image.open(path))
            valid_labels.append(label)
        except (FileNotFoundError, UnidentifiedImageError):
            continue  # Skip images that cannot be opened

    # Get predictions for the batch of images
    predictions = clip_pipeline(images=batch_images, candidate_labels=valid_labels)
    predicted_labels = [pred[0]['label'] for pred in predictions]  # Top prediction

    all_predictions.extend(predicted_labels)
    all_true_labels.extend(valid_labels)

    # Update class accuracy counts
    for true_label, predicted_label in zip(valid_labels, predicted_labels):
        if true_label not in class_accuracy:
            class_accuracy[true_label] = {'correct': 0, 'total': 0}
        class_accuracy[true_label]['total'] += 1
        if true_label == predicted_label:
            class_accuracy[true_label]['correct'] += 1

# Print accuracy per class and calculate mean accuracy
mean_accuracy = 0
for class_label, counts in class_accuracy.items():
    class_acc = counts['correct'] / counts['total']
    mean_accuracy += class_acc
    print(f"Accuracy for class '{class_label}': {class_acc * 100:.2f}%")
mean_accuracy /= len(class_accuracy)
print(f"\nMean accuracy over all classes: {mean_accuracy * 100:.2f}%\n")




Accuracy for class 'a photo of standard bathroom': 100.00%
Accuracy for class 'a photo of standard kitchen': 100.00%
Accuracy for class 'a photo of contemporary bathroom': 99.15%
Accuracy for class 'a photo of contemporary kitchen': 100.00%

Mean accuracy over all classes: 99.79%



In [5]:
# accuracy per class
import os
import json
from transformers import pipeline, AutoTokenizer, AutoConfig, AutoModel, AutoImageProcessor
from PIL import Image, UnidentifiedImageError
import matplotlib.pyplot as plt

def chunks(lst, n):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), n):
        yield lst[i:i + n]

# Load the JSON data
with open('kb_val.json', 'r') as f:
    data = [json.loads(line) for line in f]

# Extract image paths and labels
image_paths = [item['image'] for item in data]
labels = [item['caption'] for item in data]

BATCH_SIZE = 128  # Adjust based on your available memory
repo_id = "laion/CLIP-ViT-B-32-laion2B-s34B-b79K"
dir = "./workspace/output/laion-finetuned_v5e7_epoch10_vtrain2"
image_processor = AutoImageProcessor.from_pretrained(repo_id)
tokenizer = AutoTokenizer.from_pretrained(repo_id, config=AutoConfig.from_pretrained(repo_id))
model = AutoModel.from_pretrained(dir)
clip_pipeline = pipeline(model=model, task="zero-shot-image-classification", tokenizer=tokenizer,
                         device=1, image_processor=image_processor, config=AutoConfig.from_pretrained(dir))

all_predictions = []
all_true_labels = []

class_accuracy = {}  # Dictionary to track accuracy per class

for batch_paths, batch_labels in zip(chunks(image_paths, BATCH_SIZE), chunks(labels, BATCH_SIZE)):
    batch_images = []
    valid_labels = []
    for path, label in zip(batch_paths, batch_labels):
        try:
            batch_images.append(Image.open(path))
            valid_labels.append(label)
        except (FileNotFoundError, UnidentifiedImageError):
            continue  # Skip images that cannot be opened

    # Get predictions for the batch of images
    predictions = clip_pipeline(images=batch_images, candidate_labels=valid_labels)
    predicted_labels = [pred[0]['label'] for pred in predictions]  # Top prediction

    all_predictions.extend(predicted_labels)
    all_true_labels.extend(valid_labels)

    # Update class accuracy counts
    for true_label, predicted_label in zip(valid_labels, predicted_labels):
        if true_label not in class_accuracy:
            class_accuracy[true_label] = {'correct': 0, 'total': 0}
        class_accuracy[true_label]['total'] += 1
        if true_label == predicted_label:
            class_accuracy[true_label]['correct'] += 1

# Print accuracy per class and calculate mean accuracy
mean_accuracy = 0
for class_label, counts in class_accuracy.items():
    class_acc = counts['correct'] / counts['total']
    mean_accuracy += class_acc
    print(f"Accuracy for class '{class_label}': {class_acc * 100:.2f}%")
mean_accuracy /= len(class_accuracy)
print(f"\nMean accuracy over all classes: {mean_accuracy * 100:.2f}%\n")




Accuracy for class 'a photo of standard bathroom': 100.00%
Accuracy for class 'a photo of standard kitchen': 100.00%
Accuracy for class 'a photo of contemporary bathroom': 99.15%
Accuracy for class 'a photo of contemporary kitchen': 100.00%

Mean accuracy over all classes: 99.79%



In [6]:
import os
import json
from transformers import pipeline, AutoTokenizer, AutoConfig, AutoModel, AutoImageProcessor
from PIL import Image, UnidentifiedImageError
import matplotlib.pyplot as plt

def chunks(lst, n):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), n):
        yield lst[i:i + n]

# Load the JSON data
with open('kb_val.json', 'r') as f:
    data = [json.loads(line) for line in f]

# Extract image paths and labels
image_paths = [item['image'] for item in data]
labels = [item['caption'] for item in data]

BATCH_SIZE = 128  # Adjust based on your available memory
repo_id = "laion/CLIP-ViT-B-32-laion2B-s34B-b79K"

# Loop for different finetuned models
for i in range(1, 10):  # Assuming you have 9 finetuned models
    dir = f"./workspace/output/laion-finetuned_v5e7_epoch10_vtrain{i}"
    print(f"Evaluating with model from {dir}...")

    image_processor = AutoImageProcessor.from_pretrained(repo_id)
    tokenizer = AutoTokenizer.from_pretrained(repo_id, config=AutoConfig.from_pretrained(repo_id))
    model = AutoModel.from_pretrained(dir)
    clip_pipeline = pipeline(model=model, task="zero-shot-image-classification", tokenizer=tokenizer,
                             device=1, image_processor=image_processor, config=AutoConfig.from_pretrained(dir))

    all_predictions = []
    all_true_labels = []

    for batch_paths, batch_labels in zip(chunks(image_paths, BATCH_SIZE), chunks(labels, BATCH_SIZE)):
        batch_images = []
        valid_labels = []
        for path, label in zip(batch_paths, batch_labels):
            try:
                batch_images.append(Image.open(path))
                valid_labels.append(label)
            except (FileNotFoundError, UnidentifiedImageError):
                continue  # Skip images that cannot be opened

        # Get predictions for the batch of images
        predictions = clip_pipeline(images=batch_images, candidate_labels=valid_labels)
        predicted_labels = [pred[0]['label'] for pred in predictions]  # Top prediction

        all_predictions.extend(predicted_labels)
        all_true_labels.extend(valid_labels)

    correct_predictions = sum([true == pred for true, pred in zip(all_true_labels, all_predictions)])
    accuracy = correct_predictions / len(all_true_labels)
    print(f"Accuracy for model {i}: {accuracy * 100:.2f}%\n")




Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_vtrain1...




Accuracy for model 1: 99.73%

Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_vtrain2...




Accuracy for model 2: 99.87%

Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_vtrain3...




Accuracy for model 3: 99.87%

Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_vtrain4...




Accuracy for model 4: 99.87%

Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_vtrain5...




Accuracy for model 5: 99.87%

Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_vtrain6...




Accuracy for model 6: 99.87%

Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_vtrain7...




Accuracy for model 7: 99.87%

Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_vtrain8...




Accuracy for model 8: 99.87%

Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_vtrain9...
Accuracy for model 9: 99.87%



# quantization small model

In [None]:
import os
import torch

# Make only the first GPU visible (GPU 0)
os.environ["CUDA_VISIBLE_DEVICES"] = ""

# Check if CUDA (GPU support) is available
if torch.cuda.is_available():
    print(f"CUDA is available. Number of GPUs: {torch.cuda.device_count()}")
    
    # Loop through and print details of each GPU
    for i in range(torch.cuda.device_count()):
        print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
else:
    print("CUDA is not available. Only CPU will be used.")


In [2]:
from accelerate.utils import BnbQuantizationConfig
bnb_quantization_config = BnbQuantizationConfig(load_in_8bit=True)
from accelerate.utils import load_and_quantize_model
from transformers import CLIPProcessor, CLIPModel, AutoModel
from accelerate import init_empty_weights
from transformers import AutoTokenizer, AutoConfig, AutoModel,AutoModelForZeroShotImageClassification
config = AutoConfig.from_pretrained("laion/CLIP-ViT-B-32-laion2B-s34B-b79K")
with init_empty_weights():
    empty_model = AutoModelForZeroShotImageClassification.from_config(config)
    #CLIP(config=AutoConfig.from_pretrained("openai/clip-vit-large-patch14"))

# Move the model to GPU 0
print(empty_model.device)
dir = "./workspace/output/laion-finetuned_v5e7_epoch10_vtrain2"
model = AutoModel.from_pretrained(dir)
from huggingface_hub import snapshot_download
weights_location = "./workspace/output/laion-finetuned_v5e7_epoch10_vtrain2/model.safetensors"
quantized_model = load_and_quantize_model(empty_model, weights_location=weights_location, bnb_quantization_config=bnb_quantization_config, device_map = "auto")

meta


In [3]:
import os
import csv
from transformers import CLIPProcessor, CLIPModel, pipeline, CLIPImageProcessor
from PIL import Image
from transformers import AutoTokenizer, AutoConfig, AutoModel,AutoImageProcessor
import matplotlib.pyplot as plt
import time
import json

# Load the JSON data
with open('kb_val.json', 'r') as f:
    data = [json.loads(line) for line in f]

# Extract image paths and labels
image_paths = [item['image'] for item in data]
labels = [item['caption'] for item in data]

repo_id =  "laion/CLIP-ViT-B-32-laion2B-s34B-b79K"
dir = "./workspace/output/laion-finetuned_v5e7_epoch10_vtrain2"
image_processor = AutoImageProcessor.from_pretrained(
    repo_id
)
tokenizer = AutoTokenizer.from_pretrained(repo_id, 
                                          config=AutoConfig.from_pretrained(repo_id))
model = quantized_model
clip_pipeline = pipeline(model=model,task="zero-shot-image-classification", tokenizer=tokenizer,
                    image_processor=image_processor, config=AutoConfig.from_pretrained(dir),
                        device_map="auto", model_kwargs={"load_in_8bit": True})

def chunks(lst, n):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), n):
        yield lst[i:i + n]

BATCH_SIZE = 256  # Adjust based on your available memory

all_predictions = []
all_true_labels = []
time_eval = []

for batch_paths, batch_labels in zip(chunks(image_paths, BATCH_SIZE), chunks(labels, BATCH_SIZE)):
    batch_images = []
    valid_labels = []
    for path, label in zip(batch_paths, batch_labels):
        try:
            batch_images.append(Image.open(path))
            valid_labels.append(label)
        except (FileNotFoundError, UnidentifiedImageError):
            continue  # Skip images that cannot be opened

    # Get predictions for the batch of images
    predictions = clip_pipeline(images=batch_images, candidate_labels=valid_labels)
    predicted_labels = [pred[0]['label'] for pred in predictions]  # Top prediction

    all_predictions.extend(predicted_labels)
    all_true_labels.extend(valid_labels)

correct_predictions = sum([true == pred for true, pred in zip(all_true_labels, all_predictions)])
accuracy = correct_predictions / len(all_true_labels)
print(f"Accuracy for model {i}: {accuracy * 100:.2f}%\n")




Accuracy for model 0: 96.81%



## random testing result

### original model

In [4]:
from accelerate.utils import BnbQuantizationConfig
bnb_quantization_config = BnbQuantizationConfig(load_in_8bit=True)
from accelerate.utils import load_and_quantize_model
from transformers import CLIPProcessor, CLIPModel, AutoModel
from accelerate import init_empty_weights
from transformers import AutoTokenizer, AutoConfig, AutoModel,AutoModelForZeroShotImageClassification
config = AutoConfig.from_pretrained("laion/CLIP-ViT-B-32-laion2B-s34B-b79K")
with init_empty_weights():
    empty_model = AutoModelForZeroShotImageClassification.from_config(config)
    #CLIP(config=AutoConfig.from_pretrained("openai/clip-vit-large-patch14"))

# Move the model to GPU 0
print(empty_model.device)
model = AutoModel.from_pretrained("laion/CLIP-ViT-B-32-laion2B-s34B-b79K")
from huggingface_hub import snapshot_download
weights_location = snapshot_download(repo_id="laion/CLIP-ViT-B-32-laion2B-s34B-b79K")
quantized_model = load_and_quantize_model(empty_model, weights_location=weights_location, 
                                          bnb_quantization_config=bnb_quantization_config, device_map = "auto")



meta


Fetching 14 files:   0%|          | 0/14 [00:00<?, ?it/s]

In [5]:
import os
import csv
from transformers import CLIPProcessor, CLIPModel, pipeline, CLIPImageProcessor
from PIL import Image
from transformers import AutoTokenizer, AutoConfig, AutoModel,AutoImageProcessor
import matplotlib.pyplot as plt
import time
import json

# Load the JSON data
with open('val_random.json', 'r') as f:
    data = [json.loads(line) for line in f]

# Extract image paths and labels
image_paths = [item['image'] for item in data]
labels = [item['caption'] for item in data]

repo_id =  "laion/CLIP-ViT-B-32-laion2B-s34B-b79K"

image_processor = AutoImageProcessor.from_pretrained(
    repo_id
)
tokenizer = AutoTokenizer.from_pretrained(repo_id, 
                                          config=AutoConfig.from_pretrained(repo_id))
model = quantized_model
clip_pipeline = pipeline(model=model,task="zero-shot-image-classification", tokenizer=tokenizer,
                    image_processor=image_processor, config=AutoConfig.from_pretrained(repo_id),
                        device_map="auto", model_kwargs={"load_in_8bit": True})

def chunks(lst, n):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), n):
        yield lst[i:i + n]

BATCH_SIZE = 256  # Adjust based on your available memory

all_predictions = []
all_true_labels = []
time_eval = []

for batch_paths, batch_labels in zip(chunks(image_paths, BATCH_SIZE), chunks(labels, BATCH_SIZE)):
    batch_images = []
    valid_labels = []
    for path, label in zip(batch_paths, batch_labels):
        try:
            batch_images.append(Image.open(path))
            valid_labels.append(label)
        except (FileNotFoundError, UnidentifiedImageError):
            continue  # Skip images that cannot be opened

    # Get predictions for the batch of images
    predictions = clip_pipeline(images=batch_images, candidate_labels=valid_labels)
    predicted_labels = [pred[0]['label'] for pred in predictions]  # Top prediction

    all_predictions.extend(predicted_labels)
    all_true_labels.extend(valid_labels)

correct_predictions = sum([true == pred for true, pred in zip(all_true_labels, all_predictions)])
accuracy = correct_predictions / len(all_true_labels)
print(f"Accuracy for model {i}: {accuracy * 100:.2f}%\n")




Accuracy for model 9: 85.71%



### our trained models

In [6]:
from accelerate.utils import BnbQuantizationConfig, load_and_quantize_model
from transformers import CLIPProcessor, CLIPModel, AutoTokenizer, AutoConfig, AutoModelForZeroShotImageClassification, pipeline, AutoImageProcessor
from PIL import Image, UnidentifiedImageError
import json

# Load the JSON data for evaluation
with open('val_random.json', 'r') as f:
    data = [json.loads(line) for line in f]

# Extract image paths and labels
image_paths = [item['image'] for item in data]
labels = [item['caption'] for item in data]

BATCH_SIZE = 256  # Adjust based on your available memory

# Define a function to process images in chunks
def chunks(lst, n):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), n):
        yield lst[i:i + n]

# Base configuration
repo_id = "laion/CLIP-ViT-B-32-laion2B-s34B-b79K"
bnb_quantization_config = BnbQuantizationConfig(load_in_8bit=True)

# Iterate over the 10 models
for i in range(1, 10):  # Assuming you have 19 finetuned models
    dir = f"./workspace/output/laion-finetuned_v5e7_epoch10_random80_vtrain{i}"
    print(f"Evaluating with model from {dir}...")
    
    # Load and quantize model
    with init_empty_weights():
        empty_model = AutoModelForZeroShotImageClassification.from_config(AutoConfig.from_pretrained(repo_id))

    # Load the model from the directory
    weights_location = f"{dir}/model.safetensors"
    quantized_model = load_and_quantize_model(empty_model, weights_location=weights_location, bnb_quantization_config=bnb_quantization_config, device_map="auto")

    # Initialize tokenizer and processor
    tokenizer = AutoTokenizer.from_pretrained(repo_id)
    image_processor = AutoImageProcessor.from_pretrained(repo_id)
    
    # Initialize the pipeline without the device argument
    clip_pipeline = pipeline(
        model=quantized_model,
        task="zero-shot-image-classification",
        tokenizer=tokenizer,
        image_processor=image_processor,
        config=AutoConfig.from_pretrained(repo_id),
        model_kwargs={"load_in_8bit": True}
    )

    all_predictions = []
    all_true_labels = []

        # Process images in chunks and evaluate
    for batch_paths, batch_labels in zip(chunks(image_paths, BATCH_SIZE), chunks(labels, BATCH_SIZE)):
        batch_images = []
        valid_labels = []
        for path, label in zip(batch_paths, batch_labels):
            try:
                # Open image and append to batch_images
                with Image.open(path) as img:
                    batch_images.append(img.convert("RGB"))
                valid_labels.append(label)
            except (FileNotFoundError, UnidentifiedImageError):
                # Skip images that cannot be opened
                continue

        # Get predictions for the batch of images
        try:
            predictions = clip_pipeline(images=batch_images, candidate_labels=valid_labels)
            predicted_labels = [pred[0]['label'] for pred in predictions]  # Top prediction
            all_predictions.extend(predicted_labels)
            all_true_labels.extend(valid_labels)
        except Exception as e:
            print(f"An error occurred during prediction: {e}")
            continue

    # Calculate the accuracy for the current model
    correct_predictions = sum([true == pred for true, pred in zip(all_true_labels, all_predictions)])
    accuracy = correct_predictions / len(all_true_labels) if all_true_labels else 0
    print(f"Accuracy for model {i}: {accuracy * 100:.2f}%\n")



Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_random80_vtrain1...




Accuracy for model 1: 97.06%

Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_random80_vtrain2...




Accuracy for model 2: 97.15%

Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_random80_vtrain3...




Accuracy for model 3: 98.11%

Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_random80_vtrain4...




Accuracy for model 4: 96.90%

Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_random80_vtrain5...




Accuracy for model 5: 96.60%

Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_random80_vtrain6...




Accuracy for model 6: 96.27%

Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_random80_vtrain7...




Accuracy for model 7: 95.59%

Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_random80_vtrain8...




Accuracy for model 8: 94.62%

Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_random80_vtrain9...




Accuracy for model 9: 92.34%



## kitchen and bathroom result

In [9]:
#original model
from accelerate.utils import BnbQuantizationConfig
bnb_quantization_config = BnbQuantizationConfig(load_in_8bit=True)
from accelerate.utils import load_and_quantize_model
from transformers import CLIPProcessor, CLIPModel, AutoModel
from accelerate import init_empty_weights
from transformers import AutoTokenizer, AutoConfig, AutoModel,AutoModelForZeroShotImageClassification
config = AutoConfig.from_pretrained("laion/CLIP-ViT-B-32-laion2B-s34B-b79K")
with init_empty_weights():
    empty_model = AutoModelForZeroShotImageClassification.from_config(config)
    #CLIP(config=AutoConfig.from_pretrained("openai/clip-vit-large-patch14"))

# Move the model to GPU 0
print(empty_model.device)
model = AutoModel.from_pretrained("laion/CLIP-ViT-B-32-laion2B-s34B-b79K")
from huggingface_hub import snapshot_download
weights_location = snapshot_download(repo_id="laion/CLIP-ViT-B-32-laion2B-s34B-b79K")
quantized_model = load_and_quantize_model(empty_model, weights_location=weights_location, 
                                          bnb_quantization_config=bnb_quantization_config, device_map = "auto")

import os
import csv
from transformers import CLIPProcessor, CLIPModel, pipeline, CLIPImageProcessor
from PIL import Image
from transformers import AutoTokenizer, AutoConfig, AutoModel,AutoImageProcessor
import matplotlib.pyplot as plt
import time
import json

# Load the JSON data
with open('kb_val.json', 'r') as f:
    data = [json.loads(line) for line in f]

# Extract image paths and labels
image_paths = [item['image'] for item in data]
labels = [item['caption'] for item in data]

repo_id =  "laion/CLIP-ViT-B-32-laion2B-s34B-b79K"

image_processor = AutoImageProcessor.from_pretrained(
    repo_id
)
tokenizer = AutoTokenizer.from_pretrained(repo_id, 
                                          config=AutoConfig.from_pretrained(repo_id))
model = quantized_model
clip_pipeline = pipeline(model=model,task="zero-shot-image-classification", tokenizer=tokenizer,
                    image_processor=image_processor, config=AutoConfig.from_pretrained(repo_id),
                        device_map="auto", model_kwargs={"load_in_8bit": True})

def chunks(lst, n):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), n):
        yield lst[i:i + n]

BATCH_SIZE = 256  # Adjust based on your available memory

all_predictions = []
all_true_labels = []
time_eval = []

for batch_paths, batch_labels in zip(chunks(image_paths, BATCH_SIZE), chunks(labels, BATCH_SIZE)):
    batch_images = []
    valid_labels = []
    for path, label in zip(batch_paths, batch_labels):
        try:
            batch_images.append(Image.open(path))
            valid_labels.append(label)
        except (FileNotFoundError, UnidentifiedImageError):
            continue  # Skip images that cannot be opened

    # Get predictions for the batch of images
    predictions = clip_pipeline(images=batch_images, candidate_labels=valid_labels)
    predicted_labels = [pred[0]['label'] for pred in predictions]  # Top prediction

    all_predictions.extend(predicted_labels)
    all_true_labels.extend(valid_labels)

correct_predictions = sum([true == pred for true, pred in zip(all_true_labels, all_predictions)])
accuracy = correct_predictions / len(all_true_labels)
print(f"Accuracy for model {i}: {accuracy * 100:.2f}%\n")


meta


Fetching 14 files:   0%|          | 0/14 [00:00<?, ?it/s]



Accuracy for model 0: 95.08%



In [10]:
from accelerate.utils import BnbQuantizationConfig, load_and_quantize_model
from transformers import CLIPProcessor, CLIPModel, AutoTokenizer, AutoConfig, AutoModelForZeroShotImageClassification, pipeline, AutoImageProcessor
from PIL import Image, UnidentifiedImageError
import json

# Load the JSON data for evaluation
with open('kb_val.json', 'r') as f:
    data = [json.loads(line) for line in f]

# Extract image paths and labels
image_paths = [item['image'] for item in data]
labels = [item['caption'] for item in data]

BATCH_SIZE = 256  # Adjust based on your available memory

# Define a function to process images in chunks
def chunks(lst, n):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), n):
        yield lst[i:i + n]

# Base configuration
repo_id = "laion/CLIP-ViT-B-32-laion2B-s34B-b79K"
bnb_quantization_config = BnbQuantizationConfig(load_in_8bit=True)

# Iterate over the 10 models
for i in range(1, 10):  # Assuming you have 19 finetuned models
    dir = f"./workspace/output/laion-finetuned_v5e7_epoch10_random80_vtrain{i}"
    print(f"Evaluating with model from {dir}...")
    
    # Load and quantize model
    with init_empty_weights():
        empty_model = AutoModelForZeroShotImageClassification.from_config(AutoConfig.from_pretrained(repo_id))

    # Load the model from the directory
    weights_location = f"{dir}/model.safetensors"
    quantized_model = load_and_quantize_model(empty_model, weights_location=weights_location, bnb_quantization_config=bnb_quantization_config, device_map="auto")

    # Initialize tokenizer and processor
    tokenizer = AutoTokenizer.from_pretrained(repo_id)
    image_processor = AutoImageProcessor.from_pretrained(repo_id)
    
    # Initialize the pipeline without the device argument
    clip_pipeline = pipeline(
        model=quantized_model,
        task="zero-shot-image-classification",
        tokenizer=tokenizer,
        image_processor=image_processor,
        config=AutoConfig.from_pretrained(repo_id),
        model_kwargs={"load_in_8bit": True}
    )

    all_predictions = []
    all_true_labels = []

        # Process images in chunks and evaluate
    for batch_paths, batch_labels in zip(chunks(image_paths, BATCH_SIZE), chunks(labels, BATCH_SIZE)):
        batch_images = []
        valid_labels = []
        for path, label in zip(batch_paths, batch_labels):
            try:
                # Open image and append to batch_images
                with Image.open(path) as img:
                    batch_images.append(img.convert("RGB"))
                valid_labels.append(label)
            except (FileNotFoundError, UnidentifiedImageError):
                # Skip images that cannot be opened
                continue

        # Get predictions for the batch of images
        try:
            predictions = clip_pipeline(images=batch_images, candidate_labels=valid_labels)
            predicted_labels = [pred[0]['label'] for pred in predictions]  # Top prediction
            all_predictions.extend(predicted_labels)
            all_true_labels.extend(valid_labels)
        except Exception as e:
            print(f"An error occurred during prediction: {e}")
            continue

    # Calculate the accuracy for the current model
    correct_predictions = sum([true == pred for true, pred in zip(all_true_labels, all_predictions)])
    accuracy = correct_predictions / len(all_true_labels) if all_true_labels else 0
    print(f"Accuracy for model {i}: {accuracy * 100:.2f}%\n")



Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_random80_vtrain1...




Accuracy for model 1: 97.07%

Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_random80_vtrain2...




Accuracy for model 2: 96.94%

Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_random80_vtrain3...




Accuracy for model 3: 96.94%

Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_random80_vtrain4...




Accuracy for model 4: 96.94%

Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_random80_vtrain5...




Accuracy for model 5: 96.81%

Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_random80_vtrain6...




Accuracy for model 6: 96.54%

Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_random80_vtrain7...




Accuracy for model 7: 96.01%

Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_random80_vtrain8...




Accuracy for model 8: 96.01%

Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_random80_vtrain9...




Accuracy for model 9: 95.74%



# quantize 4 bit

In [24]:

from accelerate.utils import BnbQuantizationConfig
bnb_quantization_config = BnbQuantizationConfig(load_in_8bit=True)
from accelerate.utils import load_and_quantize_model
from transformers import CLIPProcessor, CLIPModel, AutoModel
from accelerate import init_empty_weights
from transformers import AutoTokenizer, AutoConfig, AutoModel,AutoModelForZeroShotImageClassification
config = AutoConfig.from_pretrained("laion/CLIP-ViT-B-32-laion2B-s34B-b79K")
with init_empty_weights():
    empty_model = AutoModelForZeroShotImageClassification.from_config(config)
    #CLIP(config=AutoConfig.from_pretrained("openai/clip-vit-large-patch14"))

# Move the model to GPU 0
print(empty_model.device)
model = AutoModel.from_pretrained("laion/CLIP-ViT-B-32-laion2B-s34B-b79K")
from huggingface_hub import snapshot_download
weights_location = snapshot_download(repo_id="laion/CLIP-ViT-B-32-laion2B-s34B-b79K")
quantized_model = load_and_quantize_model(empty_model, weights_location=weights_location, 
                                          bnb_quantization_config=bnb_quantization_config, device_map = "auto")



meta


Fetching 14 files:   0%|          | 0/14 [00:00<?, ?it/s]

In [26]:
from accelerate.utils import BnbQuantizationConfig
bnb_quantization_config = BnbQuantizationConfig(load_in_8bit=True)
from accelerate.utils import load_and_quantize_model
from transformers import CLIPProcessor, CLIPModel, AutoModelForCausalLM
from accelerate import init_empty_weights
from transformers import AutoTokenizer, AutoConfig, AutoModel,AutoModelForZeroShotImageClassification
tokenizer = AutoTokenizer.from_pretrained("laion/CLIP-ViT-B-32-laion2B-s34B-b79K")
model = AutoModelForCausalLM.from_pretrained("laion/CLIP-ViT-B-32-laion2B-s34B-b79K", device_map="auto", load_in_4bit=True)

ValueError: Unrecognized configuration class <class 'transformers.models.clip.configuration_clip.CLIPConfig'> for this kind of AutoModel: AutoModelForCausalLM.
Model type should be one of BartConfig, BertConfig, BertGenerationConfig, BigBirdConfig, BigBirdPegasusConfig, BioGptConfig, BlenderbotConfig, BlenderbotSmallConfig, BloomConfig, CamembertConfig, LlamaConfig, CodeGenConfig, CpmAntConfig, CTRLConfig, Data2VecTextConfig, ElectraConfig, ErnieConfig, FalconConfig, FuyuConfig, GitConfig, GPT2Config, GPT2Config, GPTBigCodeConfig, GPTNeoConfig, GPTNeoXConfig, GPTNeoXJapaneseConfig, GPTJConfig, LlamaConfig, MarianConfig, MBartConfig, MegaConfig, MegatronBertConfig, MistralConfig, MixtralConfig, MptConfig, MusicgenConfig, MvpConfig, OpenLlamaConfig, OpenAIGPTConfig, OPTConfig, PegasusConfig, PersimmonConfig, PhiConfig, PLBartConfig, ProphetNetConfig, QDQBertConfig, Qwen2Config, ReformerConfig, RemBertConfig, RobertaConfig, RobertaPreLayerNormConfig, RoCBertConfig, RoFormerConfig, RwkvConfig, Speech2Text2Config, TransfoXLConfig, TrOCRConfig, WhisperConfig, XGLMConfig, XLMConfig, XLMProphetNetConfig, XLMRobertaConfig, XLMRobertaXLConfig, XLNetConfig, XmodConfig.

In [12]:
#original model
from accelerate.utils import BnbQuantizationConfig
from accelerate.utils import load_and_quantize_model
from transformers import CLIPProcessor, CLIPModel, AutoModel
from accelerate import init_empty_weights
from transformers import AutoTokenizer, AutoConfig, AutoModel,AutoModelForZeroShotImageClassification
config = AutoConfig.from_pretrained("laion/CLIP-ViT-B-32-laion2B-s34B-b79K")
with init_empty_weights():
    empty_model = AutoModelForZeroShotImageClassification.from_config(config)
    #CLIP(config=AutoConfig.from_pretrained("openai/clip-vit-large-patch14"))

# Move the model to GPU 0
print(empty_model.device)
model = AutoModel.from_pretrained("laion/CLIP-ViT-B-32-laion2B-s34B-b79K")
from huggingface_hub import snapshot_download
weights_location = snapshot_download(repo_id="laion/CLIP-ViT-B-32-laion2B-s34B-b79K")
quantized_model = load_and_quantize_model(empty_model, weights_location=weights_location, 
                                          bnb_quantization_config=bnb_quantization_config, device_map = "auto")

import os
import csv
from transformers import CLIPProcessor, CLIPModel, pipeline, CLIPImageProcessor
from PIL import Image
from transformers import AutoTokenizer, AutoConfig, AutoModel,AutoImageProcessor
import matplotlib.pyplot as plt
import time
import json

# Load the JSON data
with open('val_random.json', 'r') as f:
    data = [json.loads(line) for line in f]

# Extract image paths and labels
image_paths = [item['image'] for item in data]
labels = [item['caption'] for item in data]

repo_id =  "laion/CLIP-ViT-B-32-laion2B-s34B-b79K"

image_processor = AutoImageProcessor.from_pretrained(
    repo_id
)
tokenizer = AutoTokenizer.from_pretrained(repo_id, 
                                          config=AutoConfig.from_pretrained(repo_id))
model = quantized_model
clip_pipeline = pipeline(model=model,task="zero-shot-image-classification", tokenizer=tokenizer,
                    image_processor=image_processor, config=AutoConfig.from_pretrained(repo_id),
                        device_map="auto", model_kwargs={"load_in_8bit": True})

def chunks(lst, n):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), n):
        yield lst[i:i + n]

BATCH_SIZE = 256  # Adjust based on your available memory

all_predictions = []
all_true_labels = []
time_eval = []

for batch_paths, batch_labels in zip(chunks(image_paths, BATCH_SIZE), chunks(labels, BATCH_SIZE)):
    batch_images = []
    valid_labels = []
    for path, label in zip(batch_paths, batch_labels):
        try:
            batch_images.append(Image.open(path))
            valid_labels.append(label)
        except (FileNotFoundError, UnidentifiedImageError):
            continue  # Skip images that cannot be opened

    # Get predictions for the batch of images
    predictions = clip_pipeline(images=batch_images, candidate_labels=valid_labels)
    predicted_labels = [pred[0]['label'] for pred in predictions]  # Top prediction

    all_predictions.extend(predicted_labels)
    all_true_labels.extend(valid_labels)

correct_predictions = sum([true == pred for true, pred in zip(all_true_labels, all_predictions)])
accuracy = correct_predictions / len(all_true_labels)
print(f"Accuracy for model {i}: {accuracy * 100:.2f}%\n")


meta


Fetching 14 files:   0%|          | 0/14 [00:00<?, ?it/s]



Accuracy for model 9: 88.67%



In [22]:
from accelerate.utils import BnbQuantizationConfig, load_and_quantize_model
from transformers import CLIPProcessor, CLIPModel, AutoTokenizer, AutoConfig, AutoModelForZeroShotImageClassification, pipeline, AutoImageProcessor
from PIL import Image, UnidentifiedImageError
import json

# Load the JSON data for evaluation
with open('val_random.json', 'r') as f:
    data = [json.loads(line) for line in f]

# Extract image paths and labels
image_paths = [item['image'] for item in data]
labels = [item['caption'] for item in data]

BATCH_SIZE = 256  # Adjust based on your available memory

# Define a function to process images in chunks
def chunks(lst, n):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), n):
        yield lst[i:i + n]

# Base configuration
repo_id = "laion/CLIP-ViT-B-32-laion2B-s34B-b79K"

# Iterate over the 10 models
for i in range(1, 10):  # Assuming you have 19 finetuned models
    dir = f"./workspace/output/laion-finetuned_v5e7_epoch10_random80_vtrain{i}"
    print(f"Evaluating with model from {dir}...")
    
    # Load and quantize model
    with init_empty_weights():
        empty_model = AutoModelForZeroShotImageClassification.from_config(AutoConfig.from_pretrained(repo_id))

    # Load the model from the directory
    weights_location = f"{dir}/model.safetensors"
    quantized_model = load_and_quantize_model(empty_model, weights_location=weights_location, bnb_quantization_config=bnb_quantization_config, device_map="auto")

    # Initialize tokenizer and processor
    tokenizer = AutoTokenizer.from_pretrained(repo_id)
    image_processor = AutoImageProcessor.from_pretrained(repo_id)
    
    # Initialize the pipeline without the device argument
    clip_pipeline = pipeline(
        model=quantized_model,
        task="zero-shot-image-classification",
        tokenizer=tokenizer,
        image_processor=image_processor,
        config=AutoConfig.from_pretrained(repo_id),
        model_kwargs={"load_in_8bit": True}
    )

    all_predictions = []
    all_true_labels = []

        # Process images in chunks and evaluate
    for batch_paths, batch_labels in zip(chunks(image_paths, BATCH_SIZE), chunks(labels, BATCH_SIZE)):
        batch_images = []
        valid_labels = []
        for path, label in zip(batch_paths, batch_labels):
            try:
                # Open image and append to batch_images
                with Image.open(path) as img:
                    batch_images.append(img.convert("RGB"))
                valid_labels.append(label)
            except (FileNotFoundError, UnidentifiedImageError):
                # Skip images that cannot be opened
                continue

        # Get predictions for the batch of images
        try:
            predictions = clip_pipeline(images=batch_images, candidate_labels=valid_labels)
            predicted_labels = [pred[0]['label'] for pred in predictions]  # Top prediction
            all_predictions.extend(predicted_labels)
            all_true_labels.extend(valid_labels)
        except Exception as e:
            print(f"An error occurred during prediction: {e}")
            continue

    # Calculate the accuracy for the current model
    correct_predictions = sum([true == pred for true, pred in zip(all_true_labels, all_predictions)])
    accuracy = correct_predictions / len(all_true_labels) if all_true_labels else 0
    print(f"Accuracy for model {i}: {accuracy * 100:.2f}%\n")



Evaluating with model from ./workspace/output/laion-finetuned_v5e7_epoch10_random80_vtrain1...


NotImplementedError: Cannot copy out of meta tensor; no data!

# Efficiency comparison

In [1]:
import pandas as pd
from collections import Counter
import os
import json
from transformers import pipeline, AutoTokenizer, AutoConfig, AutoModel, AutoImageProcessor
from PIL import Image, UnidentifiedImageError
import torch
import torch.profiler

def chunks(lst, n):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), n):
        yield lst[i:i + n]

def calculate_category_accuracy(true_labels, predicted_labels):
    """Calculate the accuracy for each category and return it as a dictionary."""
    accuracies = {}
    true_labels_counter = Counter(true_labels)
    correct_predictions_counter = Counter([true for true, pred in zip(true_labels, predicted_labels) if true == pred])
    
    for label in true_labels_counter:
        accuracy = (correct_predictions_counter[label] / true_labels_counter[label]) if label in correct_predictions_counter else 0
        accuracies[label] = accuracy
    
    return accuracies

# Load the JSON data
with open('kb_val.json', 'r') as f:
    data = [json.loads(line) for line in f]

# Extract image paths and labels
image_paths = [item['image'] for item in data]
labels = [item['caption'] for item in data]

BATCH_SIZE = 1024  # Adjust based on your available memory
repo_id = "laion/CLIP-ViT-B-32-laion2B-s34B-b79K"

image_processor = AutoImageProcessor.from_pretrained(repo_id)
tokenizer = AutoTokenizer.from_pretrained(repo_id, config=AutoConfig.from_pretrained(repo_id))
model = AutoModel.from_pretrained(repo_id).cuda()
clip_pipeline = pipeline(model=model, task="zero-shot-image-classification", tokenizer=tokenizer,
                         device=1, image_processor=image_processor, config=AutoConfig.from_pretrained(repo_id))

all_predictions = []
all_true_labels = []

def trace_handler(profiler):
    print("Trace handler called")
    try:
        print(profiler.key_averages().table(sort_by="cpu_time_total", row_limit=10))
        profiler.export_chrome_trace("trace.json")
    except Exception as e:
        print(f"Error in trace handler: {str(e)}")
# Start profiling
with torch.profiler.profile(
    activities=[torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA],
    schedule=torch.profiler.schedule(wait=1, warmup=1, active=3),
    on_trace_ready=trace_handler,
    record_shapes=True,
    profile_memory=True,
    with_stack=True
) as profiler:
    for batch_paths, batch_labels in zip(chunks(image_paths, BATCH_SIZE), chunks(labels, BATCH_SIZE)):
        batch_images = []
        valid_labels = []
        for path, label in zip(batch_paths, batch_labels):
            try:
                batch_images.append(Image.open(path).convert('RGB'))
                valid_labels.append(label)
            except (FileNotFoundError, UnidentifiedImageError):
                print(f"Skipping file: {path}, unable to open or not found.")
                continue  # Skip images that cannot be opened

        if batch_images:  # Ensure there are images to predict
            predictions = clip_pipeline(images=batch_images, candidate_labels=valid_labels)
            predicted_labels = [pred[0]['label'] for pred in predictions]  # Top prediction

            all_predictions.extend(predicted_labels)
            all_true_labels.extend(valid_labels)

        profiler.step()  # Advance the profiler

# Calculate and print overall accuracy
correct_predictions = sum([true == pred for true, pred in zip(all_true_labels, all_predictions)])
accuracy = correct_predictions / len(all_true_labels) if all_true_labels else 0
print(f"Accuracy for model org: {accuracy * 100:.2f}%\n")

# Calculate and print category accuracies
category_accuracies = calculate_category_accuracy(all_true_labels, all_predictions)
print("Accuracy for each category:")
for category, accuracy in category_accuracies.items():
    print(f"{category}: {accuracy:.2f}")

# Convert the dictionary to a DataFrame
category_accuracy_df = pd.DataFrame(list(category_accuracies.items()), columns=['Category', 'Accuracy'])

print(category_accuracy_df)




Accuracy for model org: 73.67%

Accuracy for each category:
a photo of standard bathroom: 0.92
a photo of standard kitchen: 0.98
a photo of contemporary bathroom: 0.52
a photo of contemporary kitchen: 0.39
                           Category  Accuracy
0      a photo of standard bathroom  0.920690
1       a photo of standard kitchen  0.980892
2  a photo of contemporary bathroom  0.516949
3   a photo of contemporary kitchen  0.385027


STAGE:2024-04-22 17:39:10 514141:514141 ActivityProfilerController.cpp:311] Completed Stage: Warm Up
STAGE:2024-04-22 17:39:10 514141:514141 ActivityProfilerController.cpp:317] Completed Stage: Collection
STAGE:2024-04-22 17:39:10 514141:514141 ActivityProfilerController.cpp:321] Completed Stage: Post Processing


In [None]:
import pandas as pd
from collections import Counter
import os
import json
from transformers import pipeline, AutoTokenizer, AutoConfig, AutoModel, AutoImageProcessor
from PIL import Image, UnidentifiedImageError
import torch
import torch.profiler

def chunks(lst, n):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), n):
        yield lst[i:i + n]

def calculate_category_accuracy(true_labels, predicted_labels):
    """Calculate the accuracy for each category and return it as a dictionary."""
    accuracies = {}
    true_labels_counter = Counter(true_labels)
    correct_predictions_counter = Counter([true for true, pred in zip(true_labels, predicted_labels) if true == pred])
    
    for label in true_labels_counter:
        accuracy = (correct_predictions_counter[label] / true_labels_counter[label]) if label in correct_predictions_counter else 0
        accuracies[label] = accuracy
    
    return accuracies

# Load the JSON data
with open('kb_val.json', 'r') as f:
    data = [json.loads(line) for line in f]

# Extract image paths and labels
image_paths = [item['image'] for item in data]
labels = [item['caption'] for item in data]

BATCH_SIZE = 1024  # Adjust based on your available memory
repo_id = "laion/CLIP-ViT-B-32-laion2B-s34B-b79K"
model_dir = f"./workspace/output/backup_ckpts/laion-finetuned_v5e7_epoch10_fold1_threshold3"

image_processor = AutoImageProcessor.from_pretrained(repo_id)
tokenizer = AutoTokenizer.from_pretrained(repo_id, config=AutoConfig.from_pretrained(repo_id))
model = AutoModel.from_pretrained(model_dir).cuda()
clip_pipeline = pipeline(model=model, task="zero-shot-image-classification", tokenizer=tokenizer,
                         device=1, image_processor=image_processor, config=AutoConfig.from_pretrained(model_dir))

all_predictions = []
all_true_labels = []

def trace_handler(profiler):
    print("Trace handler called")
    try:
        print(profiler.key_averages().table(sort_by="cpu_time_total", row_limit=10))
        profiler.export_chrome_trace("trace.json")
    except Exception as e:
        print(f"Error in trace handler: {str(e)}")
# Start profiling
with torch.profiler.profile(
    activities=[torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA],
    schedule=torch.profiler.schedule(wait=1, warmup=1, active=3),
    on_trace_ready=trace_handler,
    record_shapes=True,
    profile_memory=True,
    with_stack=True
) as profiler:
    for batch_paths, batch_labels in zip(chunks(image_paths, BATCH_SIZE), chunks(labels, BATCH_SIZE)):
        batch_images = []
        valid_labels = []
        for path, label in zip(batch_paths, batch_labels):
            try:
                batch_images.append(Image.open(path).convert('RGB'))
                valid_labels.append(label)
            except (FileNotFoundError, UnidentifiedImageError):
                print(f"Skipping file: {path}, unable to open or not found.")
                continue  # Skip images that cannot be opened

        if batch_images:  # Ensure there are images to predict
            predictions = clip_pipeline(images=batch_images, candidate_labels=valid_labels)
            predicted_labels = [pred[0]['label'] for pred in predictions]  # Top prediction

            all_predictions.extend(predicted_labels)
            all_true_labels.extend(valid_labels)

        profiler.step()  # Advance the profiler

# Calculate and print overall accuracy
correct_predictions = sum([true == pred for true, pred in zip(all_true_labels, all_predictions)])
accuracy = correct_predictions / len(all_true_labels) if all_true_labels else 0
print(f"Accuracy for model org: {accuracy * 100:.2f}%\n")

# Calculate and print category accuracies
category_accuracies = calculate_category_accuracy(all_true_labels, all_predictions)
print("Accuracy for each category:")
for category, accuracy in category_accuracies.items():
    print(f"{category}: {accuracy:.2f}")

# Convert the dictionary to a DataFrame
category_accuracy_df = pd.DataFrame(list(category_accuracies.items()), columns=['Category', 'Accuracy'])

print(category_accuracy_df)
