# **BLIP model fine-tuning**

**Datasets used**



*   [Control Net Deep Fashion](https://huggingface.co/datasets/ldhnam/deepfashion_controlnet)
*   [Deep Fashion with masks](https://huggingface.co/datasets/SaffalPoosh/deepFashion-with-masks)



# Install Dependencies



In [1]:
from huggingface_hub import notebook_login
# Open the interactive Hugging Face login widget so that the `push_to_hub`
# calls later in this notebook can authenticate.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Install the Hugging Face libraries used by this notebook.
# NOTE(review): the code visible here only exercises the PyTorch backend; the
# tf-cpu and flax extras are presumably not needed -- confirm before removing.
!pip install transformers
!pip install datasets
!pip install 'transformers[torch]'
!pip install 'transformers[tf-cpu]'
!pip install 'transformers[flax]'



**Testing the imports**

**Loading the datasets**

In [None]:
from datasets import load_dataset

In [None]:
# Load the DeepFashion-with-masks dataset from the Hugging Face Hub; the result
# is indexed by split name below (e.g. "train").
saffal_possh_df = load_dataset("SaffalPoosh/deepFashion-with-masks")

In [None]:
# Checking a simple sample from the dataset: print the metadata of the first
# row of every split. Indexing a split by column name (split["gender"]) would
# print the whole column instead of one sample.
for split_name, split in saffal_possh_df.items():
    if len(split) == 0:
        # Nothing to show for an empty split.
        continue
    first_row = split[0]
    print(first_row["gender"])
    print(first_row["cloth_type"])
    print(first_row["caption"])


In [None]:
# Load the ControlNet DeepFashion dataset from the Hugging Face Hub; the result
# is indexed by split name below (e.g. "train").
control_net_deep_fashion = load_dataset("ldhnam/deepfashion_controlnet")

In [None]:
# Checking a simple sample from the dataset: print the caption of the first row
# of every split. Indexing a split by column name (split["caption"]) would
# print the whole column instead of one sample.
for split_name, split in control_net_deep_fashion.items():
    if len(split) == 0:
        # Nothing to show for an empty split.
        continue
    print(split[0]["caption"])


# **Pseudo-label and Pseudo-Questions**



In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM


# Checkpoint used to generate question/answer pairs from captions; the
# tokenizer and the seq2seq model are loaded from the same repository.
qa_generation_checkpoint = "potsawee/t5-large-generation-squad-QuestionAnswer"

tokenizer = AutoTokenizer.from_pretrained(qa_generation_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(qa_generation_checkpoint)

In [None]:
from tqdm import tqdm
import numpy as np

# Trigger words for question generation: the function below only runs the
# question generator on a caption whose third space-separated token is one of
# these words.
# NOTE(review): "is wearing" contains a space, so it can never equal a single
# token produced by `caption.split(" ")`; that entry is effectively dead.
action_key_words = ["in", "wearing", "standing", "is wearing",
                      "posing", "sitting", "walking", "carrying",
                      "leaning"]

# Question/answer pair used when a caption is not suitable for question
# generation or when the generator output cannot be parsed.
_FALLBACK_QUESTION = "Is there a person in the image?"
_FALLBACK_ANSWER = "Yes, there it is"


def _generate_question_answer(caption):
    """Generate one (question, answer) pair from *caption*.

    Relies on the module-level ``tokenizer`` and ``model``.

    Returns:
        A ``(question, answer)`` tuple with surrounding whitespace removed, or
        ``None`` when the decoded text does not consist of exactly one
        question and one answer around the tokenizer's separator token.
    """
    inputs = tokenizer(caption, return_tensors="pt")
    outputs = model.generate(**inputs, max_length=100)

    # Special tokens are kept while decoding because the separator token is
    # what delimits the question from the answer.
    decoded = tokenizer.decode(outputs[0], skip_special_tokens=False)
    decoded = decoded.replace(tokenizer.pad_token, "").replace(tokenizer.eos_token, "")

    parts = decoded.split(tokenizer.sep_token)
    if len(parts) != 2:
        # Zero or several separators: the output cannot be unpacked reliably.
        return None
    question, answer = parts
    return question.strip(), answer.strip()


# creating pseudo questions
def create_pseudo_questions_for_saffal_possh(data, size=200):
    """Attach pseudo questions and answers to the first rows of ``data["train"]``.

    For each caption whose third space-separated token is in
    ``action_key_words`` a question/answer pair is generated with the
    module-level ``tokenizer`` and ``model``. Every other caption (including
    captions shorter than three tokens, or captions for which the generator
    output cannot be parsed) receives a fixed fallback pair.

    Args:
        data: Mapping with a "train" entry that supports slicing and yields a
            dict of columns containing a "caption" list.
        size: Number of leading rows of the train split to process.

    Returns:
        The sliced dict of columns extended with three new lists of equal
        length: "questions", "answers" and "input_ids" (the running index of
        each sample within the slice).

    Raises:
        KeyError: If the slice has no "caption" column.
    """
    dataset_selection = data["train"][0: size]
    questions = []
    answers = []
    input_ids = []

    print("Loading the dataset..")

    for sample_id, caption in enumerate(tqdm(dataset_selection["caption"])):
        caption_tokens = caption.split(" ")

        pair = None
        # Guard the index: short captions have no third token to inspect.
        if len(caption_tokens) > 2 and caption_tokens[2] in action_key_words:
            pair = _generate_question_answer(caption)
        if pair is None:
            pair = (_FALLBACK_QUESTION, _FALLBACK_ANSWER)

        question, answer = pair
        questions.append(question)
        answers.append(answer)
        input_ids.append(sample_id)

    dataset_selection["questions"] = questions
    dataset_selection["answers"] = answers
    dataset_selection["input_ids"] = input_ids

    return dataset_selection

# Build question/answer columns for the first rows of the Saffal train split
# and print the resulting dict of columns.
saffal_possh_df_processed = create_pseudo_questions_for_saffal_possh(saffal_possh_df)
print(saffal_possh_df_processed)

# The same helper is reused for the ControlNet dataset: it only relies on a
# "train" split with a "caption" column.
control_net_deep_fashion_processed = create_pseudo_questions_for_saffal_possh(control_net_deep_fashion)
print(control_net_deep_fashion_processed)


In [None]:
from datasets import Dataset

# Wrap the processed dicts of columns back into `Dataset` objects.
saffal_dataset = Dataset.from_dict(saffal_possh_df_processed)
control_net_dataset = Dataset.from_dict(control_net_deep_fashion_processed)

# Show the schema and row count of both datasets.
for built_dataset in (saffal_dataset, control_net_dataset):
    print(built_dataset)

In [None]:
# Columns that the fine-tuning dataset wrapper defined later does not read.
saffal_unused_columns = ["gender", "pose", "cloth_type", "pid", "mask", "mask_overlay", "caption"]
control_net_unused_columns = ["openpose", "cloth", "caption"]

saffal_dataset = saffal_dataset.remove_columns(saffal_unused_columns)
control_net_dataset = control_net_dataset.remove_columns(control_net_unused_columns)

In [None]:
# Show what is left in each dataset after dropping the unused columns.
for trimmed_dataset in (saffal_dataset, control_net_dataset):
    print(trimmed_dataset)

In [None]:
from PIL import Image

# Index the row first and the column second: `saffal_dataset['images'][0]`
# would materialise the whole image column just to read one element.
image = saffal_dataset[0]['images']
image

In [None]:
# Index the row first and the column second: `control_net_dataset['image'][0]`
# would materialise the whole image column just to read one element.
image = control_net_dataset[0]['image']
image

**Structuring the dataset for PyTorch model training**

In [None]:
import torch

# The dataset structure and the model training loop are based on
# https://github.com/dino-chiio/blip-vqa-finetune/blob/main/finetuning.py

class GenericFashionDataset(torch.utils.data.Dataset):
    """Torch dataset that turns fashion question/answer rows into BLIP inputs.

    Each row of the wrapped dataset must provide "questions", "answers" and
    an image stored under either "images" or "image".
    """

    def __init__(self, dataset, processor):
        """Store the row-indexable dataset and the processor used to encode it.

        Args:
            dataset: Row-indexable dataset (``dataset[idx]`` returns a mapping
                of column name to value) that also supports ``len()``.
            processor: Callable taking ``(image, text, ...)`` and exposing a
                ``tokenizer`` attribute, e.g. a BLIP processor.
        """
        self.dataset = dataset
        self.processor = processor

    def __len__(self):
        """Return the number of rows in the wrapped dataset."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Encode row *idx* into the processor's outputs plus "labels".

        Returns:
            The processor's encoding of the image and question, extended with
            a "labels" entry holding the answer token ids padded/truncated to
            8 tokens. The leading batch dimension is removed from every tensor.
        """
        # Fetch the row once; indexing by column first would load the entire
        # column (every image included) on each access.
        row = self.dataset[idx]
        question = row['questions']
        answer = row['answers']
        # The two source datasets name their image column differently.
        image = row['images'] if 'images' in row else row['image']

        encoding = self.processor(image, question, padding="max_length", truncation=True, return_tensors="pt")
        # Fixed-length labels so that samples can be stacked into a batch.
        labels = self.processor.tokenizer.encode(
            answer, max_length=8, padding="max_length", truncation=True, return_tensors='pt'
        )
        encoding["labels"] = labels

        # Drop the batch dimension of size 1 added by return_tensors="pt".
        for key, value in encoding.items():
            encoding[key] = value.squeeze(0)
        return encoding

In [None]:
from transformers import BlipProcessor, BlipForQuestionAnswering

# Base BLIP VQA checkpoint shared by the model and its processor.
# NOTE: this rebinds `model`, which earlier in the notebook referred to the
# question-generation model.
blip_checkpoint = "Salesforce/blip-vqa-base"
model = BlipForQuestionAnswering.from_pretrained(blip_checkpoint)
processor = BlipProcessor.from_pretrained(blip_checkpoint)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

In [None]:
# Wrap both Hugging Face datasets so that each item is encoded with the BLIP
# processor when it is accessed.
saffal_train_dataset = GenericFashionDataset(dataset=saffal_dataset, processor=processor)
control_net_train_dataset = GenericFashionDataset(dataset=control_net_dataset, processor=processor)

In [None]:
from torch.utils.data import DataLoader

batch_size = 2

# Both loaders share the same configuration: fixed sample order and pinned
# host memory.
loader_options = dict(batch_size=batch_size, shuffle=False, pin_memory=True)

saffal_train_dataloader = DataLoader(saffal_train_dataset, **loader_options)
control_net_train_dataloader = DataLoader(control_net_train_dataset, **loader_options)

# **Model Train**


**deepFashion-with-masks**

In [None]:
def train_model(data_loader, num_epochs=50, patience=5):
    """Fine-tune the module-level ``model`` on the batches of ``data_loader``.

    Uses AdamW (lr=4e-5) with an exponential learning-rate decay of 0.9 per
    epoch. Mixed precision and gradient scaling are enabled only when the
    module-level ``device`` is a CUDA device.

    Args:
        data_loader: Iterable of batches, each a mapping with "input_ids",
            "pixel_values" and "labels" tensors.
        num_epochs: Number of passes over ``data_loader``.
        patience: Currently unused; kept so existing callers keep working
            (no early stopping is implemented).

    Returns:
        A ``(model, information)`` tuple, where ``information`` holds one
        ``(mean training loss, learning rate)`` tuple per epoch.
    """
    optimizer = torch.optim.AdamW(model.parameters(), lr=4e-5)
    scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)

    # float16 autocast and loss scaling only make sense on CUDA; on CPU both
    # are disabled so the loop runs in plain float32.
    use_amp = device.type == "cuda"
    scaler = torch.cuda.amp.GradScaler(enabled=use_amp)

    # Average over the loader actually being trained on; guard against an
    # empty loader to avoid a division by zero.
    num_batches = max(len(data_loader), 1)

    information = []

    for epoch in range(num_epochs):
        epoch_loss = 0.0
        model.train()
        for batch in tqdm(data_loader, desc='Training batch: ...'):
            input_ids = batch.pop('input_ids').to(device)
            pixel_values = batch.pop('pixel_values').to(device)
            labels = batch.pop('labels').to(device)

            optimizer.zero_grad()

            with torch.amp.autocast(device_type='cuda', dtype=torch.float16, enabled=use_amp):
                outputs = model(input_ids=input_ids,
                                pixel_values=pixel_values,
                                labels=labels)

            loss = outputs.loss
            epoch_loss += loss.item()

            # With the scaler disabled these calls reduce to a plain
            # backward pass followed by optimizer.step().
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

        mean_loss = epoch_loss / num_batches
        current_lr = optimizer.param_groups[0]["lr"]
        information.append((mean_loss, current_lr))
        print("Epoch: {} - Training loss: {} - LR: {}".format(epoch + 1, mean_loss, current_lr))
        scheduler.step()
    return model, information


**Training a model for Saffal Dataset**

In [None]:
# Fine-tune for a single epoch on the Saffal loader; `information` receives one
# (mean loss, learning rate) tuple per epoch.
model, information = train_model(saffal_train_dataloader, num_epochs=1)

In [None]:
import pickle as pk

# NOTE(review): `save_pretrained` takes a save directory, so despite the ".pt"
# suffix this path is presumably created as a folder -- confirm on the drive.
model_path = "/content/drive/MyDrive/Hvar/saffal_fashion_model.pt"
model.save_pretrained(model_path)  # saving in the drive

results_path = "/content/drive/MyDrive/Hvar/saffal_fashion_model_train.pkl"
# The context manager guarantees the pickle file is flushed and closed.
with open(results_path, "wb") as results_file:
    pk.dump(information, results_file)

**Pushing model to Hugging Face**

In [None]:
# Upload the model fine-tuned on the Saffal data to this Hugging Face Hub repository.
model_repo_name = "wiusdy/blip_pretrained_saffal_fashion_finetuning"
model.push_to_hub(model_repo_name)

**Training a model for the Control Net dataset**

In [None]:
# Fine-tune for a single epoch on the ControlNet loader.
# NOTE(review): `train_model` trains the global `model`, which at this point has
# already been fine-tuned on the Saffal data; unless the base checkpoint is
# reloaded first, this run continues from those weights -- confirm intended.
model, information = train_model(control_net_train_dataloader, num_epochs=1)

In [None]:
# NOTE(review): `save_pretrained` takes a save directory, so despite the ".pt"
# suffix this path is presumably created as a folder -- confirm on the drive.
model_path = "/content/drive/MyDrive/Hvar/control_net_fashion_model.pt"
model.save_pretrained(model_path)  # saving in the drive

results_path = "/content/drive/MyDrive/Hvar/control_net_fashion_model_train.pkl"
# The context manager guarantees the pickle file is flushed and closed.
with open(results_path, "wb") as results_file:
    pk.dump(information, results_file)

**Pushing model to hugging face**

In [None]:
# Upload the model trained on the ControlNet data to this Hugging Face Hub repository.
model_repo_name = "wiusdy/blip_pretrained_control_net_fashion_finetuning"
model.push_to_hub(model_repo_name)