Spaces:

Steven-GU-Yu-Di
/

ISOM5240-Group4-git-base-vqa

Runtime error

App Files Files Community

Steven-GU-Yu-Di committed on Mar 22

Commit

d1be192

•

1 Parent(s): 6b9548b

Upload 14 files

Browse files

Files changed (14) hide show

app.py +35 -0
blip-vqa-finetune-main/.DS_Store +0 -0
blip-vqa-finetune-main/Data/.DS_Store +0 -0
blip-vqa-finetune-main/Data/test_data/.DS_Store +0 -0
blip-vqa-finetune-main/Data/test_data/14398/data.json +4 -0
blip-vqa-finetune-main/Data/test_data/14398/image.png +0 -0
blip-vqa-finetune-main/Data/train.jsonl +1 -0
blip-vqa-finetune-main/Data/train_fill_in_blank/.DS_Store +0 -0
blip-vqa-finetune-main/Data/train_fill_in_blank/77070/data.json +7 -0
blip-vqa-finetune-main/Data/train_fill_in_blank/77070/image.png +0 -0
blip-vqa-finetune-main/README.md +31 -0
blip-vqa-finetune-main/finetuning.py +129 -0
blip-vqa-finetune-main/prediction.py +54 -0
blip-vqa-finetune-main/requirements.txt +7 -0

app.py ADDED Viewed

	@@ -0,0 +1,35 @@

+import os
+os.system('pip install torch')
+os.system('pip install transformers')
+from PIL import Image
+import io
+import streamlit as st
+from transformers import pipeline
+vqa_pipeline = pipeline("visual-question-answering", model="microsoft/git-base-vqav2")
+tts_pipeline = pipeline("text-to-speech", "suno/bark")
+def main():
+    st.title("Visual Question Answering & Text-to-Audio App")
+    image = st.file_uploader("Upload an image", type=["jpg", "png"])
+    question = st.text_input("Enter your question")
+    if image and question:
+        image = Image.open(io.BytesIO(image.getvalue()))
+        vqa_result = vqa_pipeline({"image": image, "question": question})
+        answer = vqa_result[0]['answer']
+        st.write(f"Answer: {answer}")
+        if st.button("Convert Answer to Audio"):
+            tts_result = tts_pipeline(answer)
+            audio_data = tts_result['audio']
+            st.audio(audio_data, format="audio/ogg")
+if __name__ == "__main__":
+    main()

blip-vqa-finetune-main/.DS_Store ADDED Viewed

Binary file (6.15 kB). View file

blip-vqa-finetune-main/Data/.DS_Store ADDED Viewed

Binary file (6.15 kB). View file

blip-vqa-finetune-main/Data/test_data/.DS_Store ADDED Viewed

Binary file (6.15 kB). View file

blip-vqa-finetune-main/Data/test_data/14398/data.json ADDED Viewed

	@@ -0,0 +1,4 @@

+{
+    "question": "Move the ruler to measure the length of the sword to the nearest inch. The sword is about (_) inches long.",
+    "id": "14398"
+}

blip-vqa-finetune-main/Data/test_data/14398/image.png ADDED Viewed

blip-vqa-finetune-main/Data/train.jsonl ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"question": "How many shapes are purple?", "answer": "3", "ques_type": "fill_in_blank", "grade": "kindergarten", "label": "Q9", "pid": "77070"}

blip-vqa-finetune-main/Data/train_fill_in_blank/.DS_Store ADDED Viewed

Binary file (6.15 kB). View file

blip-vqa-finetune-main/Data/train_fill_in_blank/77070/data.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "question": "How many shapes are purple?",
+  "answer": "3",
+  "ques_type": "fill_in_blank",
+  "grade": "kindergarten",
+  "label": "Q9"
+}

blip-vqa-finetune-main/Data/train_fill_in_blank/77070/image.png ADDED Viewed

blip-vqa-finetune-main/README.md ADDED Viewed

	@@ -0,0 +1,31 @@

+# Visual Question Answering using BLIP pre-trained model!
+This implementation applies the BLIP pre-trained model to solve the icon domain task.
+![The BLIP model for VQA task](https://i.postimg.cc/ncnxSnJw/image.png)
+|  ![enter image description here](https://i.postimg.cc/1zSYsrmm/image.png)|  |
+|--|--|
+| How many dots are there? | 36 |
+# Description
+**Note:** The test dataset does not have labels. I evaluated the model through a Kaggle competition and achieved 96% accuracy. Alternatively, you can hold out a partition of the training set and use it as a test set.
+## Create data folder
+Copy all the data into the `Data` folder, following the layout of the included example.
+You can download data [here](https://drive.google.com/file/d/1tt6qJbOgevyPpfkylXpKYy-KaT4_aCYZ/view?usp=sharing)
+## Install requirements.txt
+    pip install -r requirements.txt
+## Run finetuning code
+    python finetuning.py
+## Run prediction
+    python prediction.py
+### References:
+> Nguyen Van Tuan (2023). JAIST_Advanced Machine Learning_Visual_Question_Answering

blip-vqa-finetune-main/finetuning.py ADDED Viewed

	@@ -0,0 +1,129 @@

+import os
+import requests
+from transformers import BlipProcessor, BlipForQuestionAnswering
+from datasets import load_dataset
+import torch
+from PIL import Image
+from torch.utils.data import DataLoader
+from tqdm import tqdm
+import pickle
+# Load the pretrained BLIP VQA base checkpoint and its matching processor.
+model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")
+processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
+# Use the GPU when one is available, otherwise fall back to the CPU.
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+model.to(device)
+# NOTE(review): presumably meant to release cached GPU memory before training - confirm it is needed.
+torch.cuda.empty_cache()
+# Fix the PyTorch RNG seed for the rest of the script.
+torch.manual_seed(42)
+class VQADataset(torch.utils.data.Dataset):
+    """VQA (v2) dataset."""
+    def __init__(self, dataset, processor):
+        self.dataset = dataset
+        self.processor = processor
+    def __len__(self):
+        return len(self.dataset)
+    def __getitem__(self, idx):
+        # get image + text
+        question = self.dataset[idx]['question']
+        answer = self.dataset[idx]['answer']
+        image_id = self.dataset[idx]['pid']
+        image_path = f"Data/train_fill_in_blank/{image_id}/image.png"
+        image = Image.open(image_path).convert("RGB")
+        text = question
+        encoding = self.processor(image, text, padding="max_length", truncation=True, return_tensors="pt")
+        labels = self.processor.tokenizer.encode(
+            answer, max_length= 8, pad_to_max_length=True, return_tensors='pt'
+        )
+        encoding["labels"] = labels
+        # remove batch dimension
+        for k,v in encoding.items():  encoding[k] = v.squeeze()
+        return encoding
+training_dataset = load_dataset("json", data_files="Data/train.jsonl", split="train[:90%]")
+valid_dataset = load_dataset("json", data_files="Data/train.jsonl", split="train[90%:]")
+print("Training sets: {} - Validating set: {}".format(len(training_dataset), len(valid_dataset)))
+train_dataset = VQADataset(dataset=training_dataset,
+                          processor=processor)
+valid_dataset = VQADataset(dataset=valid_dataset,
+                          processor=processor)
+batch_size = 12
+train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False, pin_memory=True)
+valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False, pin_memory=True)
+optimizer = torch.optim.AdamW(model.parameters(), lr=4e-5)
+scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9, last_epoch=-1, verbose=False)
+num_epochs = 100
+patience = 10
+min_eval_loss = float("inf")
+early_stopping_hook = 0
+tracking_information = []
+scaler = torch.cuda.amp.GradScaler()
+for epoch in range(num_epochs):
+    epoch_loss = 0
+    model.train()
+    for idx, batch in zip(tqdm(range(len(train_dataloader)), desc='Training batch: ...'), train_dataloader):
+        input_ids = batch.pop('input_ids').to(device)
+        pixel_values = batch.pop('pixel_values').to(device)
+        attention_masked = batch.pop('attention_mask').to(device)
+        labels = batch.pop('labels').to(device)
+        with torch.amp.autocast(device_type='cuda', dtype=torch.float16):
+            outputs = model(input_ids=input_ids,
+                        pixel_values=pixel_values,
+                        # attention_mask=attention_masked,
+                        labels=labels)
+        loss = outputs.loss
+        epoch_loss += loss.item()
+        # loss.backward()
+        # optimizer.step()
+        optimizer.zero_grad()
+        scaler.scale(loss).backward()
+        scaler.step(optimizer)
+        scaler.update()
+    model.eval()
+    eval_loss = 0
+    for idx, batch in zip(tqdm(range(len(valid_dataloader)), desc='Validating batch: ...'), valid_dataloader):
+        input_ids = batch.pop('input_ids').to(device)
+        pixel_values = batch.pop('pixel_values').to(device)
+        attention_masked = batch.pop('attention_mask').to(device)
+        labels = batch.pop('labels').to(device)
+        with torch.amp.autocast(device_type='cuda', dtype=torch.float16):
+            outputs = model(input_ids=input_ids,
+                        pixel_values=pixel_values,
+                        attention_mask=attention_masked,
+                        labels=labels)
+        loss = outputs.loss
+        eval_loss += loss.item()
+    tracking_information.append((epoch_loss/len(train_dataloader), eval_loss/len(valid_dataloader), optimizer.param_groups[0]["lr"]))
+    print("Epoch: {} - Training loss: {} - Eval Loss: {} - LR: {}".format(epoch+1, epoch_loss/len(train_dataloader), eval_loss/len(valid_dataloader), optimizer.param_groups[0]["lr"]))
+    scheduler.step()
+    if eval_loss < min_eval_loss:
+        model.save_pretrained("Model/blip-saved-model", from_pt=True)
+        print("Saved model to Model/blip-saved-model")
+        min_eval_loss = eval_loss
+        early_stopping_hook = 0
+    else:
+        early_stopping_hook += 1
+        if early_stopping_hook > patience:
+            break
+pickle.dump(tracking_information, open("tracking_information.pkl", "wb"))
+print("The finetuning process has done!")

blip-vqa-finetune-main/prediction.py ADDED Viewed

	@@ -0,0 +1,54 @@

+from transformers import ViltProcessor, ViltForQuestionAnswering
+from transformers import BlipProcessor, BlipForQuestionAnswering
+import requests
+from PIL import Image
+import json, os, csv
+import logging
+from tqdm import tqdm
+import torch
+# Set the path to your test data directory
+test_data_dir = "Data/test_data/test_data"
+# processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
+# model = ViltForQuestionAnswering.from_pretrained("test_model/checkpoint-525")
+processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
+model = BlipForQuestionAnswering.from_pretrained("Model/blip-saved-model").to("cuda")
+# Create a list to store the results
+results = []
+# Iterate through each file in the test data directory
+samples = os.listdir(test_data_dir)
+for filename in tqdm(os.listdir(test_data_dir), desc="Processing"):
+    sample_path = f"Data/test_data/{filename}"
+    # Read the json file
+    json_path = os.path.join(sample_path, "data.json")
+    with open(json_path, "r") as json_file:
+        data = json.load(json_file)
+        question = data["question"]
+        image_id = data["id"]
+    # Read the corresponding image
+    image_path = os.path.join(test_data_dir, f"{image_id}", "image.png")
+    image = Image.open(image_path).convert("RGB")
+    # prepare inputs
+    encoding = processor(image, question, return_tensors="pt").to("cuda:0", torch.float16)
+    out = model.generate(**encoding)
+    generated_text = processor.decode(out[0], skip_special_tokens=True)
+    results.append((image_id, generated_text))
+# Write the results to a CSV file
+csv_file_path = "Results/results.csv"
+with open(csv_file_path, mode="w", newline="") as csv_file:
+    csv_writer = csv.writer(csv_file)
+    csv_writer.writerow(["ID", "Label"])  # Write header
+    csv_writer.writerows(results)
+print(f"Results saved to {csv_file_path}")

blip-vqa-finetune-main/requirements.txt ADDED Viewed

	@@ -0,0 +1,7 @@

+tqdm==4.66.1
+datasets==2.14.6
+transformers==4.35.2
+torch==2.1.0
+torchsummary==1.5.1
+torchvision==0.16.0
+Pillow==10.0.1