Spaces:

ashok2216
/

Image-to-Text

Sleeping

ashok2216 committed on Nov 12

Commit

44f1ae6

•

1 Parent(s): 70c2a12

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -1,35 +1,37 @@
 import streamlit as st
-from transformers import VisionEncoderDecoderModel, ViTImageProcessor, GPT2Tokenizer, AutoImageProcessor
 import torch
 from PIL import Image
-# Load the model and tokenizer
 model = VisionEncoderDecoderModel.from_pretrained("ashok2216/vit-gpt2-image-captioning_COCO_FineTuned")
-# processor = ViTImageProcessor.from_pretrained("ashok2216/vit-gpt2-image-captioning_COCO_FineTuned")
-processor = AutoImageProcessor.from_pretrained("ashok2216/vit-gpt2-image-captioning_COCO_FineTuned")
 tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
-# Streamlit app title
-st.title("Image Captioning with ViT-GPT2 Model")
-st.write("Upload an image, and the model will generate a descriptive caption.")
-# File uploader for image input
 uploaded_file = st.file_uploader("Choose an image...", type=["jpg", "png", "jpeg"])
 if uploaded_file is not None:
-    # Load and display the uploaded image
     image = Image.open(uploaded_file)
     st.image(image, caption="Uploaded Image", use_column_width=True)
-    # Preprocess the image for the model
-    inputs = processor(images=image, return_tensors="pt")
-    pixel_values = inputs.pixel_values
     # Generate the caption
     with st.spinner("Generating caption..."):
-        output = model.generate(pixel_values)
         caption = tokenizer.decode(output[0], skip_special_tokens=True)
-    # Display the generated caption
     st.success("Generated Caption:")
     st.write(caption)

 import streamlit as st
+from transformers import VisionEncoderDecoderModel, GPT2Tokenizer
 import torch
 from PIL import Image
+from torchvision import transforms
+# Load the captioning model and the GPT-2 tokenizer used to decode its generated token ids
 model = VisionEncoderDecoderModel.from_pretrained("ashok2216/vit-gpt2-image-captioning_COCO_FineTuned")
 tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
+# Manual preprocessing: resize to 224x224, convert to a tensor, normalize each of 3 channels (mean 0.5, std 0.5)
+preprocess = transforms.Compose([
+    transforms.Resize((224, 224)),
+    transforms.ToTensor(),
+    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
+])
+# Streamlit page header, shown before any file is uploaded
+st.title("Image Captioning with ViT-GPT2")
+st.write("Upload an image to generate a caption.")
 uploaded_file = st.file_uploader("Choose an image...", type=["jpg", "png", "jpeg"])
 if uploaded_file is not None:
     image = Image.open(uploaded_file)
     st.image(image, caption="Uploaded Image", use_column_width=True)
+    # Convert to RGB first: PNG uploads can be RGBA/palette/grayscale, which the 3-channel Normalize cannot handle
+    inputs = preprocess(image.convert("RGB")).unsqueeze(0)  # Add batch dimension
     # Generate the caption
     with st.spinner("Generating caption..."):
+        output = model.generate(inputs)
         caption = tokenizer.decode(output[0], skip_special_tokens=True)
     st.success("Generated Caption:")
     st.write(caption)