Diptaraj Sen committed
Commit a7d58d8
1 Parent(s): 5bdfd55

captioning model updated

Files changed (2):
  1. .gitignore +1 -0
  2. app/captioning.py +15 -13
.gitignore CHANGED
@@ -2,6 +2,7 @@ venv/
 __pycache__/
 outputs/
 logs/
+assets/
 test.ipynb
 .streamlit/secrets.toml
 *.pyc
app/captioning.py CHANGED
@@ -1,19 +1,21 @@
 from app.logger import get_logger
 logger = get_logger(__name__)
 
-from transformers import AutoTokenizer, AutoImageProcessor, VisionEncoderDecoderModel
-from PIL import Image
 import torch
+from PIL import Image
+from transformers import ViTFeatureExtractor, AutoTokenizer, VisionEncoderDecoderModel
+
+model_id = "ydshieh/vit-gpt2-coco-en"
 
-model_id = "cnmoro/nano-image-captioning"
 # Load model, tokenizer, and image processor
-model = VisionEncoderDecoderModel.from_pretrained(model_id)
+feature_extractor = ViTFeatureExtractor.from_pretrained(model_id)
 tokenizer = AutoTokenizer.from_pretrained(model_id)
-image_processor = AutoImageProcessor.from_pretrained(model_id)
+model = VisionEncoderDecoderModel.from_pretrained(model_id)
+model.eval()
 
 # Move model to GPU if available
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-print("DEVICE:--------> ",device)
+logger.info(f"DEVICE:--------> {device}")
 model.to(device)
 
 def generate_caption(image_path: str) -> str:
@@ -23,15 +25,15 @@ def generate_caption(image_path: str) -> str:
         image = Image.open(image_path).convert('RGB')
 
         # Preprocess image and prepare inputs
-        inputs = image_processor(images=image, return_tensors="pt").to(device)
+        pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values.to(device)
 
         # Generate caption (greedy decoding for now)
-        output = model.generate(**inputs, max_length=30, num_beams=1)
-
-        # Decode output to text
-        caption = tokenizer.decode(output[0], skip_special_tokens=True)
-
-        logger.info(f"Caption generated: {caption}")
+        with torch.no_grad():
+            output_ids = model.generate(pixel_values, max_length=16, num_beams=4, return_dict_in_generate=True).sequences
+
+        preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
+        caption = [pred.strip() for pred in preds]
+
         return caption
     except Exception as e:
         logger.exception("Failed to generate caption")
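
For reference, a minimal sketch of how the updated module might be called after this commit; the script and image path below are assumptions for illustration, not part of the change. Two things a caller should know: with num_beams=4, model.generate now runs beam search rather than the greedy decoding the inline comment still mentions, and because the caption is built via tokenizer.batch_decode plus a list comprehension, generate_caption returns a list of strings rather than the str its annotation declares, so a caller takes the first element for a single image.

# Minimal usage sketch (assumed image path, not part of the commit)
from app.captioning import generate_caption

captions = generate_caption("samples/example.jpg")  # hypothetical test image

# batch_decode yields a list; with one input image there is one caption.
# On failure the function only logs and implicitly returns None, so guard for that.
if captions:
    print(captions[0])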