Diptaraj Sen committed
Commit a7d58d8
1 Parent(s): 5bdfd55

captioning model updated

Files changed (2):
  1. .gitignore +1 -0
  2. app/captioning.py +15 -13
.gitignore CHANGED
@@ -2,6 +2,7 @@ venv/
 __pycache__/
 outputs/
 logs/
+assets/
 test.ipynb
 .streamlit/secrets.toml
 *.pyc
app/captioning.py CHANGED
@@ -1,19 +1,21 @@
 from app.logger import get_logger
 logger = get_logger(__name__)
 
-from transformers import AutoTokenizer, AutoImageProcessor, VisionEncoderDecoderModel
-from PIL import Image
 import torch
+from PIL import Image
+from transformers import ViTFeatureExtractor, AutoTokenizer, VisionEncoderDecoderModel
+
+model_id = "ydshieh/vit-gpt2-coco-en"
 
-model_id = "cnmoro/nano-image-captioning"
 # Load model, tokenizer, and image processor
-model = VisionEncoderDecoderModel.from_pretrained(model_id)
+feature_extractor = ViTFeatureExtractor.from_pretrained(model_id)
 tokenizer = AutoTokenizer.from_pretrained(model_id)
-image_processor = AutoImageProcessor.from_pretrained(model_id)
+model = VisionEncoderDecoderModel.from_pretrained(model_id)
+model.eval()
 
 # Move model to GPU if available
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-print("DEVICE:--------> ",device)
+logger.info(f"DEVICE:--------> {device}")
 model.to(device)
 
 def generate_caption(image_path: str) -> str:
@@ -23,15 +25,15 @@ def generate_caption(image_path: str) -> str:
         image = Image.open(image_path).convert('RGB')
 
         # Preprocess image and prepare inputs
-        inputs = image_processor(images=image, return_tensors="pt").to(device)
+        pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values.to(device)
 
         # Generate caption (greedy decoding for now)
-        output = model.generate(**inputs, max_length=30, num_beams=1)
-
-        # Decode output to text
-        caption = tokenizer.decode(output[0], skip_special_tokens=True)
-
-        logger.info(f"Caption generated: {caption}")
+        with torch.no_grad():
+            output_ids = model.generate(pixel_values, max_length=16, num_beams=4, return_dict_in_generate=True).sequences
+
+        preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
+        caption = [pred.strip() for pred in preds]
+
         return caption
     except Exception as e:
         logger.exception("Failed to generate caption")
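
For reference, a minimal sketch of how the updated module might be called after this commit; the script and image path below are assumptions for illustration, not part of the change. Two things a caller should know: with num_beams=4, model.generate now runs beam search rather than the greedy decoding the inline comment still mentions, and because the caption is built via tokenizer.batch_decode plus a list comprehension, generate_caption returns a list of strings rather than the str its annotation declares, so a caller takes the first element for a single image.

# Minimal usage sketch (assumed image path, not part of the commit)
from app.captioning import generate_caption

captions = generate_caption("samples/example.jpg")  # hypothetical test image

# batch_decode yields a list; with one input image there is one caption.
# On failure the function only logs and implicitly returns None, so guard for that.
if captions:
    print(captions[0])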