Diptaraj Sen
committed on
Commit
·
a7d58d8
1
Parent(s):
5bdfd55
captioning model updated
Browse files- .gitignore +1 -0
- app/captioning.py +15 -13
.gitignore
CHANGED
@@ -2,6 +2,7 @@ venv/
|
|
2 |
__pycache__/
|
3 |
outputs/
|
4 |
logs/
|
|
|
5 |
test.ipynb
|
6 |
.streamlit/secrets.toml
|
7 |
*.pyc
|
|
|
2 |
__pycache__/
|
3 |
outputs/
|
4 |
logs/
|
5 |
+
assets/
|
6 |
test.ipynb
|
7 |
.streamlit/secrets.toml
|
8 |
*.pyc
|
app/captioning.py
CHANGED
@@ -1,19 +1,21 @@
|
|
1 |
from app.logger import get_logger
|
2 |
logger = get_logger(__name__)
|
3 |
|
4 |
-
from transformers import AutoTokenizer, AutoImageProcessor, VisionEncoderDecoderModel
|
5 |
-
from PIL import Image
|
6 |
import torch
|
|
|
|
|
|
|
|
|
7 |
|
8 |
-
model_id = "cnmoro/nano-image-captioning"
|
9 |
# Load model, tokenizer, and image processor
|
10 |
-
|
11 |
tokenizer = AutoTokenizer.from_pretrained(model_id)
|
12 |
-
|
|
|
13 |
|
14 |
# Move model to GPU if available
|
15 |
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
16 |
-
|
17 |
model.to(device)
|
18 |
|
19 |
def generate_caption(image_path: str) -> str:
|
@@ -23,15 +25,15 @@ def generate_caption(image_path: str) -> str:
|
|
23 |
image = Image.open(image_path).convert('RGB')
|
24 |
|
25 |
# Preprocess image and prepare inputs
|
26 |
-
|
27 |
|
28 |
# Generate caption (greedy decoding for now)
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
return caption
|
36 |
except Exception as e:
|
37 |
logger.exception("Failed to generate caption")
|
|
|
1 |
"""Image-captioning model setup.

Loads a pretrained ViT-encoder / GPT-2-decoder captioning model once at
import time and moves it to the GPU when one is available.
"""
from app.logger import get_logger

import torch
from PIL import Image
from transformers import ViTFeatureExtractor, AutoTokenizer, VisionEncoderDecoderModel

logger = get_logger(__name__)

# Hugging Face hub id of the pretrained vision-encoder/decoder captioning model.
model_id = "ydshieh/vit-gpt2-coco-en"

# Load model, tokenizer, and image processor once at module import.
# NOTE(review): ViTFeatureExtractor is deprecated in recent transformers
# releases in favor of ViTImageProcessor — confirm the pinned version.
feature_extractor = ViTFeatureExtractor.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = VisionEncoderDecoderModel.from_pretrained(model_id)
model.eval()  # inference only: disables dropout / puts norm layers in eval mode

# Move model to GPU if available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Lazy %-style args: the message is only formatted if INFO is enabled.
logger.info("DEVICE:--------> %s", device)
model.to(device)
|
20 |
|
21 |
def generate_caption(image_path: str) -> str:
|
|
|
25 |
image = Image.open(image_path).convert('RGB')
|
26 |
|
27 |
# Preprocess image and prepare inputs
|
28 |
+
pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values.to(device)
|
29 |
|
30 |
# Generate caption (greedy decoding for now)
|
31 |
+
with torch.no_grad():
|
32 |
+
output_ids = model.generate(pixel_values, max_length=16, num_beams=4, return_dict_in_generate=True).sequences
|
33 |
+
|
34 |
+
preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
|
35 |
+
caption = [pred.strip() for pred in preds]
|
36 |
+
|
37 |
return caption
|
38 |
except Exception as e:
|
39 |
logger.exception("Failed to generate caption")
|