Spaces:

team-indain-image-caption
/

Hindi-image-captioning

Runtime error

seanbenhur committed on Nov 23, 2021

Commit

00ca6f9

•

1 Parent(s): a94eec7

fix bugs

Files changed (1) hide show

app.py CHANGED Viewed

@@ -19,11 +19,16 @@ def post_process(text):
         pass
     return text
 def predict(image, max_length=64, num_beams=4):
     pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values
     pixel_values = pixel_values.to(device)
     with torch.no_grad():
-        text = model.generate(pixel_values.unsqueeze(0).cpu())
-        text = tokenizer.decode(text.replace('<|endoftext|>', '').split('\n')[0],'\n\n\n')
        # output_ids = model.generate(
         #    pixel_values,
         #    max_length=max_length,
@@ -33,10 +38,10 @@ def predict(image, max_length=64, num_beams=4):
     #preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
     #pred = post_process(preds[0])
-    return text
 model_path = "team-indain-image-caption/hindi-image-captioning"
-device = torch.device("cuda:0" if torch.cuda.is_available() else"cpu")
 # Load model.
 model = VisionEncoderDecoderModel.from_pretrained(model_path)
 model.to(device)

         pass
     return text
 def predict(image, max_length=64, num_beams=4):
+    image = image.convert('RGB')
     pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values
     pixel_values = pixel_values.to(device)
     with torch.no_grad():
+        text = tokenizer.decode(model.generate(pixel_values.cpu())[0])
+        text = text.replace('<|endoftext|>', '').split('\n')
+#[0],'\n\n\n'
+#text[0]
+        #text = model.generate(pixel_values.cpu())
+        #text = tokenizer.decode(text.replace('<|endoftext|>', '').split('\n')[0],'\n\n\n')
        # output_ids = model.generate(
         #    pixel_values,
         #    max_length=max_length,
     #preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
     #pred = post_process(preds[0])
+    return text[0]
 model_path = "team-indain-image-caption/hindi-image-captioning"
+device = "cpu"
 # Load model.
 model = VisionEncoderDecoderModel.from_pretrained(model_path)
 model.to(device)