# app.py -- last change: "Update app.py" (commit 75c9a0c) by diamantrsd
import subprocess
import sys

# Install SentencePiece at startup; it is needed for the Indonesian
# translation step.
# pip is invoked through the running interpreter so the package is installed
# into the same environment that will import it.
_pip_result = subprocess.run(
    [sys.executable, "-m", "pip", "install", "sentencepiece"],
    check=False,
)
if _pip_result.returncode != 0:
    # Stay best-effort (the package may already be importable), but make the
    # failure visible instead of silently ignoring it.
    print(
        "warning: 'pip install sentencepiece' exited with code "
        f"{_pip_result.returncode}",
        file=sys.stderr,
    )
from PIL import Image # library untuk image
import gradio as gr # library untuk tampilan interface di huggingface
from transformers import BlipProcessor, BlipForConditionalGeneration,MarianTokenizer, MarianMTModel #library blip (image captioning) dan marian untuk translate
import torch
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Model used to produce the Indonesian caption (English -> Indonesian
# translation) and its tokenizer. It is not moved to `device` here.
translation_model_id = "Helsinki-NLP/opus-mt-en-id"
translation_model = MarianMTModel.from_pretrained(translation_model_id)
translation_tokenizer = MarianTokenizer.from_pretrained(translation_model_id)

# Model used to produce the English caption (BLIP) and its processor; the
# model is placed on `device`.
# Alternative checkpoint id: "Salesforce/blip-image-captioning-base".
caption_model_id = "Salesforce/blip-image-captioning-large"
caption_processor = BlipProcessor.from_pretrained(caption_model_id)
caption_model = BlipForConditionalGeneration.from_pretrained(caption_model_id).to(device)
def generate_caption(input_image):
    """Caption an image in English and translate the caption to Indonesian.

    Args:
        input_image: A PIL image, or ``None`` when no image was supplied.

    Returns:
        A ``(english_caption, indonesian_caption)`` tuple of strings. Both
        strings are empty when ``input_image`` is ``None``.
    """
    # The interface is created with live=True, so the callback may be invoked
    # without an image (e.g. after the input is cleared); return empty
    # captions instead of failing on `None.convert`.
    if input_image is None:
        return "", ""

    # Convert the image to RGB mode.
    image = input_image.convert('RGB')

    # Preprocess the image and move the tensors to wherever the caption
    # model lives, so CPU tensors are never fed to a model on the GPU.
    inputs = caption_processor(images=image, return_tensors="pt").to(caption_model.device)

    # Sampling settings, passed to generate() as keyword arguments rather
    # than being mixed into the processor's output.
    generation_options = {
        "max_length": 20,
        "num_beams": 1,
        "do_sample": True,
        "top_k": 50,
        "top_p": 0.95,
    }

    # Generate the caption in English.
    caption_output = caption_model.generate(**inputs, **generation_options)
    english_caption = caption_processor.decode(caption_output[0], skip_special_tokens=True)

    # Translate the caption into Indonesian.
    translation_inputs = translation_tokenizer.encode(
        english_caption, return_tensors="pt", max_length=512, truncation=True
    ).to(translation_model.device)
    translation_output = translation_model.generate(translation_inputs)
    indonesian_caption = translation_tokenizer.decode(translation_output[0], skip_special_tokens=True)

    return english_caption, indonesian_caption
# Web interface: one image input (delivered to the callback as a PIL image)
# and two text outputs, one per value returned by generate_caption.
# Components come from the top-level gradio namespace; the older
# gr.inputs / gr.outputs namespaces are deprecated.
iface = gr.Interface(
    fn=generate_caption,
    inputs=gr.Image(type="pil"),
    outputs=[gr.Textbox(), gr.Textbox()],  # two text outputs
    live=True,
)
iface.launch()