import gradio as gr
import torch
import numpy as np
import cv2
from PIL import Image
from gtts import gTTS
# graycomatrix/graycoprops replaced the grey* spellings, which newer scikit-image removes.
from skimage.feature import graycomatrix, graycoprops
from transformers import BlipProcessor, BlipForConditionalGeneration, MarianMTModel, MarianTokenizer
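
# Pipeline: YOLOv5 detection -> colour/texture statistics -> BLIP caption (en)
# -> MarianMT translation (en -> pt) -> gTTS speech, served through a Gradio UI.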

# Load the YOLOv5 "small" detector once at import time; torch.hub downloads the
# weights on the first run.
model = torch.hub.load('ultralytics/yolov5', 'yolov5s')


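# GLCM contrast is a classic texture statistic: higher values mean stronger
# local gray-level variation at the given pixel offset.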
def analyze_texture(image):
    """Return the GLCM contrast of the image as a single texture score."""
    # PIL images are RGB, so convert with COLOR_RGB2GRAY (not BGR2GRAY).
    gray_image = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2GRAY)
    glcm = graycomatrix(gray_image, distances=[5], angles=[0], levels=256, symmetric=True, normed=True)
    contrast = graycoprops(glcm, 'contrast')[0, 0]
    return contrast


# BLIP captioning model, loaded once at import time so describe_image() does not
# reload it on every request.
blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")


def describe_image(image):
    """Generate an English caption for the image with BLIP."""
    inputs = blip_processor(image, return_tensors="pt")
    out = blip_model.generate(**inputs)
    return blip_processor.decode(out[0], skip_special_tokens=True)


# MarianMT English-to-Portuguese model, also loaded once at import time.
# The plain 'Helsinki-NLP/opus-mt-en-pt' checkpoint is not published on the Hub;
# 'opus-mt-tc-big-en-pt' is the available en->pt Marian model, and it is
# multi-target, so a '>>por<<' prefix selects Portuguese output.
mt_model_name = 'Helsinki-NLP/opus-mt-tc-big-en-pt'
mt_tokenizer = MarianTokenizer.from_pretrained(mt_model_name)
mt_model = MarianMTModel.from_pretrained(mt_model_name)


def translate_description(description):
    """Translate the English caption to Portuguese."""
    batch = mt_tokenizer(f">>por<< {description}", return_tensors="pt", padding=True)
    translated = mt_model.generate(**batch)
    return mt_tokenizer.decode(translated[0], skip_special_tokens=True)
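

# End-to-end handler for one uploaded image: detection, statistics, captioning,
# translation and text-to-speech; wired to the Gradio interface below.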
def process_image(image):
    # Object detection; results.render() returns the annotated frames as numpy arrays.
    results = model(image)
    detected_image = results.render()[0]

    # Simple colour and texture statistics.
    mean_rgb = np.mean(np.array(image), axis=(0, 1))
    texture_contrast = analyze_texture(image)

    # Caption in English, then translate the caption to Portuguese.
    description = describe_image(image)
    translated_description = translate_description(description)

    # Synthesize the Portuguese caption as speech (gTTS calls an online service).
    tts = gTTS(text=translated_description, lang='pt')
    tts.save("output.mp3")

    # Surface the colour/texture statistics alongside the translated caption.
    text_output = (
        f"{translated_description}\n"
        f"Mean RGB: {np.round(mean_rgb, 1).tolist()} | GLCM contrast: {texture_contrast:.2f}"
    )

    return Image.fromarray(detected_image), text_output, "output.mp3"


# Gradio 4 removed the gr.inputs/gr.outputs namespaces: components are passed
# directly, and examples are given as file paths rather than PIL objects.
iface = gr.Interface(
    fn=process_image,
    inputs=gr.Image(type="pil"),
    outputs=[gr.Image(type="pil"), gr.Textbox(), gr.Audio(type="filepath")],
    examples=[["/mnt/data/example1.JPG"]],
)
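
# launch() serves the app locally (default http://127.0.0.1:7860); pass
# share=True for a temporary public link.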
iface.launch()