Cam2Speech / app.py
st0bb3n's picture
Update app.py
198fce8
raw
history blame
1.28 kB
from transformers import ViTFeatureExtractor, ViTForImageClassification
import gradio as gr
from datasets import load_dataset
import torch
# Load the "cifar100" dataset through `datasets.load_dataset` at import time
# and keep the "fine_label" column of its "train" split.
# NOTE(review): neither `dataset` nor `image` is referenced again in the
# visible code (`image` is shadowed by the parameters of `classify` and
# `image2speech`), so this looks like a leftover whose only effect is
# presumably a dataset download at startup — confirm before removing.
# NOTE(review): "fine_label" presumably holds class labels rather than
# pictures, which would make the name `image` misleading — verify.
dataset = load_dataset("cifar100")
image = dataset["train"]["fine_label"]
_VIT_CHECKPOINT = 'google/vit-base-patch16-224'

# Process-wide cache for the loaded feature extractor and model; filled on
# the first call to _load_vit() and reused afterwards.
_vit_components = {}


def _load_vit():
    """Return the ViT feature extractor and model, loading them on first use.

    Both objects are stored in a module-level cache so ``from_pretrained``
    runs once per process instead of once per classified image.

    Returns:
        A ``(feature_extractor, model)`` tuple.
    """
    if not _vit_components:
        # Load both before caching either, so a failure while loading the
        # model does not leave a half-populated cache behind.
        feature_extractor = ViTFeatureExtractor.from_pretrained(_VIT_CHECKPOINT)
        model = ViTForImageClassification.from_pretrained(_VIT_CHECKPOINT)
        _vit_components['feature_extractor'] = feature_extractor
        _vit_components['model'] = model
    return _vit_components['feature_extractor'], _vit_components['model']


def classify(image):
    """Predict the class label of an image with the ViT classifier.

    Args:
        image: The image to classify, in a form accepted by
            ``ViTFeatureExtractor`` (presumably whatever Gradio's "image"
            input component supplies — TODO confirm).

    Returns:
        The label that ``model.config.id2label`` maps the highest-scoring
        class index to.
    """
    feature_extractor, model = _load_vit()
    inputs = feature_extractor(images=image, return_tensors="pt")
    # Inference only: no gradients are needed.
    with torch.no_grad():
        logits = model(**inputs).logits
    # model predicts one of the 1000 ImageNet classes
    predicted_class_idx = logits.argmax(-1).item()
    return model.config.id2label[predicted_class_idx]
def image2speech(image):
    """Classify an image and turn the predicted label into speech.

    Args:
        image: The image to describe; forwarded unchanged to ``classify``.

    Returns:
        A ``(speech, label)`` pair: the result of calling ``fastspeech`` on
        the predicted label, followed by the label text itself.
    """
    label = classify(image)
    speech = fastspeech(label)
    return speech, label
# Text-to-speech model loaded through Gradio; `image2speech` calls it with
# the predicted label text.
fastspeech = gr.Interface.load("huggingface/facebook/fastspeech2-en-ljspeech")

# Web UI: one image input, with the synthesized audio and the label text as
# outputs.
# NOTE(review): the description string has grammar slips ("and image",
# "what is it"); it is left untouched here because it is user-facing text.
app = gr.Interface(
    fn=image2speech,
    inputs="image",
    outputs=["audio", "text"],
    title="Image to speech",
    description="Classifies and image and tell you what is it",
    examples=["remotecontrol.jpg", "calculator.jpg", "cellphone.jpg"],
    allow_flagging="never",
)

app.launch()