"""Gradio demo: classify an uploaded image with ViT and speak the predicted label."""

from transformers import ViTFeatureExtractor, ViTForImageClassification
import gradio as gr
# NOTE(review): `load_dataset` is kept for compatibility but is now unused —
# the original loaded CIFAR-100 at startup and read `dataset["train"]["fine_label"]`
# (integer labels, not images) into a variable that was never used anywhere.
# That dead code triggered a large download for nothing, so it was removed.
from datasets import load_dataset
import torch

# Hub checkpoint used for both the feature extractor and the classifier.
_MODEL_NAME = "google/vit-base-patch16-224"

# Load the ViT components ONCE at import time. The original re-downloaded /
# re-instantiated both the feature extractor and the full model inside
# classify(), paying the entire model-loading cost on every prediction.
_feature_extractor = ViTFeatureExtractor.from_pretrained(_MODEL_NAME)
_model = ViTForImageClassification.from_pretrained(_MODEL_NAME)


def classify(image):
    """Return the predicted class label for *image*.

    Parameters
    ----------
    image : PIL.Image.Image or numpy.ndarray
        The picture to classify, as delivered by the Gradio "image" input.
        (Assumed RGB — TODO confirm against the Gradio component config.)

    Returns
    -------
    str
        Human-readable label; the checkpoint predicts one of the
        1000 ImageNet classes.
    """
    inputs = _feature_extractor(images=image, return_tensors="pt")
    # Inference only — disable autograd to avoid building a graph.
    with torch.no_grad():
        logits = _model(**inputs).logits
    predicted_class_idx = logits.argmax(-1).item()
    return _model.config.id2label[predicted_class_idx]


# Text-to-speech backend, loaded once from the Hugging Face hub.
# Defined before image2speech is ever invoked (Gradio only calls the fn
# after launch()), so the forward reference in the original was safe —
# but defining it up front makes the dependency explicit.
fastspeech = gr.Interface.load("huggingface/facebook/fastspeech2-en-ljspeech")


def image2speech(image):
    """Classify *image* and return (synthesized speech audio, predicted label)."""
    txt = classify(image)
    return fastspeech(txt), txt


app = gr.Interface(
    fn=image2speech,
    inputs="image",
    outputs=["audio", "text"],
    title="Image to speech",
    # Fixed the user-facing description's grammar
    # (was: "Classifies and image and tell you what is it").
    description="Classifies an image and tells you what it is",
    examples=["remotecontrol.jpg", "calculator.jpg", "cellphone.jpg"],
    allow_flagging="never",
)
app.launch()