rrquizon1's picture
Update app.py
c46bd10
from transformers import MarianTokenizer, MarianMTModel
from gtts import gTTS
import gradio as gr
import gradio as gr
import torch
import torchvision
import torchvision.transforms as transforms
import requests
from einops import rearrange
from transformers import AutoFeatureExtractor, DeiTForImageClassificationWithTeacher
import matplotlib
# Hugging Face checkpoint used for image classification.
_CLASSIFIER_NAME = 'facebook/deit-base-distilled-patch16-384'

# Pretrained objects keyed by checkpoint name, so that each checkpoint is
# loaded once per process instead of once per request.
_PRETRAINED_CACHE = {}


def _get_classifier():
    """Return the cached ``(feature_extractor, model)`` pair for DeiT."""
    if _CLASSIFIER_NAME not in _PRETRAINED_CACHE:
        _PRETRAINED_CACHE[_CLASSIFIER_NAME] = (
            AutoFeatureExtractor.from_pretrained(_CLASSIFIER_NAME),
            DeiTForImageClassificationWithTeacher.from_pretrained(_CLASSIFIER_NAME),
        )
    return _PRETRAINED_CACHE[_CLASSIFIER_NAME]


def _get_translator(src, trg):
    """Return the cached ``(tokenizer, model)`` pair for OPUS-MT src -> trg."""
    model_name = f"Helsinki-NLP/opus-mt-{src}-{trg}"
    if model_name not in _PRETRAINED_CACHE:
        _PRETRAINED_CACHE[model_name] = (
            MarianTokenizer.from_pretrained(model_name),
            MarianMTModel.from_pretrained(model_name),
        )
    return _PRETRAINED_CACHE[model_name]


def imgtrans(img):
    """Classify an image and return its label in Filipino and English.

    The image is classified with the distilled DeiT checkpoint, the predicted
    English label is translated to Tagalog with the Helsinki-NLP OPUS-MT
    model, and both labels are synthesised to speech with gTTS.

    Args:
        img: The image to classify, in any form accepted by the feature
            extractor's ``images`` argument.

    Returns:
        A 4-tuple ``(fil_sound, fil, eng_sound, english)``: path of the
        Filipino audio file, the Filipino label, path of the English audio
        file, and the lower-cased English label.
    """
    feature_extractor, classifier = _get_classifier()
    inputs = feature_extractor(images=img, return_tensors="pt")
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        logits = classifier(**inputs).logits
    # This checkpoint predicts one of the 1,000 ImageNet-1k classes.
    predicted_class_idx = logits.argmax(-1).item()

    # Normalise the label: underscores become spaces, only the text before
    # the first comma is kept, and the result is lower-cased.
    english = classifier.config.id2label[predicted_class_idx]
    english = english.replace("_", " ").split(',', 1)[0].lower()

    src = "en"  # source language
    trg = "tl"  # target language
    tokenizer, translator = _get_translator(src, trg)
    batch = tokenizer([english], return_tensors="pt")
    generated_ids = translator.generate(**batch)
    fil = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

    # NOTE(review): gTTS writes MP3-encoded audio; the '.wav' names are kept
    # so the returned paths stay unchanged. Both files live in the working
    # directory and are overwritten on every call.
    fil_sound = 'filtrans.wav'
    gTTS(text=fil, lang='tl').save(fil_sound)
    eng_sound = 'engtrans.wav'
    gTTS(text=english, lang='en').save(eng_sound)

    return fil_sound, fil, eng_sound, english
# Sample images offered as one-click examples in the UI.
_EXAMPLE_IMAGES = [
    '220px-Modern_British_LED_Traffic_Light.jpg',
    'aki_dog.jpg',
    'cat.jpg',
    'dog.jpg',
    'plasticbag.jpg',
    'telephone.jpg',
    'vpavic_211006_4796_0061.jpg',
    'watch.jpg',
    'wonder_cat.jpg',
    'hammer.jpg',
]

# Blurb passed to the interface as its description.
_DESCRIPTION = (
    'A Filipino Classifier with Pronunciation using Distilled Data-efficient '
    'Image Transformer (base-sized model) for classification and OPUS MT '
    'English to Tagalog translation by University of Helsinki. Google Text to '
    'Speech module was used for text to speech for English and Filipino. '
)

# Single image input; the outputs line up with the 4-tuple returned by
# imgtrans: (Filipino audio, Filipino label, English audio, English label).
_image_input = gr.inputs.Image(shape=(224, 224), label='Insert Image')
_result_outputs = [
    gr.outputs.Audio(label='Filipino Pronunciation'),
    gr.outputs.Textbox(label='Filipino Label'),
    gr.outputs.Audio(label='English Pronunciation'),
    gr.outputs.Textbox(label='English label'),
]

interface = gr.Interface(
    fn=imgtrans,
    inputs=_image_input,
    outputs=_result_outputs,
    examples=_EXAMPLE_IMAGES,
    description=_DESCRIPTION,
)
interface.launch()