from transformers import ViTFeatureExtractor, ViTForImageClassification
import gradio as gr
from datasets import load_dataset  # only needed for the commented-out CIFAR-100 experiment below
import torch
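
# Image-to-speech demo: take a webcam snapshot, classify it with ViT,
# then read the predicted label aloud with a hosted FastSpeech2 TTS model.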


# Unused: CIFAR-100 loading left over from experimentation.
# dataset = load_dataset("cifar100")
# image = dataset["train"]["fine_label"]

# Load the ViT feature extractor and the ImageNet-1k classification head.
# (Newer transformers versions rename ViTFeatureExtractor to ViTImageProcessor.)
feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224')
print("loaded feature extractor")
model = ViTForImageClassification.from_pretrained('google/vit-base-patch16-224')
print("loaded model")

def classify(image):
    """Classify an image and return the predicted ImageNet label."""
    inputs = feature_extractor(images=image, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
    # The model predicts one of the 1,000 ImageNet classes.
    predicted_class_idx = logits.argmax(-1).item()
    return model.config.id2label[predicted_class_idx]

def image2speech(image):
    """Classify the image, then synthesize speech for the predicted label."""
    try:
        txt = classify(image)
    except Exception:
        txt = "No object detected"
    # fastspeech is defined below at module level; it exists by the time
    # Gradio invokes this callback.
    return fastspeech(txt), txt
    
print("load tts interface \n")
fastspeech = gr.Interface.load("huggingface/facebook/fastspeech2-en-ljspeech")
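# Calling fastspeech(text) runs the hosted model and returns the synthesized
# audio (a filepath in Gradio 2.x), which gr.outputs.Audio can serve directly.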

print("sets input and outputs \n")
camera = gr.inputs.Image(label="Image from your camera", source="webcam")
read = gr.outputs.Textbox(type="auto", label="Text")
speak = gr.outputs.Audio(type="auto", label="Speech")
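# NOTE: gr.inputs / gr.outputs and Interface.load are the Gradio 2.x API;
# newer Gradio versions replace them with gr.Image, gr.Textbox, gr.Audio,
# and gr.load.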

print("define interface \n")
app = gr.Interface(fn=image2speech, 
                   inputs=camera,
                   live=True,
                   description="Takes a snapshot of an object, identifies it, and then tell you what it is. \n Intended use is to help the visually impaired. Models and     dataset used is listed on the linked models and dataset", 
                   outputs=[speak, read],
                   examples=["remotecontrol.jpg", "calculator.jpg", "cellphone.jpg"])
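# The example images are assumed to sit alongside app.py in the repository.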
                   
print("launch interface \n")
app.launch(cache_examples=True)