Spaces:

st0bb3n
/

Cam2Speech

Runtime error

App Files Files Community

st0bb3n committed on Mar 28, 2022

Commit

1559fe8

1 Parent(s): 8884faf

Update app.py

Browse files

added debug points

Files changed (1) hide show

app.py +12 -3

app.py CHANGED Viewed

@@ -3,24 +3,32 @@ import gradio as gr
 from datasets import load_dataset
 import torch
 # Load the "cifar100" dataset through the `datasets` library.
 dataset = load_dataset("cifar100")
 # Despite its name, `image` holds the "fine_label" column of the train split.
 # NOTE(review): nothing visible in this excerpt reads this module-level name
 # (the functions below take their own `image` parameter) - confirm it is needed.
 image = dataset["train"]["fine_label"]
 def classify(image):
     """Return the label the ViT checkpoint predicts for ``image``.

     Args:
         image: Image handed to the feature extractor as ``images=``.
             NOTE(review): presumably the webcam frame that Gradio passes
             to ``image2speech`` - confirm the exact type.

     Returns:
         The ``model.config.id2label`` entry for the highest-scoring logit.
     """
     # NOTE(review): both from_pretrained() calls run on every invocation;
     # consider loading them once at module level if this proves slow.
     feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224')
     model = ViTForImageClassification.from_pretrained('google/vit-base-patch16-224')
     # Preprocess the image into PyTorch tensors (return_tensors="pt").
     inputs = feature_extractor(images=image, return_tensors="pt")
     # Inference only, so gradient tracking is switched off.
     with torch.no_grad():
         outputs = model(**inputs)
         logits = outputs.logits
     # model predicts one of the 1000 ImageNet classes
     # .item() needs a one-element tensor, so one image per call is assumed.
     predicted_class_idx = logits.argmax(-1).item()
     return model.config.id2label[predicted_class_idx]
 def image2speech(image):
     """Classify ``image`` and synthesize speech for the predicted label.

     Returns:
         A ``(speech, text)`` pair: the result of calling ``fastspeech`` on
         the label, followed by the label itself.
     """
     txt = classify(image)
     # `fastspeech` is a module-level name assigned after this def; it is
     # looked up when the function is called, not when it is defined.
     return fastspeech(txt), txt
 # Load the "facebook/fastspeech2-en-ljspeech" model through gr.Interface.load;
 # image2speech() calls the resulting object directly with the label text.
 fastspeech = gr.Interface.load("huggingface/facebook/fastspeech2-en-ljspeech")
 '''
@@ -35,15 +43,16 @@ app = gr.Interface(fn=image2speech,
 app.launch(cache_examples=True)
 '''
 # Input component: an image taken from the webcam.
 camera = gr.inputs.Image(label="Image from your camera", source="webcam")
 # Output components: the predicted label as text, and the synthesized audio.
 read = gr.outputs.Textbox(type="auto", label="Text")
 speak = gr.outputs.Audio(type="auto", label="Speech")
 # The outputs order [speak, read] matches image2speech()'s (speech, text) return.
 # NOTE(review): live=True presumably re-runs the function whenever the input
 # changes - confirm against the Gradio documentation.
 app = gr.Interface(fn=image2speech,
                    inputs=camera,
                    live=True,
                    description="Takes a snapshot of an object, identifies it, and then tell you what it is. \n Intended use is to help the visually impaired. Models and     dataset used is listed on the linked models and dataset",
                    outputs=[speak, read])
 app.launch()

 from datasets import load_dataset
 import torch
 # Load the "cifar100" dataset through the `datasets` library.
 dataset = load_dataset("cifar100")
 # Despite its name, `image` holds the "fine_label" column of the train split.
 # NOTE(review): nothing visible in this excerpt reads this module-level name
 # (the functions below take their own `image` parameter) - confirm it is needed.
 image = dataset["train"]["fine_label"]
 # Debug trace point. NOTE(review): the message mentions training, but the
 # lines above only load the dataset and select one column.
+print("load and train dataset \n")
 def classify(image):
     """Return the label the ViT checkpoint predicts for ``image``.

     Each print() call is a debug trace point emitted after the stage it
     names has completed ("prediction" is printed after the forward pass,
     before the arg-max lookup).

     Args:
         image: Image handed to the feature extractor as ``images=``.
             NOTE(review): presumably the webcam frame that Gradio passes
             to ``image2speech`` - confirm the exact type.

     Returns:
         The ``model.config.id2label`` entry for the highest-scoring logit.
     """
     # NOTE(review): both from_pretrained() calls run on every invocation;
     # consider loading them once at module level if this proves slow.
     feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224')
+    print("feature extractor \n")
     model = ViTForImageClassification.from_pretrained('google/vit-base-patch16-224')
+    print("load model \n")
     # Preprocess the image into PyTorch tensors (return_tensors="pt").
     inputs = feature_extractor(images=image, return_tensors="pt")
+    print("define input \n")
     # Inference only, so gradient tracking is switched off.
     with torch.no_grad():
         outputs = model(**inputs)
         logits = outputs.logits
     # model predicts one of the 1000 ImageNet classes
+    print("prediction \n")
     # .item() needs a one-element tensor, so one image per call is assumed.
     predicted_class_idx = logits.argmax(-1).item()
     return model.config.id2label[predicted_class_idx]
 def image2speech(image):
     """Classify ``image`` and synthesize speech for the predicted label.

     Returns:
         A ``(speech, text)`` pair: the result of calling ``fastspeech`` on
         the label, followed by the label itself.
     """
     # Debug trace point: printed on entry, before classification starts,
     # not at the moment speech synthesis begins.
+    print("tts \n")
     txt = classify(image)
     # `fastspeech` is a module-level name assigned after this def; it is
     # looked up when the function is called, not when it is defined.
     return fastspeech(txt), txt
+print("load tts interface \n")
 # Load the "facebook/fastspeech2-en-ljspeech" model through gr.Interface.load;
 # image2speech() calls the resulting object directly with the label text.
 # The debug print above is emitted before the load starts.
 fastspeech = gr.Interface.load("huggingface/facebook/fastspeech2-en-ljspeech")
 '''
 app.launch(cache_examples=True)
 '''
+print("sets input and outputs \n")
 # Input component: an image taken from the webcam.
 camera = gr.inputs.Image(label="Image from your camera", source="webcam")
 # Output components: the predicted label as text, and the synthesized audio.
 read = gr.outputs.Textbox(type="auto", label="Text")
 speak = gr.outputs.Audio(type="auto", label="Speech")
+print("define interface \n")
 # The outputs order [speak, read] matches image2speech()'s (speech, text) return.
 # NOTE(review): live=True presumably re-runs the function whenever the input
 # changes - confirm against the Gradio documentation.
 app = gr.Interface(fn=image2speech,
                    inputs=camera,
                    live=True,
                    description="Takes a snapshot of an object, identifies it, and then tell you what it is. \n Intended use is to help the visually impaired. Models and     dataset used is listed on the linked models and dataset",
                    outputs=[speak, read])
 # Each debug print in this section is emitted before the step it names.
+print("launch interface \n")
 app.launch()