ronniet commited on
Commit
5bad71b
1 Parent(s): 0349c26

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +4 -4
app.py CHANGED
@@ -26,7 +26,7 @@ def tts(text):
26
 
27
  # limit input length
28
  input_ids = inputs["input_ids"]
29
- input_ids = input_ids[..., :model.config.max_text_positions]
30
 
31
  # if speaker == "Surprise Me!":
32
  # # load one of the provided speaker embeddings at random
@@ -58,7 +58,7 @@ def tts(text):
58
  # tts = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts", progress_bar=False, gpu=False)
59
 
60
 
61
- def predict(image):
62
  # text = captioner(image)[0]["generated_text"]
63
 
64
  # audio_output = "output.wav"
@@ -66,7 +66,7 @@ def predict(image):
66
 
67
  pixel_values = vqa_processor(images=image, return_tensors="pt").pixel_values
68
 
69
- prompt = "what is in the scene?"
70
  prompt_ids = vqa_processor(text=prompt, add_special_tokens=False).input_ids
71
  prompt_ids = [vqa_processor.tokenizer.cls_token_id] + prompt_ids
72
  prompt_ids = torch.tensor(prompt_ids).unsqueeze(0)
@@ -81,7 +81,7 @@ def predict(image):
81
 
82
  demo = gr.Interface(
83
  fn=predict,
84
- inputs=gr.Image(type="pil",label="Environment"),
85
  outputs=[gr.Textbox(label="Caption"), gr.Audio(type="numpy",label="Audio Feedback")],
86
  css=".gradio-container {background-color: #002A5B}",
87
  theme=gr.themes.Soft()
 
26
 
27
  # limit input length
28
  input_ids = inputs["input_ids"]
29
+ input_ids = input_ids[..., :tts_model.config.max_text_positions]
30
 
31
  # if speaker == "Surprise Me!":
32
  # # load one of the provided speaker embeddings at random
 
58
  # tts = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts", progress_bar=False, gpu=False)
59
 
60
 
61
+ def predict(image, prompt):
62
  # text = captioner(image)[0]["generated_text"]
63
 
64
  # audio_output = "output.wav"
 
66
 
67
  pixel_values = vqa_processor(images=image, return_tensors="pt").pixel_values
68
 
69
+ # prompt = "what is in the scene?"
70
  prompt_ids = vqa_processor(text=prompt, add_special_tokens=False).input_ids
71
  prompt_ids = [vqa_processor.tokenizer.cls_token_id] + prompt_ids
72
  prompt_ids = torch.tensor(prompt_ids).unsqueeze(0)
 
81
 
82
  demo = gr.Interface(
83
  fn=predict,
84
+ inputs=[gr.Image(type="pil",label="Environment"), gr.Textbox(label="Prompt", value="What is in the scene?")],
85
  outputs=[gr.Textbox(label="Caption"), gr.Audio(type="numpy",label="Audio Feedback")],
86
  css=".gradio-container {background-color: #002A5B}",
87
  theme=gr.themes.Soft()