Steven-GU-Yu-Di committed on
Commit
afdfd46
1 Parent(s): 0b69f56

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +13 -8
app.py CHANGED
@@ -5,7 +5,12 @@ from gtts import gTTS
5
  import os
6
 
7
  # Load the Visual Question Answering (VQA) model
8
- #vqa_model = pipeline("question-answering")
 
 
 
 
 
9
 
10
  # Create a Streamlit app
11
  st.title("Visual Question Answering and Text-to-Speech")
@@ -14,8 +19,8 @@ st.title("Visual Question Answering and Text-to-Speech")
14
  uploaded_image = st.file_uploader("Upload Image", type=["jpg", "jpeg", "png"])
15
  question_input = st.text_input("Enter Question")
16
 
17
- # Function to perform Visual Question Answering
18
- def perform_vqa(image, question):
19
  if image is not None and question:
20
  image = Image.open(image)
21
  st.image(image, caption="Uploaded Image", use_column_width=True)
@@ -30,11 +35,11 @@ def perform_vqa(image, question):
30
  answer = vqa_output['answer']
31
  st.write("Answer:", answer)
32
 
33
- # Text-to-Speech using gTTS
34
- tts = gTTS(answer)
35
- tts.save("output.mp3")
36
- st.audio("output.mp3", format='audio/mp3')
37
 
38
  # Button to trigger Visual Question Answering and Text-to-Speech
39
  if st.button("Perform VQA and TTS"):
40
- perform_vqa(uploaded_image, question_input)
 
5
  import os
6
 
7
  # Load the Visual Question Answering (VQA) model
8
+ vqa_model = pipeline("text-generation", model="Steven-GU-Yu-Di/Visual-Question-Answering")
9
+
10
+
11
+ # Load the Text-to-Speech (TTS) model
12
+ tts = pipeline("text-to-audio", model="Steven-GU-Yu-Di/Text-to-Speech")
13
+
14
 
15
  # Create a Streamlit app
16
  st.title("Visual Question Answering and Text-to-Speech")
 
19
  uploaded_image = st.file_uploader("Upload Image", type=["jpg", "jpeg", "png"])
20
  question_input = st.text_input("Enter Question")
21
 
22
+ # Function to perform Visual Question Answering and Text-to-Speech
23
+ def perform_vqa_and_tts(image, question):
24
  if image is not None and question:
25
  image = Image.open(image)
26
  st.image(image, caption="Uploaded Image", use_column_width=True)
 
35
  answer = vqa_output['answer']
36
  st.write("Answer:", answer)
37
 
38
+ # Text-to-Speech using TTS model
39
+ audio_output = tts(answer)
40
+ audio_bytes = audio_output[0]['audio']
41
+ st.audio(audio_bytes, format='audio/wav')
42
 
43
  # Button to trigger Visual Question Answering and Text-to-Speech
44
  if st.button("Perform VQA and TTS"):
45
+ perform_vqa_and_tts(uploaded_image, question_input)