Steven-GU-Yu-Di committed on
Commit
16ac1d6
1 Parent(s): c85c703

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +25 -33
app.py CHANGED
@@ -1,43 +1,35 @@
1
- import streamlit as st
2
- from transformers import pipeline
3
- from PIL import Image
4
 
5
- # Load the Visual Question Answering (VQA) model
6
- vqa_model = pipeline("text-generation", model="Steven-GU-Yu-Di/Visual-Question-Answering")
7
 
 
 
 
 
8
 
9
- # Load the Text-to-Speech (TTS) model
10
- tts = pipeline("text-to-audio", model="Steven-GU-Yu-Di/Text-to-Speech")
11
 
 
 
12
 
13
- # Create a Streamlit app
14
- st.title("Visual Question Answering and Text-to-Speech")
 
 
 
15
 
16
- # Sidebar for user inputs
17
- uploaded_image = st.file_uploader("Upload Image", type=["jpg", "jpeg", "png"])
18
- question_input = st.text_input("Enter Question")
19
 
20
- # Function to perform Visual Question Answering and Text-to-Speech
21
- def perform_vqa_and_tts(image, question):
22
- if image is not None and question:
23
- image = Image.open(image)
24
- st.image(image, caption="Uploaded Image", use_column_width=True)
25
- st.write("Question:", question)
26
 
27
- # Visual Question Answering
28
- vqa_input = {
29
- "question": question,
30
- "context": "This is an image.",
31
- }
32
- vqa_output = vqa_model(image=image, **vqa_input)
33
- answer = vqa_output['answer']
34
- st.write("Answer:", answer)
35
 
36
- # Text-to-Speech using TTS model
37
- audio_output = tts(answer)
38
- audio_bytes = audio_output[0]['audio']
39
- st.audio(audio_bytes, format='audio/wav')
40
 
41
- # Button to trigger Visual Question Answering and Text-to-Speech
42
- if st.button("Perform VQA and TTS"):
43
- perform_vqa_and_tts(uploaded_image, question_input)
 
1
+ import os
 
 
2
 
3
+ os.system('pip install torch')
4
+ os.system('pip install transformers')
5
 
6
+ from PIL import Image
7
+ import io
8
+ import streamlit as st
9
+ from transformers import pipeline
10
 
 
 
11
 
12
+ vqa_pipeline = pipeline("visual-question-answering", model="microsoft/git-base-vqav2")
13
+ tts_pipeline = pipeline("text-to-speech", "suno/bark")
14
 
15
+ def main():
16
+ st.title("Visual Question Answering & Text-to-Audio App")
17
+
18
+ image = st.file_uploader("Upload an image", type=["jpg", "png"])
19
+ question = st.text_input("Enter your question")
20
 
21
+ if image and question:
22
+ image = Image.open(io.BytesIO(image.getvalue()))
 
23
 
 
 
 
 
 
 
24
 
25
+ vqa_result = vqa_pipeline({"image": image, "question": question})
26
+ answer = vqa_result[0]['answer']
27
+ st.write(f"Answer: {answer}")
 
 
 
 
 
28
 
29
+ if st.button("Convert Answer to Audio"):
30
+ tts_result = tts_pipeline(answer)
31
+ audio_data = tts_result['audio']
32
+ st.audio(audio_data, format="audio/ogg")
33
 
34
+ if __name__ == "__main__":
35
+ main()