Steven-GU-Yu-Di committed
Commit b82e401
Parent: fb8195d

Update app.py

Files changed (1)
1. app.py +42 -24
app.py CHANGED
@@ -1,28 +1,46 @@
+import os
+
+os.system('pip install -r requirements.txt')  # install dependencies at startup
+
 import streamlit as st
+from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
+from datasets import load_dataset
+import torch
+import soundfile as sf
 from transformers import pipeline
 from PIL import Image
+import io
+
+st.title('Visual question answering and text to speech app')
+
+
+image = st.file_uploader("Upload an image", type=["jpg", "png"])
+
+question = st.text_input(
+    label="Enter your question",
+    value="How many people and what is the color of this image?"
+)
+
+def generate_speech(text):
+    processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
+    model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
+    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
+    inputs = processor(text=text, return_tensors="pt")
+
+    embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
+    speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)  # fixed speaker voice
+
+    speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
+
+    sf.write("speech.wav", speech.numpy(), samplerate=16000)
 
-# Load the text classification model
-classifier = pipeline("text-generation", model="Steven-GU-Yu-Di/Visual-Question-Answering")
-
-# Create a Streamlit app
-st.title("Image and Text Classification")
-
-# Sidebar for user inputs
-st.sidebar.title("Input")
-uploaded_image = st.sidebar.file_uploader("Upload Image", type=["jpg", "jpeg", "png"])
-text_input = st.sidebar.text_input("Enter Text Description")
-
-# Function to classify image and text
-def classify(image, text):
-    if image is not None and text:
-        image = Image.open(image)
-        st.image(image, caption="Uploaded Image", use_column_width=True)
-        st.write("Text Description:", text)
-        result = classifier(text)
-        st.write("Classification Result:")
-        st.write(result)
-
-# Button to trigger classification
-if st.sidebar.button("Classify"):
-    classify(uploaded_image, text_input)
+if st.button("Generate"):
+    image = Image.open(io.BytesIO(image.getvalue()))
+    vqa_pipeline = pipeline("visual-question-answering", model="dandelin/vilt-b32-finetuned-vqa")
+    vqa_result = vqa_pipeline({"image": image, "question": question})
+    answer = vqa_result[0]['answer']
+    st.write(f"Question: {question} Answer: {answer}")  # display the answer
+    generate_speech(f"Question: {question}, Answer: {answer}")
+    audio_file = open("speech.wav", 'rb')
+    audio_bytes = audio_file.read()
+    st.audio(audio_bytes, format="audio/wav")
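A side note on the new script: it re-downloads and re-instantiates the SpeechT5 processor, model, vocoder, and the speaker-embedding dataset on every press of "Generate", and Image.open raises if the button is clicked before an image is uploaded. Below is a minimal sketch of how the same app could cache the heavy objects with Streamlit's st.cache_resource and guard the missing upload; the helper names load_tts and load_vqa are illustrative, not part of this commit.

import io

import soundfile as sf
import streamlit as st
import torch
from datasets import load_dataset
from PIL import Image
from transformers import (SpeechT5ForTextToSpeech, SpeechT5HifiGan,
                          SpeechT5Processor, pipeline)

@st.cache_resource  # run once per process; later reruns reuse the cached objects
def load_tts():
    processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
    model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
    embeddings = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
    speaker = torch.tensor(embeddings[7306]["xvector"]).unsqueeze(0)
    return processor, model, vocoder, speaker

@st.cache_resource
def load_vqa():
    return pipeline("visual-question-answering",
                    model="dandelin/vilt-b32-finetuned-vqa")

st.title("Visual question answering and text to speech app")
image = st.file_uploader("Upload an image", type=["jpg", "png"])
question = st.text_input("Enter your question",
                         value="How many people and what is the color of this image?")

if st.button("Generate"):
    if image is None:  # guard: the committed version crashes here without an upload
        st.warning("Please upload an image first.")
        st.stop()
    pil_image = Image.open(io.BytesIO(image.getvalue()))
    answer = load_vqa()(image=pil_image, question=question)[0]["answer"]
    st.write(f"Question: {question} Answer: {answer}")
    processor, model, vocoder, speaker = load_tts()
    inputs = processor(text=f"Question: {question}, Answer: {answer}",
                       return_tensors="pt")
    speech = model.generate_speech(inputs["input_ids"], speaker, vocoder=vocoder)
    buffer = io.BytesIO()  # keep the WAV in memory instead of a shared file on disk
    sf.write(buffer, speech.numpy(), samplerate=16000, format="WAV")
    st.audio(buffer.getvalue(), format="audio/wav")

Writing the audio to an in-memory buffer also avoids the speech.wav file that two concurrent sessions of the committed version would overwrite.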