matanmichaely committed on
Commit
3c37a29
1 Parent(s): d7ef93f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +36 -23
app.py CHANGED
@@ -1,14 +1,15 @@
1
  from dotenv import find_dotenv, load_dotenv
2
  from transformers import pipeline
 
 
 
 
3
  import streamlit as st
4
- import os
5
 
6
-
7
- # load env variables from .env file
8
  load_dotenv(find_dotenv())
9
 
10
- # img to text
11
- def img_to_text(url):
12
  image_to_text = pipeline("image-to-text", model="Salesforce/blip-image-captioning-large")
13
 
14
  text = image_to_text(url)[0]["generated_text"]
@@ -16,33 +17,44 @@ def img_to_text(url):
16
 
17
 
18
  # llm
19
def generate_story(text):
    """Continue *text* using the ``distilgpt2`` text-generation pipeline.

    A fresh pipeline is built on every call. Exactly one sequence is
    requested with ``max_length=20`` and its ``generated_text`` field is
    returned.
    """
    story_pipeline = pipeline("text-generation", model="distilgpt2")
    candidates = story_pipeline(text, num_return_sequences=1, max_length=20)
    first_candidate = candidates[0]
    return first_candidate['generated_text']
 
24
 
25
 
26
  #
27
  # text-to-speech
28
def text_to_speech(text):
    """Synthesize *text* to speech through the Hugging Face Inference API.

    Posts the text to the hosted ``espnet/kan-bayashi_ljspeech_vits`` model
    and writes the raw response body to ``audio.flac`` in the current
    working directory.

    Raises:
        RuntimeError: if ``HUGGINGFACE_API_TOKEN`` is not set.
        requests.HTTPError: if the API answers with an error status.
        requests.Timeout: if the API does not answer within the timeout.
    """
    token = os.environ.get('HUGGINGFACE_API_TOKEN')
    if not token:
        # Otherwise the header would literally read "Bearer None" and the
        # failure would only surface as an opaque HTTP error from the API.
        raise RuntimeError("HUGGINGFACE_API_TOKEN is not set")

    import requests

    API_URL = "https://api-inference.huggingface.co/models/espnet/kan-bayashi_ljspeech_vits"
    headers = {"Authorization": f"Bearer {token}"}
    payload = {
        "inputs": text
    }

    # (connect, read) timeout: requests waits forever by default, which would
    # hang the app if the endpoint stalls.
    response = requests.post(API_URL, headers=headers, json=payload, timeout=(10, 120))
    response.raise_for_status()
    with open('audio.flac', 'wb') as file:
        file.write(response.content)
41
 
 
 
 
42
 
43
 
44
  def main():
45
- st.set_page_config(page_title="img to audio story")
46
  st.header("turn image to audio story")
47
  uploaded_file = st.file_uploader("Choose an image ... ", type="jpg")
48
 
@@ -52,7 +64,7 @@ def main():
52
  with open(uploaded_file.name, "wb") as file:
53
  file.write(bytes_data)
54
  st.image(uploaded_file, caption="Uploaded image", use_column_width=True)
55
- text = img_to_text(uploaded_file.name)
56
  story = generate_story(text)
57
  text_to_speech(story)
58
 
@@ -60,6 +72,7 @@ def main():
60
  st.write(text)
61
  with st.expander("story"):
62
  st.write(story)
63
- st.audio("audio.flac")
 
64
 
65
  main()
 
1
  from dotenv import find_dotenv, load_dotenv
2
  from transformers import pipeline
3
+ from transformers import AutoProcessor, AutoModel
4
+ from langchain import PromptTemplate, LLMChain
5
+ from langchain.llms import GooglePalm
6
+ import scipy
7
  import streamlit as st
 
8
 
 
 
9
  load_dotenv(find_dotenv())
10
 
11
+ # img2text
12
+ def img_2_text(url):
13
  image_to_text = pipeline("image-to-text", model="Salesforce/blip-image-captioning-large")
14
 
15
  text = image_to_text(url)[0]["generated_text"]
 
17
 
18
 
19
  # llm
20
# Prompt sent to the story LLM; ``{scenario}`` is replaced by the caller's text.
# The original literal opened with four quotes (``""""``), which put a stray
# double-quote character at the very start of every prompt.
STORY_TEMPLATE = """
    You are a story teller;
    you can generate a creative fun story based on a sample narrative, the story should not be more than 100 words;
    CONTEXT: {scenario}
    STORY:
    """


def generate_story(scenario):
    """Generate a short story from *scenario* with a GooglePalm LLM chain.

    Args:
        scenario: Text substituted into the ``CONTEXT`` slot of the prompt.

    Returns:
        The value returned by ``LLMChain.predict`` for the filled-in prompt.
    """
    prompt = PromptTemplate(
        template=STORY_TEMPLATE,
        input_variables=['scenario'],
    )
    llm = GooglePalm(temperature=0.7)

    story_llm = LLMChain(llm=llm, prompt=prompt, verbose=True)

    return story_llm.predict(scenario=scenario)
38
 
39
 
40
  #
41
  # text-to-speech
42
def text_to_speech(text):
    """Synthesize *text* with the ``suno/bark-small`` model into ``audio.wav``.

    Loads the processor and model on every call, generates audio with
    sampling enabled, and writes the result to ``audio.wav`` in the current
    working directory. The sample rate is taken from the model's generation
    config.

    Args:
        text: The text to speak; it is passed to the processor as a
            one-element batch.
    """
    # Import the submodule explicitly: a bare ``import scipy`` is not
    # guaranteed to make ``scipy.io.wavfile`` available as an attribute.
    from scipy.io import wavfile

    processor = AutoProcessor.from_pretrained("suno/bark-small")
    model = AutoModel.from_pretrained("suno/bark-small")

    inputs = processor(
        text=[text],
        return_tensors="pt",
    )

    speech_values = model.generate(**inputs, do_sample=True)
    sampling_rate = model.generation_config.sample_rate
    # Move to CPU, convert to NumPy and drop singleton dimensions before writing.
    waveform = speech_values.cpu().numpy().squeeze()
    wavfile.write("audio.wav", rate=sampling_rate, data=waveform)
54
 
55
 
56
  def main():
57
+ st.set_page_config(page_title="img 2 audio story")
58
  st.header("turn image to audio story")
59
  uploaded_file = st.file_uploader("Choose an image ... ", type="jpg")
60
 
 
64
  with open(uploaded_file.name, "wb") as file:
65
  file.write(bytes_data)
66
  st.image(uploaded_file, caption="Uploaded image", use_column_width=True)
67
+ text = img_2_text(uploaded_file.name)
68
  story = generate_story(text)
69
  text_to_speech(story)
70
 
 
72
  st.write(text)
73
  with st.expander("story"):
74
  st.write(story)
75
+ st.audio("audio.wav")
76
+
77
 
78
  main()