Joe Meng committed
Commit 18213b1
1 Parent(s): da831a3

basic img2story

Files changed (6):
  1. .gitignore +1 -0
  2. app.py +63 -3
  3. audio.flac +0 -0
  4. requirements.txt +5 -1
  5. test.jpg +0 -0
  6. test1.jpeg +0 -0
.gitignore ADDED
@@ -0,0 +1 @@
+ .env
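
The ignored `.env` file is where the keys that `load_dotenv(find_dotenv())` pulls into the environment would live. A minimal sketch with placeholder values (`HF_API_KEY` is read explicitly in app.py; `OPENAI_API_KEY` is the variable the `openai` client picks up by convention):

```
# .env — kept out of git; values below are placeholders, not real keys
HF_API_KEY=hf_xxxxxxxx
OPENAI_API_KEY=sk-xxxxxxxx
```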
app.py CHANGED
@@ -1,23 +1,83 @@
  from dotenv import find_dotenv, load_dotenv
  from transformers import pipeline
+ from langchain import PromptTemplate, LLMChain, OpenAI
+ import requests
+ import os
+ import streamlit as st

  load_dotenv(find_dotenv())
+ HF_API_KEY = os.getenv("HF_API_KEY")

  # img2text
  def img2text(url):
-     image_to_text_model = pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-captioning")
-     text = image_to_text_model(url)
+     image_to_text_model = pipeline("image-to-text", model="Salesforce/blip-image-captioning-large")
+     text = image_to_text_model(url)[0]["generated_text"]

      print(text)
      return text

- img2text("test.jpg")
+
  # make the story of it using LLM
+ def generate_story(scenario):
+     template = """
+     You are a story teller;
+     You can generate a short story based on a simple narrative, the story should be no more than 30 words;
+
+     CONTEXT: {scenario}
+     STORY:
+     """
+
+     prompt = PromptTemplate(template=template, input_variables=["scenario"])
+
+     story_llm = LLMChain(llm=OpenAI(model_name="gpt-4", temperature=1), prompt=prompt, verbose=True)
+     story = story_llm.predict(scenario=scenario).replace('"', '')
+
+     print(story)
+     return story



  # text to speech
+ def text2speech(message):
+     API_URL = "https://api-inference.huggingface.co/models/espnet/kan-bayashi_ljspeech_vits"
+     headers = {"Authorization": f"Bearer {HF_API_KEY}"}
+     payload = {
+         "inputs": message
+     }
+
+     response = requests.post(API_URL, headers=headers, json=payload)
+     with open('audio.flac', 'wb') as file:
+         file.write(response.content)
+
+ # generate_story(img2text("test1.jpeg"))
+ # text2speech("Access tokens programmatically authenticate your identity to the Hugging Face Hub")
+
+ def main():
+     st.set_page_config(page_title="image-to-audio-story", page_icon="😊")
+     st.header("Image to audio story")
+     uploaded_file = st.file_uploader("Choose an image", type=['png', 'jpg'])
+
+     if uploaded_file is not None:
+         print(uploaded_file)
+         bytes_data = uploaded_file.getvalue()
+         with open(uploaded_file.name, "wb") as file:
+             file.write(bytes_data)

+         st.image(uploaded_file, caption="Uploaded Image", use_column_width=True)

+         st.text('Processing img2text...')
+         scenario = img2text(uploaded_file.name)
+         with st.expander("scenario"):
+             st.write(scenario)

+         st.text('Generating story on given scenario...')
+         story = generate_story(scenario)
+         with st.expander("story"):
+             st.write(story)
+
+         st.text('Processing text2speech...')
+         text2speech(story)
+         st.audio("audio.flac")

+ if __name__ == '__main__':
+     main()
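
Taken together, the new functions form a caption → story → speech chain, with `main()` wiring them into a Streamlit page. A minimal sketch of driving the chain without the UI (assumes `app.py` is importable, `test.jpg` exists, and the keys above are set in `.env`):

```python
# Sketch: run the new pipeline end to end, bypassing the Streamlit UI.
# Assumes test.jpg exists and .env provides HF_API_KEY / OPENAI_API_KEY.
from app import img2text, generate_story, text2speech

scenario = img2text("test.jpg")   # BLIP caption of the image
story = generate_story(scenario)  # ~30-word story expanded from the caption
text2speech(story)                # writes audio.flac via the HF Inference API
```

One caveat: `text2speech` writes `response.content` straight to audio.flac without checking the status code, so a still-loading model or a bad token silently yields an unplayable file; a `response.raise_for_status()` before the write would surface that.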
audio.flac ADDED
Binary file (460 kB)
 
requirements.txt CHANGED
@@ -1,2 +1,6 @@
  dotenv
- transformers
+ transformers
+ langchain
+ openai
+ requests
+ streamlit
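
One likely pitfall in this list: `from dotenv import find_dotenv, load_dotenv` is provided by the `python-dotenv` distribution, not the PyPI package named `dotenv`, so the first entry probably wants to read (a sketch, versions left unpinned):

```
python-dotenv
transformers
langchain
openai
requests
streamlit
```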
test.jpg ADDED
test1.jpeg ADDED