jitendra.kasaudhan committed on
Commit
c22d94a
1 Parent(s): 693660f

Initial project setup with all the required steps to run the project

Files changed (2)
  1. app.py +100 -0
  2. requirements.txt +6 -0
app.py ADDED
@@ -0,0 +1,100 @@
+ from dotenv import load_dotenv, find_dotenv
+ from transformers import pipeline
+ from langchain import LLMChain, OpenAI, PromptTemplate
+
+ import requests
+ import os
+
+ # UI layer
+ import streamlit as st
+
+ load_dotenv(find_dotenv())
+
+ HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN')
+
+ # The app involves 3 steps: image to text, text to story, and text to speech
+
+ # Step 1: image to text
+ def image_to_text(url, use_api=True):
+     if use_api:
+         # Call the hosted Inference API instead of loading the model locally
+         API_URL = "https://api-inference.huggingface.co/models/Salesforce/blip-image-captioning-large"
+         headers = {"Authorization": f"Bearer {HUGGINGFACEHUB_API_TOKEN}"}
+
+         # The image has already been saved locally under its basename (see main below)
+         filename = url.split("/")[-1]
+         with open(filename, "rb") as f:
+             data = f.read()
+         response = requests.post(API_URL, headers=headers, data=data)
+         return response.json()[0]['generated_text']
+
+     # Otherwise, download the model and run it locally, which is slow
+     captioner = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
+     # captioner("https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png")
+     result = captioner(url)
+     return result[0]['generated_text']
+
+ ## [{'generated_text': 'two birds are standing next to each other '}]
+
+ # Step 2: LLM generates a short story from the caption
+ def generate_story(story_idea):
+     template = """
+     You are a professional story teller;
+     Generate a short story based on a simple narrative, the story should be no more than 50 words;
+     CONTEXT: {story_idea}
+     STORY:
+     """
+     prompt = PromptTemplate(input_variables=["story_idea"], template=template)
+
+     story_llm = LLMChain(llm=OpenAI(model_name='gpt-3.5-turbo-0301', temperature=1), prompt=prompt, verbose=True)
+     story = story_llm.run(story_idea)
+     return story
+
+ # Step 3: text to speech
+ def text_to_speech(story):
+     API_URL = "https://api-inference.huggingface.co/models/espnet/kan-bayashi_ljspeech_vits"
+     headers = {"Authorization": f"Bearer {HUGGINGFACEHUB_API_TOKEN}"}
+
+     payloads = {
+         "inputs": story
+     }
+
+     # The Inference API returns raw audio bytes, saved here as a FLAC file
+     response = requests.post(API_URL, headers=headers, json=payloads)
+     with open("story_audio.flac", "wb") as file:
+         file.write(response.content)
+
+ # caption = image_to_text("https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png")
+ # story = generate_story(story_idea="Two parrots singing a song")
+ # text_to_speech(story="Two parrots singing a song")
+
+ def main():
+     st.set_page_config(page_title="Upload any image to hear a nice story")
+
+     st.header("Listen to what your image has to tell you. JK DEMO APP")
+
+     uploaded_file = st.file_uploader("Choose an image...", type="jpg")
+     if uploaded_file is not None:
+         print(uploaded_file)
+         # Save the uploaded image locally so image_to_text can read it by filename
+         bytes_data = uploaded_file.getvalue()
+         with open(uploaded_file.name, "wb") as file:
+             file.write(bytes_data)
+
+         st.image(uploaded_file, caption="Uploaded image", use_column_width=True)
+
+         image_description = image_to_text(uploaded_file.name, use_api=True)
+
+         # Display image description on FE
+         with st.expander("Image Description"):
+             st.write(image_description)
+
+         story = generate_story(story_idea=image_description)
+         # Nepali intro, roughly: "This is Radio Nepal, presenting a story you must hear:"
+         story_starter_text = "Yo ho Radio Nepal, prastut xa sun nai parne katha: "
+         story = story_starter_text + story
+
+         # Display story text on FE
+         with st.expander("Story"):
+             st.write(story)
+
+         # Display audio player on FE
+         text_to_speech(story=story)
+         st.audio("story_audio.flac")
+
+ if __name__ == '__main__':
+     main()
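
For a quick sanity check of the three steps outside the Streamlit UI, they can be chained directly, much like the commented-out calls above. The sketch below is illustrative and not part of this commit; it assumes an image file such as parrots.png sits in the working directory and that HUGGINGFACEHUB_API_TOKEN and OPENAI_API_KEY are available in the environment (the script name run_pipeline.py is made up for the example).

# run_pipeline.py - illustrative sketch, not included in this commit
from app import image_to_text, generate_story, text_to_speech

# image_to_text(use_api=True) reads the file by its basename, so a bare
# filename in the working directory works here (assumed example image)
caption = image_to_text("parrots.png", use_api=True)
print("Caption:", caption)

story = generate_story(story_idea=caption)
print("Story:", story)

# Writes story_audio.flac into the working directory
text_to_speech(story=story)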
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ python-dotenv
+ transformers
+ langchain
+ tensorflow
+ openai
+ streamlit
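
To run the app locally (a sketch of the usual Streamlit workflow; the commit itself does not spell out run instructions): install the dependencies with pip install -r requirements.txt, put HUGGINGFACEHUB_API_TOKEN and an OPENAI_API_KEY (needed by the LangChain OpenAI wrapper) in a .env file next to app.py, then launch the app with streamlit run app.py.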