# image-to-story / app.py
# HF Space by jitubutwal1441 — commit 55e017c ("Generate story in english")
from dotenv import load_dotenv, find_dotenv
from transformers import pipeline
from langchain import LLMChain, OpenAI, PromptTemplate
import requests
import os
# UI layer
import streamlit as st
load_dotenv(find_dotenv())
HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN')
# It involves 3 steps
# image to text
def image_to_text(url, use_api=True):
if use_api:
API_URL = "https://api-inference.huggingface.co/models/Salesforce/blip-image-captioning-large"
headers = {"Authorization": f"Bearer {HUGGINGFACEHUB_API_TOKEN}"}
filename = url.split("/")[-1]
with open(filename, "rb") as f:
data = f.read()
response = requests.post(API_URL, headers=headers, data=data)
return response.json()[0]['generated_text']
# Download the model and use it, which is slow
captioner = pipeline("image-to-text",model="Salesforce/blip-image-captioning-base")
# captioner("https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png")
result = captioner(url)
return result[0]['generated_text']
## [{'generated_text': 'two birds are standing next to each other '}]
# LLM
def generate_story(story_idea):
# template = """
# You are a professional song writter;
# Generate a song based on a simple narrative, the song should be no more than 100 words.
# Song should be in Nepali language
# CONTEXT: {story_idea}
# STORY:
# """
template = """
you are a song writer, write a song using following context:
{story_idea}.
Song should not be more than 150 words. It should be in English language.
"""
prompt = PromptTemplate(input_variables=["story_idea"], template=template)
story_llm = LLMChain(llm=OpenAI(model_name='gpt-3.5-turbo-0301', temperature=1), prompt=prompt, verbose=True)
story = story_llm.run(story_idea)
return story
# text to speech
def text_to_speech(story):
API_URL = "https://api-inference.huggingface.co/models/espnet/kan-bayashi_ljspeech_vits"
headers = {"Authorization": f"Bearer {HUGGINGFACEHUB_API_TOKEN}"}
payloads = {
"inputs": story
}
response = requests.post(API_URL, headers=headers, json=payloads)
with open("story_audio.flac", "wb") as file:
file.write(response.content)
# caption = image_to_text("https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png")
# story = generate_story(story_idea="Two parrots singing a song")
# text_to_speech(story="Two parrots singing a song")
def main():
st.set_page_config(page_title="Upload any image to hear a nice story")
st.header("Listen to what your image has to tell you. JK DEMO APP")
uploaded_file = st.file_uploader("Choose an image...", type="jpg")
if uploaded_file is not None:
print(uploaded_file)
bytes_data = uploaded_file.getvalue()
with open(uploaded_file.name, "wb") as file:
file.write(bytes_data)
st.image(uploaded_file, caption="Uploaded image", use_column_width=True)
image_description = image_to_text(uploaded_file.name, use_api=True)
# Display image description on FE
with st.expander("Image Description"):
st.write(image_description)
story = generate_story(story_idea=image_description)
# story_starter_text = "Yo ho Radio Nepal, prastut xa sun nai parne katha: "
story_starter_text = ""
story = story_starter_text + story
# Display story text on FE
with st.expander("Story"):
st.write(story)
# Display audio player on FE
text_to_speech(story=story)
st.audio("story_audio.flac")
if __name__ == '__main__':
main()