from dotenv import load_dotenv, find_dotenv
from transformers import pipeline
from langchain import LLMChain, OpenAI, PromptTemplate
import requests
import os

# UI layer
import streamlit as st

load_dotenv(find_dotenv())
HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN')

# The app is a 3-step pipeline: image -> caption -> story/song -> audio.


def image_to_text(url, use_api=True):
    """Return a text caption for the image referenced by *url*.

    When ``use_api`` is True, the file named by the last path segment of
    *url* is read from the local working directory (the caller is expected
    to have saved the upload there) and POSTed to the hosted BLIP
    captioning model. Otherwise the model is downloaded and run locally
    through a ``transformers`` pipeline, which is slow.

    Raises:
        requests.HTTPError: if the inference API responds with an HTTP error.
        RuntimeError: if the API returns an application-level error payload.
    """
    if use_api:
        API_URL = "https://api-inference.huggingface.co/models/Salesforce/blip-image-captioning-large"
        headers = {"Authorization": f"Bearer {HUGGINGFACEHUB_API_TOKEN}"}
        filename = url.split("/")[-1]
        with open(filename, "rb") as f:
            data = f.read()
        response = requests.post(API_URL, headers=headers, data=data)
        # Surface transport-level failures instead of a cryptic KeyError below.
        response.raise_for_status()
        payload = response.json()
        # Success looks like:
        #   [{'generated_text': 'two birds are standing next to each other '}]
        # while errors (e.g. model still loading, bad token) come back as a
        # dict with an 'error' key.
        if isinstance(payload, dict) and "error" in payload:
            raise RuntimeError(f"HuggingFace API error: {payload['error']}")
        return payload[0]['generated_text']

    # Fallback: download the model and run it locally, which is slow.
    captioner = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
    result = captioner(url)
    return result[0]['generated_text']


def generate_story(story_idea):
    """Generate a short English song lyric (<=150 words) from *story_idea*.

    Uses an LLMChain over OpenAI's gpt-3.5-turbo-0301 at temperature 1
    (high creativity). Returns the generated text.
    """
    template = """ you are a song writer, write a song using following context: {story_idea}. Song should not be more than 150 words. It should be in English language. 
"""
    prompt = PromptTemplate(input_variables=["story_idea"], template=template)
    story_llm = LLMChain(
        llm=OpenAI(model_name='gpt-3.5-turbo-0301', temperature=1),
        prompt=prompt,
        verbose=True,
    )
    return story_llm.run(story_idea)


def text_to_speech(story):
    """Synthesize *story* to speech and write it to ``story_audio.flac``.

    Raises:
        requests.HTTPError: if the TTS API responds with an HTTP error —
            without this check an error body (JSON/HTML) would be written
            into the .flac file and fail silently at playback time.
    """
    API_URL = "https://api-inference.huggingface.co/models/espnet/kan-bayashi_ljspeech_vits"
    headers = {"Authorization": f"Bearer {HUGGINGFACEHUB_API_TOKEN}"}
    payloads = {
        "inputs": story
    }
    response = requests.post(API_URL, headers=headers, json=payloads)
    response.raise_for_status()
    with open("story_audio.flac", "wb") as file:
        file.write(response.content)


def main():
    """Streamlit entry point: upload an image, caption it, turn the caption
    into a song, and play the synthesized audio."""
    st.set_page_config(page_title="Upload any image to hear a nice story")
    st.header("Listen to what your image has to tell you. JK DEMO APP")
    uploaded_file = st.file_uploader("Choose an image...", type="jpg")
    if uploaded_file is not None:
        # Persist the upload to disk so image_to_text can re-read it by name.
        bytes_data = uploaded_file.getvalue()
        with open(uploaded_file.name, "wb") as file:
            file.write(bytes_data)
        st.image(uploaded_file, caption="Uploaded image", use_column_width=True)

        image_description = image_to_text(uploaded_file.name, use_api=True)
        # Display image description on FE
        with st.expander("Image Description"):
            st.write(image_description)

        story = generate_story(story_idea=image_description)
        # Display story text on FE
        with st.expander("Story"):
            st.write(story)

        # Display audio player on FE
        text_to_speech(story=story)
        st.audio("story_audio.flac")


if __name__ == '__main__':
    main()