GeorgiosIoannouCoder's picture
Update app.py
b04be94 verified
raw
history blame
6.48 kB
#############################################################################################################################
# Filename : app.py
# Description: A Streamlit application to turn an image into an audio story.
# Author : Georgios Ioannou
#
# Copyright © 2024 by Georgios Ioannou
#############################################################################################################################
# Import libraries.
import os # Load environment variable(s).
import requests # Send HTTP POST request to Hugging Face models for inference.
import streamlit as st # Build the GUI of the application.
from langchain.chat_models import ChatOpenAI # Access to OpenAI gpt-3.5-turbo model.
from langchain.chains import LLMChain # Chain to run queries against LLMs.
# A prompt template. It accepts a set of parameters from the user that can be used to generate a prompt for a language model.
from langchain.prompts import PromptTemplate
from transformers import pipeline # Access to Hugging Face models.
#############################################################################################################################
# Hugging Face API token, read once from the process environment.
# The value is None when the variable is not set.
HUGGINGFACEHUB_API_TOKEN = os.environ.get("HUGGINGFACEHUB_API_TOKEN")
#############################################################################################################################
# Function to apply local CSS.
def local_css(file_name):
    """Inject the contents of a local CSS file into the Streamlit page.

    Args:
        file_name: Path of the CSS file to read.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(file_name, encoding="utf-8") as css_file:
        css = css_file.read()
    # The stylesheet is wrapped in a <style> tag and passed as raw HTML.
    st.markdown(f"<style>{css}</style>", unsafe_allow_html=True)
#############################################################################################################################
# Return the text generated by the model for the image.
# Using pipeline.
@st.cache_resource(show_spinner=False)
def _load_image_captioner():
    """Build the image-captioning pipeline once and reuse it across reruns."""
    # https://huggingface.co/tasks
    # Task used here : "image-to-text".
    # Model used here: "Salesforce/blip-image-captioning-base".
    # Backup model: "nlpconnect/vit-gpt2-image-captioning".
    return pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")


def img_to_text(image_path):
    """Return the caption generated by the image-to-text model for an image.

    Args:
        image_path: Path of the image file to caption.

    Returns:
        The "generated_text" field of the first result returned by the
        pipeline.
    """
    # Constructing the pipeline is the expensive step, so it is cached instead
    # of being rebuilt on every call.
    captioner = _load_image_captioner()
    results = captioner(image_path)
    return results[0]["generated_text"]
#############################################################################################################################
# Return the story generated by the model for the scenario.
# Using Langchain.
def generate_story(scenario, personality):
    """Return a short story about a scenario, told in a personality's voice.

    Args:
        scenario: Text describing the scene the story is based on.
        personality: Name of the persona the story teller must sound like.

    Returns:
        The text returned by the chain's predict call.
    """
    # Prompt sent to the model; {personality} and {scenario} are filled in
    # from the keyword arguments given to predict() below.
    # The wording can be customized, e.g. to generate song lyrics instead.
    prompt_text = """
You are a story teller.
You must sound like {personality}.
The story should be less than 50 words.
Generate a story based on the above constraints and the following scenario: {scenario}.
"""
    story_prompt = PromptTemplate(
        template=prompt_text, input_variables=["scenario", "personality"]
    )

    # Model used here: "gpt-3.5-turbo", with temperature fixed at 0.
    chat_model = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

    # verbose=True prints intermediate values to the console.
    chain = LLMChain(llm=chat_model, prompt=story_prompt, verbose=True)

    # Format the prompt with the keyword arguments and pass it to the LLM.
    return chain.predict(scenario=scenario, personality=personality)
#############################################################################################################################
# Return the speech generated by the model for the story.
# Using inference api.
def text_to_speech(story):
    """Convert a story to speech and save the audio to "audio.flac".

    The story is posted to the Hugging Face Inference API and the response
    body is written to "audio.flac" in the current working directory.

    Args:
        story: Text to convert to speech.

    Raises:
        requests.HTTPError: If the API answers with an error status code.
        requests.RequestException: On connection failures or timeouts.
        OSError: If "audio.flac" cannot be written.
    """
    # Model used here: "espnet/kan-bayashi_ljspeech_vits".
    # Backup model: "facebook/mms-tts-eng".
    API_URL = (
        "https://api-inference.huggingface.co/models/espnet/kan-bayashi_ljspeech_vits"
    )
    headers = {"Authorization": f"Bearer {HUGGINGFACEHUB_API_TOKEN}"}
    payload = {"inputs": story}

    # Bound the wait so a stalled request cannot hang the app indefinitely.
    response = requests.post(API_URL, headers=headers, json=payload, timeout=120)

    # Fail loudly on an error response instead of saving its body as if it
    # were audio data.
    response.raise_for_status()

    with open("audio.flac", "wb") as file:
        file.write(response.content)
#############################################################################################################################
# Main function to create the Streamlit web application.
def main():
    """Build the Streamlit page that turns an uploaded image into an audio story.

    The page offers a personality dropdown and an image uploader. Once an image
    is uploaded it is captioned, a story is generated from the caption, the
    story is converted to speech, and the caption, story and audio are shown.
    Any exception raised along the way is displayed on the page with st.error.
    """
    # Local import: only this function needs it.
    import tempfile

    try:
        # Page title and favicon.
        st.set_page_config(page_title="Image To Audio Story", page_icon="🖼️")

        # Load CSS.
        local_css("styles/style.css")

        # Title.
        title = """<h1 align="center" style="font-family: monospace; font-size: 2.1rem; margin-top: -6rem">
Turn Image to Audio Story</h1>"""
        st.markdown(title, unsafe_allow_html=True)

        # Define the personalities for the dropdown menu.
        personalities = [
            "Donald Trump",
            "Abraham Lincoln",
            "Aristotle",
            "Cardi B",
            "Kanye West",
        ]
        personality = st.selectbox("Select a personality:", personalities)

        # Upload an image.
        uploaded_file = st.file_uploader("Choose an image:")

        if uploaded_file is not None:
            # Display the uploaded image.
            st.image(uploaded_file, caption="Uploaded Image.", use_column_width=True)

            # The uploaded file name comes from the client, so it must not be
            # used as a path: a name such as "app.py" or "audio.flac" would
            # overwrite files in the working directory. Write the bytes to a
            # temporary file instead, keeping only the extension.
            suffix = os.path.splitext(uploaded_file.name)[1]
            with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
                tmp.write(uploaded_file.getvalue())
                image_path = tmp.name

            try:
                # Spinner shown while the models are running.
                with st.spinner(text="Model Inference..."):
                    # Model inference.
                    scenario = img_to_text(image_path)
                    story = generate_story(scenario=scenario, personality=personality)
                    text_to_speech(story)
            finally:
                # Remove the temporary image whether or not inference succeeded.
                os.remove(image_path)

            # Display the scenario and story.
            with st.expander("Scenario"):
                st.write(scenario)
            with st.expander("Story"):
                st.write(story)

            # Display the audio.
            st.audio("audio.flac")
    except Exception as e:
        # Display any errors.
        st.error(e)
#############################################################################################################################
# Build the page only when this file is run as the main script, not when it is imported.
if __name__ == "__main__":
    main()