Spaces:

GeorgiosIoannouCoder
/

cuny-tech-prep-tutorial-1

Running

App Files Files Community

cuny-tech-prep-tutorial-1 / app.py

GeorgiosIoannouCoder

Update app.py

b04be94 verified about 1 year ago

raw

history blame

6.48 kB

	#############################################################################################################################
	# Filename : app.py
	# Description: A Streamlit application that turns an image into an audio story.
	# Author : Georgios Ioannou
	#
	# Copyright © 2024 by Georgios Ioannou
	#############################################################################################################################
	# Import libraries.


	import os # Load environment variable(s).
	import requests # Send HTTP POST requests to the Hugging Face Inference API.
	import streamlit as st # Build the GUI of the application.

	from langchain.chat_models import ChatOpenAI # Access to OpenAI gpt-3.5-turbo model.
	from langchain.chains import LLMChain # Chain to run queries against LLMs.
	# A prompt template. It accepts a set of parameters from the user that can be used to generate a prompt for a language model.
	from langchain.prompts import PromptTemplate
	from transformers import pipeline # Access to Hugging Face models.


	#############################################################################################################################
	# Load environment variable(s).

	# Hugging Face API token read from the environment; None when the variable is unset.
	HUGGINGFACEHUB_API_TOKEN = os.environ.get("HUGGINGFACEHUB_API_TOKEN")


	#############################################################################################################################
	# Function to apply local CSS.


	def local_css(file_name):
	    """Inject the contents of a local CSS file into the Streamlit page.

	    The file is read in full and emitted through ``st.markdown`` wrapped in a
	    ``<style>`` tag, with ``unsafe_allow_html=True`` so the tag is not escaped.

	    Args:
	        file_name: Path of the CSS file to read.

	    Raises:
	        OSError: If the file cannot be opened or read.
	        UnicodeDecodeError: If the file is not valid UTF-8.
	    """
	    # Decode as UTF-8 explicitly so the result does not depend on the
	    # platform's default locale encoding.
	    with open(file_name, encoding="utf-8") as f:
	        st.markdown(f"<style>{f.read()}</style>", unsafe_allow_html=True)


	#############################################################################################################################
	# Return the text generated by the model for the image.
	# Using pipeline.


	def img_to_text(image_path):
	    """Caption an image with a Hugging Face "image-to-text" pipeline.

	    Args:
	        image_path: Image input handed straight to the pipeline (the caller in
	            this file passes a path to a file on disk).

	    Returns:
	        The "generated_text" value of the first result the pipeline returns.
	    """
	    # Task catalogue: https://huggingface.co/tasks
	    # Task : "image-to-text".
	    # Model: "Salesforce/blip-image-captioning-base".
	    # Alternative model: "nlpconnect/vit-gpt2-image-captioning".
	    # Note that a new pipeline object is built on every call.
	    captioner = pipeline(
	        "image-to-text",
	        model="Salesforce/blip-image-captioning-base",
	    )

	    predictions = captioner(image_path)
	    first_prediction = predictions[0]
	    return first_prediction["generated_text"]


	#############################################################################################################################
	# Return the story generated by the model for the scenario.
	# Using Langchain.


	def generate_story(scenario, personality):
	    """Write a short story about a scenario in the voice of a personality.

	    Builds a prompt from a fixed template and runs it through an ``LLMChain``
	    backed by the OpenAI "gpt-3.5-turbo" chat model.

	    Args:
	        scenario: Text substituted for ``{scenario}`` in the prompt.
	        personality: Text substituted for ``{personality}`` in the prompt.

	    Returns:
	        Whatever ``LLMChain.predict`` returns for the formatted prompt.
	    """
	    # The wording below is the place to customize the output, e.g. to ask
	    # for song lyrics instead of a story.
	    template = """
	You are a story teller.
	You must sound like {personality}.
	The story should be less than 50 words.
	Generate a story based on the above constraints and the following scenario: {scenario}.
	"""

	    story_prompt = PromptTemplate(
	        input_variables=["scenario", "personality"],
	        template=template,
	    )

	    # Per the original author's note: raising the temperature makes the model
	    # more creative and makes inference take longer.
	    chat_model = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

	    # verbose=True prints intermediate values to the console.
	    chain = LLMChain(llm=chat_model, prompt=story_prompt, verbose=True)

	    # predict() formats the prompt with the keyword arguments and passes it
	    # to the LLM.
	    return chain.predict(scenario=scenario, personality=personality)


	#############################################################################################################################
	# Return the speech generated by the model for the story.
	# Using inference api.


	def text_to_speech(story):
	    """Synthesize speech for a story and save it to "audio.flac".

	    Posts the story to the Hugging Face Inference API and writes the raw
	    response body to "audio.flac" in the current working directory.

	    Args:
	        story: Text to synthesize; sent as the "inputs" field of the JSON
	            payload.

	    Raises:
	        requests.HTTPError: If the API answers with a 4xx/5xx status.
	        requests.Timeout: If the API does not respond within the timeout.
	        OSError: If "audio.flac" cannot be written.
	    """
	    # Model used here: "espnet/kan-bayashi_ljspeech_vits".
	    # Backup model: "facebook/mms-tts-eng".
	    API_URL = (
	        "https://api-inference.huggingface.co/models/espnet/kan-bayashi_ljspeech_vits"
	    )
	    # Backup endpoint: "https://api-inference.huggingface.co/models/facebook/mms-tts-eng"

	    headers = {"Authorization": f"Bearer {HUGGINGFACEHUB_API_TOKEN}"}
	    payload = {"inputs": story}

	    # Without a timeout, requests waits indefinitely on a stalled connection.
	    # NOTE(review): 60 seconds is a guess at a generous bound - tune as needed.
	    request_timeout_seconds = 60

	    response = requests.post(
	        API_URL,
	        headers=headers,
	        json=payload,
	        timeout=request_timeout_seconds,
	    )

	    # Fail loudly on an error status; otherwise the error body returned by
	    # the API would be saved to disk as if it were audio data.
	    response.raise_for_status()

	    with open("audio.flac", "wb") as file:
	        file.write(response.content)


	#############################################################################################################################
	# Main function to create the Streamlit web application.


	def main():
	    """Build the Streamlit page and run the image -> story -> audio flow.

	    The page shows a title, a personality dropdown and an image uploader.
	    Once an image is uploaded it is captioned (``img_to_text``), turned into
	    a story (``generate_story``) and converted to speech (``text_to_speech``);
	    the caption, the story and the resulting "audio.flac" are then displayed.

	    Any exception raised along the way is shown on the page with ``st.error``
	    instead of propagating.
	    """
	    # Local import: only this function needs tempfile.
	    import tempfile

	    try:
	        # Page title and favicon.
	        st.set_page_config(page_title="Image To Audio Story", page_icon="🖼️")

	        # Load CSS.
	        local_css("styles/style.css")

	        # Title.
	        title = """<h1 align="center" style="font-family: monospace; font-size: 2.1rem; margin-top: -6rem">
	Turn Image to Audio Story</h1>"""
	        st.markdown(title, unsafe_allow_html=True)

	        # Define the personalities for the dropdown menu.
	        personalities = [
	            "Donald Trump",
	            "Abraham Lincoln",
	            "Aristotle",
	            "Cardi B",
	            "Kanye West",
	        ]
	        personality = st.selectbox("Select a personality:", personalities)

	        # Upload an image.
	        uploaded_file = st.file_uploader("Choose an image:")

	        if uploaded_file is not None:
	            # Display the uploaded image.
	            st.image(uploaded_file, caption="Uploaded Image.", use_column_width=True)

	            # Save the upload to a private temporary file instead of a file
	            # named by the client in the working directory: a client-chosen
	            # name such as "app.py" or "audio.flac" would otherwise overwrite
	            # application files. Only the extension of the name is kept.
	            suffix = os.path.splitext(os.path.basename(uploaded_file.name))[1]
	            with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
	                tmp.write(uploaded_file.getvalue())
	                image_path = tmp.name

	            try:
	                # Spinner to keep the application interactive.
	                with st.spinner(text="Model Inference..."):
	                    # Model inference.
	                    scenario = img_to_text(image_path)
	                    story = generate_story(scenario=scenario, personality=personality)
	                    text_to_speech(story)
	            finally:
	                # The image is only needed for captioning; always clean it up.
	                os.remove(image_path)

	            # Display the scenario and story.
	            with st.expander("Scenario"):
	                st.write(scenario)
	            with st.expander("Story"):
	                st.write(story)

	            # Display the audio written by text_to_speech().
	            st.audio("audio.flac")
	    except Exception as e:
	        # Top-level boundary of the app: display any errors on the page.
	        st.error(e)


	#############################################################################################################################


	# Start the app only when this file is executed as a script, not on import.
	if __name__ == "__main__":
	    main()