Spaces:

matanmichaely
/

image_to_audio_story

Running

App Files Files Community

image_to_audio_story / app.py

matanmichaely

Update app.py

3c37a29 almost 2 years ago

raw

history blame contribute delete

2.26 kB

	from dotenv import find_dotenv, load_dotenv
	from transformers import pipeline
	from transformers import AutoProcessor, AutoModel
	from langchain import PromptTemplate, LLMChain
	from langchain.llms import GooglePalm
	import scipy
	import streamlit as st

	# Load environment variables from whatever .env file find_dotenv() locates.
	# NOTE(review): presumably this supplies the API key GooglePalm needs in
	# generate_story -- confirm against the deployment's secrets.
	load_dotenv(find_dotenv())

	# img2text
	def img_2_text(url):
		"""Return a generated caption for an image.

		Args:
			url: Image reference handed straight to the "image-to-text"
				pipeline. Despite the name, ``main`` in this file passes a
				local file path.

		Returns:
			The ``"generated_text"`` field of the first pipeline result.
		"""
		# The captioning pipeline is built on every call.
		captioner = pipeline(
			"image-to-text",
			model="Salesforce/blip-image-captioning-large",
		)
		results = captioner(url)
		first_result = results[0]
		return first_result["generated_text"]


	# llm
	# Prompt sent to the LLM; ``{scenario}`` is filled in by PromptTemplate.
	# The literal must open with exactly three quotes: a fourth one becomes a
	# stray '"' at the very start of every prompt.
	STORY_PROMPT_TEMPLATE = """
	You are a story teller;
	you can generate a creative fun story based on a sample narrative, the story should not be more than 100 words;
	CONTEXT: {scenario}
	STORY:
	"""


	def generate_story(scenario):
		"""Generate a short story from a scenario description.

		Args:
			scenario: Text substituted for ``{scenario}`` in
				``STORY_PROMPT_TEMPLATE`` (in this file, the image caption).

		Returns:
			The text returned by ``LLMChain.predict`` for the filled prompt.
		"""
		prompt = PromptTemplate(
			template=STORY_PROMPT_TEMPLATE,
			input_variables=['scenario'],
		)
		llm = GooglePalm(temperature=0.7)

		# verbose=True is kept so the chain keeps logging the rendered prompt.
		story_llm = LLMChain(llm=llm, prompt=prompt, verbose=True)

		return story_llm.predict(scenario=scenario)


	#
	# text-to-speech
	def text_to_speech(text, output_path="audio.wav"):
		"""Synthesize speech for ``text`` with Bark and write it as a WAV file.

		Args:
			text: The text to speak; passed to the processor as a one-item list.
			output_path: Destination of the WAV file. Defaults to
				``"audio.wav"``, the path ``main`` plays back.

		Returns:
			None. The only result is the file written to ``output_path``.
		"""
		# Import the submodule explicitly: a bare ``import scipy`` does not
		# guarantee that ``scipy.io.wavfile`` is loaded on every SciPy version.
		from scipy.io import wavfile

		# Processor and model are loaded on every call.
		processor = AutoProcessor.from_pretrained("suno/bark-small")
		model = AutoModel.from_pretrained("suno/bark-small")

		inputs = processor(
			text=[text],
			return_tensors="pt",
		)

		speech_values = model.generate(**inputs, do_sample=True)
		sampling_rate = model.generation_config.sample_rate

		# Move to CPU and drop singleton dimensions before writing.
		waveform = speech_values.cpu().numpy().squeeze()
		wavfile.write(output_path, rate=sampling_rate, data=waveform)


	def sanitize_upload_name(filename, fallback="uploaded_image.jpg"):
		"""Reduce a client-supplied filename to a bare, safe file name.

		Args:
			filename: The name reported by the browser for the uploaded file.
			fallback: Name used when nothing usable remains.

		Returns:
			The final path component of ``filename`` (both ``/`` and ``\\`` are
			treated as separators), or ``fallback`` if that component is empty,
			``"."`` or ``".."``.
		"""
		from pathlib import PurePosixPath

		# Normalize Windows separators so "C:\\dir\\x.jpg" also collapses to "x.jpg".
		base_name = PurePosixPath(filename.replace("\\", "/")).name
		if base_name in ("", ".", ".."):
			return fallback
		return base_name


	def main():
		"""Run the Streamlit page: upload an image, caption it, narrate a story.

		The uploaded image is saved to the working directory, captioned with
		``img_2_text``, expanded into a story with ``generate_story`` and turned
		into ``audio.wav`` by ``text_to_speech``; the caption, story and audio
		are then shown on the page.
		"""
		st.set_page_config(page_title="img 2 audio story")
		st.header("turn image to audio story")
		uploaded_file = st.file_uploader("Choose an image ... ", type="jpg")

		if uploaded_file is not None:
			print(uploaded_file)
			bytes_data = uploaded_file.getvalue()

			# The upload name comes from the client; never use it as a path
			# directly, or "../" components could write outside this directory.
			image_path = sanitize_upload_name(uploaded_file.name)
			with open(image_path, "wb") as file:
				file.write(bytes_data)

			st.image(uploaded_file, caption="Uploaded image", use_column_width=True)
			text = img_2_text(image_path)
			story = generate_story(text)
			text_to_speech(story)

			with st.expander("text"):
				st.write(text)
			with st.expander("story"):
				st.write(story)
			st.audio("audio.wav")


	# Guard the entry point so importing this module does not start the app.
	# `streamlit run` executes the script as "__main__", so the page still
	# renders there.
	if __name__ == "__main__":
		main()