Spaces:

matanmichaely
/

image_to_audio_story

Running

File size: 2,256 Bytes

d7ef93f
d1b5c08
3c37a29
 
 
 
d1b5c08
 
d7ef93f
d1b5c08
3c37a29
 
d1b5c08
 
 
 
 
 
 
3c37a29
 
 
 
 
 
 
 
 
 
 
 
 
 
d1b5c08
3c37a29
 
 
d1b5c08
 
 
 
 
3c37a29
 
d1b5c08
3c37a29
 
 
 
d1b5c08
3c37a29
 
 
d1b5c08
 
 
3c37a29
d1b5c08
 
 
 
 
 
 
 
 
3c37a29
d1b5c08
 
 
 
 
 
 
3c37a29
 
d1b5c08
6f34f52

from dotenv import find_dotenv, load_dotenv
from transformers import pipeline
from transformers import AutoProcessor, AutoModel
from langchain import PromptTemplate, LLMChain
from langchain.llms import GooglePalm
import scipy
import streamlit as st

# Locate a .env file with find_dotenv() and load its variables into the
# process environment before any of the functions below run.
# NOTE(review): presumably this supplies the API credentials GooglePalm
# reads from the environment — confirm against the deployment config.
load_dotenv(find_dotenv())

# img2text
def img_2_text(url):
    """Caption an image with the BLIP image-captioning pipeline.

    Args:
        url: Image reference passed straight to the pipeline (the caller in
            this file passes a local file path).

    Returns:
        The ``generated_text`` field of the first pipeline result.
    """
    captioner = pipeline(
        "image-to-text",
        model="Salesforce/blip-image-captioning-large",
    )
    predictions = captioner(url)
    return predictions[0]["generated_text"]


# llm
def generate_story(scenario):
    """Generate a short story from a scenario using the Google PaLM LLM.

    Args:
        scenario: Text substituted into the ``{scenario}`` slot of the prompt
            (the caller in this file passes the image caption).

    Returns:
        The text predicted by the LLM chain for the filled-in prompt.
    """
    # The literal must open with exactly three quotes: a fourth quote would
    # become a stray '"' character at the very start of the prompt.
    template = """
        You are a story teller;
        you can generate a creative fun story based on a sample narrative, the story should not be more than 100 words;
        CONTEXT: {scenario}
        STORY: 
        """

    prompt = PromptTemplate(template=template, input_variables=['scenario'])
    llm = GooglePalm(temperature=0.7)

    # verbose=True makes the chain log the rendered prompt while it runs.
    story_llm = LLMChain(llm=llm, prompt=prompt, verbose=True)

    return story_llm.predict(scenario=scenario)


#
# text-to-speech
def text_to_speech(text):
    """Synthesize *text* with the ``suno/bark-small`` model into ``audio.wav``.

    The waveform is written to ``audio.wav`` relative to the current working
    directory, overwriting any existing file of that name.

    Args:
        text: The text to turn into speech.

    Returns:
        None. The result is the file written to disk.
    """
    # Import the submodule explicitly: a bare ``import scipy`` does not
    # guarantee that ``scipy.io.wavfile`` is available as an attribute on
    # every SciPy release.
    from scipy.io import wavfile

    processor = AutoProcessor.from_pretrained("suno/bark-small")
    model = AutoModel.from_pretrained("suno/bark-small")

    inputs = processor(
        text=[text],
        return_tensors="pt",
    )

    speech_values = model.generate(**inputs, do_sample=True)
    sampling_rate = model.generation_config.sample_rate

    # Move the tensor to the CPU and drop singleton dimensions before writing.
    waveform = speech_values.cpu().numpy().squeeze()
    wavfile.write("audio.wav", rate=sampling_rate, data=waveform)


def main():
    """Render the Streamlit page and run the image -> story -> audio flow.

    Shows a JPG uploader. Once a file is uploaded it is saved to the working
    directory, captioned with ``img_2_text``, expanded into a story with
    ``generate_story``, narrated with ``text_to_speech``, and the caption,
    story and resulting ``audio.wav`` are displayed on the page.
    """
    # Local import so this block does not depend on a file-level import.
    from pathlib import Path

    st.set_page_config(page_title="img 2 audio story")
    st.header("turn image to audio story")
    uploaded_file = st.file_uploader("Choose an image ... ", type="jpg")

    # Nothing to do until the user has picked a file.
    if uploaded_file is None:
        return

    # The filename comes from the client; keep only its final component so a
    # name containing directory parts cannot write outside the working dir.
    image_path = Path(uploaded_file.name).name
    with open(image_path, "wb") as file:
        file.write(uploaded_file.getvalue())

    st.image(uploaded_file, caption="Uploaded image", use_column_width=True)
    text = img_2_text(image_path)
    story = generate_story(text)
    text_to_speech(story)

    with st.expander("text"):
        st.write(text)
    with st.expander("story"):
        st.write(story)
    # text_to_speech() wrote its output to this fixed filename.
    st.audio("audio.wav")


# Start the app only when this file is executed as a script, so importing the
# module (e.g. to reuse its functions) does not launch the UI.
if __name__ == "__main__":
    main()