File size: 2,256 Bytes
d7ef93f
d1b5c08
3c37a29
 
 
 
d1b5c08
 
d7ef93f
d1b5c08
3c37a29
 
d1b5c08
 
 
 
 
 
 
3c37a29
 
 
 
 
 
 
 
 
 
 
 
 
 
d1b5c08
3c37a29
 
 
d1b5c08
 
 
 
 
3c37a29
 
d1b5c08
3c37a29
 
 
 
d1b5c08
3c37a29
 
 
d1b5c08
 
 
3c37a29
d1b5c08
 
 
 
 
 
 
 
 
3c37a29
d1b5c08
 
 
 
 
 
 
3c37a29
 
d1b5c08
6f34f52
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
from dotenv import find_dotenv, load_dotenv
from transformers import pipeline
from transformers import AutoProcessor, AutoModel
from langchain import PromptTemplate, LLMChain
from langchain.llms import GooglePalm
import scipy
import streamlit as st

# Locate a .env file and load its variables into the process environment at
# import time, before any of the functions below run.
# NOTE(review): presumably this supplies the API key GooglePalm needs -- confirm.
load_dotenv(find_dotenv())

# img2text
def img_2_text(url):
    """Caption an image with the BLIP large image-captioning model.

    Args:
        url: Image reference handed unchanged to the Hugging Face
            ``image-to-text`` pipeline.

    Returns:
        The ``generated_text`` entry of the first result the pipeline returns.
    """
    # The pipeline is constructed on every call, exactly as before.
    captioner = pipeline(
        "image-to-text",
        model="Salesforce/blip-image-captioning-large",
    )
    captions = captioner(url)
    first_caption = captions[0]
    return first_caption["generated_text"]


# llm
def generate_story(scenario):
    """Turn a short scenario description into a story via Google PaLM.

    Args:
        scenario: Text substituted for ``{scenario}`` in the prompt template.

    Returns:
        The text predicted by the LLM chain for the filled-in prompt.
    """
    # The literal must open with exactly three quotes; a fourth one becomes a
    # stray '"' character at the very start of every prompt sent to the model.
    template = """
        You are a story teller;
        you can generate a creative fun story based on a sample narrative, the story should not be more than 100 words;
        CONTEXT: {scenario}
        STORY: 
        """

    prompt = PromptTemplate(
        template=template,
        input_variables=["scenario"],
    )
    llm = GooglePalm(temperature=0.7)

    # verbose=True makes the chain log the rendered prompt while it runs.
    story_llm = LLMChain(llm=llm, prompt=prompt, verbose=True)

    return story_llm.predict(scenario=scenario)


# text-to-speech
def text_to_speech(text):
    """Synthesize speech for *text* with ``suno/bark-small`` and save it.

    The audio is written to ``audio.wav`` in the current working directory,
    replacing any existing file of that name.

    Args:
        text: The text to be spoken.

    Returns:
        None. The result is the ``audio.wav`` file on disk.
    """
    # Import the submodule explicitly: a bare ``import scipy`` does not
    # guarantee that ``scipy.io.wavfile`` is loaded on older SciPy releases,
    # which makes ``scipy.io.wavfile.write`` fail with AttributeError.
    from scipy.io import wavfile

    processor = AutoProcessor.from_pretrained("suno/bark-small")
    model = AutoModel.from_pretrained("suno/bark-small")

    inputs = processor(
        text=[text],
        return_tensors="pt",
    )

    speech_values = model.generate(**inputs, do_sample=True)
    sampling_rate = model.generation_config.sample_rate

    # Move the generated tensor to the CPU and drop singleton dimensions
    # before handing it to the WAV writer.
    waveform = speech_values.cpu().numpy().squeeze()
    wavfile.write("audio.wav", rate=sampling_rate, data=waveform)


def main():
    """Render the Streamlit page: image upload -> caption -> story -> audio.

    When a JPG is uploaded, it is captioned with ``img_2_text``, the caption
    is expanded into a story with ``generate_story``, and the story is voiced
    with ``text_to_speech``. The caption, story and audio are then displayed.
    """
    import os
    import tempfile

    st.set_page_config(page_title="img 2 audio story")
    st.header("turn image to audio story")
    uploaded_file = st.file_uploader("Choose an image ... ", type="jpg")

    if uploaded_file is None:
        return

    st.image(uploaded_file, caption="Uploaded image", use_column_width=True)

    # Store the upload in a private temp file instead of the client-supplied
    # name in the working directory: that could overwrite an existing file
    # and left one stray copy on disk per upload.
    fd, image_path = tempfile.mkstemp(suffix=".jpg")
    try:
        with os.fdopen(fd, "wb") as file:
            file.write(uploaded_file.getvalue())
        text = img_2_text(image_path)
    finally:
        os.remove(image_path)

    story = generate_story(text)
    text_to_speech(story)

    with st.expander("text"):
        st.write(text)
    with st.expander("story"):
        st.write(story)
    # text_to_speech writes its output to this fixed path.
    st.audio("audio.wav")


# Guard the entry point so importing this module (e.g. to reuse the helper
# functions) does not start the UI. Running the file as a script still
# reaches main().
if __name__ == "__main__":
    main()