Spaces:

Golali
/

dreamsteam

Build error

File size: 8,648 Bytes
# Module setup: imports plus the small amount of global configuration the app
# needs (Whisper model choices, matplotlib backend, OpenAI credentials).
import base64
import json
import random
import whisper
import gradio as gr 
# Whisper checkpoint names offered in the "Whisper Model" dropdown of the UI.
WhisperModels = ['tiny', 'base', 'small', 'medium', 'large']
import matplotlib.pyplot as plt
import matplotlib
import requests
# Switch matplotlib to the 'AGG' backend.
# NOTE(review): presumably chosen because the app runs on a server without a
# display -- confirm.
matplotlib.use('AGG')
import io
from PIL import Image
import PIL
from io import BytesIO
import openai
import os
# OpenAI credentials are read from the environment variables 'organization'
# and 'api_key'; os.getenv yields None when a variable is not set.
openai.organization = os.getenv('organization')
openai.api_key = os.getenv('api_key')

def get_story(dream):
    """Ask the OpenAI completion endpoint to turn a dream into a sectioned story.

    The prompt asks the model for a JSON array of 4 sections, each carrying
    the keys ``section``, ``story`` and ``alt_text``.

    Args:
        dream: The dream description; it is appended to the end of the prompt.

    Returns:
        The raw text of the first completion choice. It is not parsed here.
    """
    prompt = f"m going to tell you of my dream and i want you to make a better more and more detailed story out of in one json array so i can create a booklet with image generation. Can you split it into 4 sections and give it 3 keys: section= nr of section, story= containing the story, alt_text= the alt text(make sure that the alt text is overall consistent and map each person in it to a known movie character):{dream}"

    # Sampling settings are collected in one place and forwarded unchanged.
    completion_settings = {
        "model": "text-davinci-003",
        "prompt": prompt,
        "temperature": 0.7,
        "max_tokens": 2048,
        "top_p": 1,
        "frequency_penalty": 0,
        "presence_penalty": 0,
    }
    completion = openai.Completion.create(**completion_settings)

    first_choice = completion["choices"][0]
    return first_choice["text"]

def get_image(text):
    """Generate one image for ``text`` with the Stability AI REST API.

    The prompt sent to the API is a fixed style prefix followed by ``text``.
    The returned base64 artifact is decoded and written to a PNG file in the
    current working directory.

    Args:
        text: Description of the scene to draw (appended to the style prefix).

    Returns:
        The relative file name of the PNG that was written.

    Raises:
        Exception: If the ``stability_key`` environment variable is not set,
            the API answers with a non-200 status, or the response carries no
            image artifact.
        requests.exceptions.RequestException: On network failure or timeout.
    """
    # Local import so this function brings its own dependency into scope.
    import uuid

    engine_id = "stable-diffusion-xl-beta-v2-2-2"
    api_host = "https://api.stability.ai"
    stability_key = os.getenv('stability_key')

    if stability_key is None:
        raise Exception("Missing Stability API key.")

    response = requests.post(
        f"{api_host}/v1/generation/{engine_id}/text-to-image",
        headers={
            "Content-Type": "application/json",
            "Accept": "application/json",
            "Authorization": f"Bearer {stability_key}"
        },
        json={
            "text_prompts": [
                {
                    "text": f"animated surreal with colors and creepy faces everything detailed, {text}"
                }
            ],
            "cfg_scale": 25,
            "clip_guidance_preset": "FAST_BLUE",
            "height": 512,
            "width": 512,
            "samples": 1,
            "steps": 50,
            "seed": 4294967295,
        },
        # Without a timeout a stalled connection would block the UI forever.
        timeout=120,
    )

    if response.status_code != 200:
        raise Exception("Non-200 response: " + str(response.text))

    data = response.json()

    try:
        encoded_image = data["artifacts"][0]["base64"]
    except (KeyError, IndexError, TypeError) as err:
        raise Exception("Stability API response contained no image artifact.") from err

    # A UUID-based name cannot collide with earlier images; the previous
    # random number in 0..1000 silently overwrote older files on a repeat.
    file_name = f"{uuid.uuid4().hex}.png"

    with open(file_name, "wb") as f:
        f.write(base64.b64decode(encoded_image))

    return file_name

def get_array(dream):
    """Extract and parse the first JSON array embedded in ``dream``.

    The model response may wrap the JSON array in free text. Parsing starts
    at a ``[`` and stops at the end of the JSON value, so trailing prose after
    the array is ignored. If the text at one ``[`` is not valid JSON, the next
    ``[`` is tried.

    Args:
        dream: Text that contains a JSON array somewhere inside it.

    Returns:
        The decoded Python object (a list for a JSON array).

    Raises:
        ValueError: If no parsable JSON array is found in ``dream``.
    """
    decoder = json.JSONDecoder()

    start = dream.find("[")
    while start != -1:
        try:
            # raw_decode parses one JSON value beginning at `start` and
            # reports where it ended, so text after the array does not matter.
            parsed, _end = decoder.raw_decode(dream, start)
        except json.JSONDecodeError:
            start = dream.find("[", start + 1)
        else:
            return parsed

    raise ValueError("No JSON array found in the text.")

def SpeechToText(audio, SelectedModel):
    """Turn a recorded dream into four illustrated story sections.

    Steps: transcribe the recording with Whisper, expand the transcript into
    a sectioned story with ``get_story``, parse it with ``get_array``, then
    create one image per section with ``get_image``.

    Args:
        audio: Path of the recorded audio file.
        SelectedModel: Name of the Whisper model to load (e.g. ``'base'``).

    Returns:
        A tuple of eight values: the four image file paths followed by the
        four story texts, in section order.

    Raises:
        ValueError: If no audio was provided, or the generated story does not
            contain four sections that each have ``alt_text`` and ``story``.
    """
    section_count = 4

    if not audio:
        # Fail early with a clear message when nothing was recorded.
        raise ValueError("No audio was provided; please record your dream first.")

    print('Loading model...')
    model = whisper.load_model(SelectedModel)

    print('Loading audio...')
    waveform = whisper.load_audio(audio)
    # NOTE(review): pad_or_trim appears to fit the audio to Whisper's fixed
    # 30-second window, so longer recordings would be truncated -- confirm.
    waveform = whisper.pad_or_trim(waveform)

    print('Creating log-mel spectrogram...')
    mel = whisper.log_mel_spectrogram(waveform).to(model.device)

    print('Detecting language...')
    _, probs = model.detect_language(mel)
    # Report the most probable language; it was previously computed but unused.
    print(f"Language: {max(probs, key=probs.get)}")

    print('Decoding audio to text...')
    options = whisper.DecodingOptions(fp16=False)
    result = whisper.decode(model, mel, options)

    story_text = get_story(result.text)
    print("Text: " + story_text)
    sections = get_array(story_text)
    print(type(sections))

    # Validate the model output before spending any image-generation calls.
    if not isinstance(sections, list) or len(sections) < section_count:
        raise ValueError(
            f"Expected at least {section_count} story sections in the generated story."
        )
    sections = sections[:section_count]
    for position, section in enumerate(sections, start=1):
        if not isinstance(section, dict) or "alt_text" not in section or "story" not in section:
            raise ValueError(
                f"Story section {position} is missing 'alt_text' or 'story'."
            )

    images = []
    stories = []
    for section in sections:
        images.append(get_image(section["alt_text"]))
        stories.append(section["story"])
        print('image added')

    return (*images, *stories)

def clean_text(text):
    """Lower-case a sentence and strip basic punctuation from it.

    Commas, dots, question marks and hyphens are replaced by spaces before
    the text is split on whitespace. A standalone ``i`` is written back as
    ``I``. Other punctuation (for example ``!``) is left untouched.

    Args:
        text: The sentence to clean.

    Returns:
        A pair ``(words, sentence)``: the list of cleaned words and the same
        words joined by single spaces.
    """
    print("cleaning text: ", text)

    # One translation table covers every punctuation mark that becomes a space.
    punctuation_to_space = str.maketrans({",": " ", ".": " ", "?": " ", "-": " "})
    normalized = text.lower().translate(punctuation_to_space)

    words = ["I" if word == "i" else word for word in normalized.split()]
    return words, ' '.join(words)

import nltk

# Download the NLTK resources at import time, in a fixed order.
# NOTE(review): presumably these back word_tokenize, pos_tag and
# WordNetLemmatizer imported below -- confirm.
for _resource in ('punkt', 'averaged_perceptron_tagger', 'wordnet', 'omw-1.4'):
    nltk.download(_resource)

# Also let NLTK look for its data under /root/nltk_data.
nltk.data.path.append('/root/nltk_data')

from nltk import pos_tag, word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer

class POS_tagging():
    """Reduce a sentence to its key words using part-of-speech tags.

    Words tagged ``DT``, ``IN`` or ``TO`` are dropped, conjugated verbs are
    replaced by their lemma, and any verb whose lemma is ``be`` is dropped.
    """

    def __init__(self, concatString):
        """Store the sentence to analyse.

        Args:
            concatString: The sentence as a single string.
        """
        self.concatString = concatString

    def handle_conjugation(self, tags):
        """Filter and lemmatise tagged words.

        Args:
            tags: Sequence of ``(word, tag)`` entries.

        Returns:
            List of the words that were kept, with verbs lemmatised.
        """
        verb_tags = ('VBP', 'VBG', 'VBD', 'VBN', 'VBZ')
        dropped_tags = ('DT', 'IN', 'TO')

        kept_words = []
        for entry in tags:
            word, tag = entry[0], entry[1]
            if tag in verb_tags:
                # Conjugated verb: keep its base form unless it is "be".
                lemma = WordNetLemmatizer().lemmatize(word, 'v')
                if lemma != "be":
                    kept_words.append(lemma)
            elif tag not in dropped_tags:
                kept_words.append(word)
        return kept_words

    def make_predictions(self):
        """Tokenise and tag the stored sentence, then filter it.

        Returns:
            The result of ``handle_conjugation`` on the tagged sentence.
        """
        tokens = word_tokenize(self.concatString)
        tagged_tokens = pos_tag(tokens)
        return self.handle_conjugation(tagged_tokens)

def generate_pic(text_to_search, ax):
    """Draw a picture for ``text_to_search`` on the matplotlib axes ``ax``.

    The ARASAAC pictogram API is searched first (ref:
    https://arasaac.org/developers/api). If it returns no result, an image is
    requested from the OpenAI image endpoint instead, and the title is
    wrapped in slashes to mark it as generated. If that fallback fails, the
    axes title is set to ``"Error!"``. Both axis rulers are hidden in every
    case.

    Args:
        text_to_search: The word or phrase to illustrate.
        ax: The matplotlib axes to draw on.
    """
    # Local import so this function brings its own dependency into scope.
    from urllib.parse import quote

    request_timeout = 30  # seconds; avoids hanging forever on a stalled host

    # Percent-encode the term so spaces, "/" or "?" cannot alter the URL path.
    encoded_term = quote(str(text_to_search), safe="")
    search_url = f"https://api.arasaac.org/api/pictograms/en/bestsearch/{encoded_term}"
    search_response = requests.get(search_url, timeout=request_timeout)
    try:
        search_json = search_response.json()
    except ValueError:
        # A non-JSON body is treated the same as "no pictogram found".
        search_json = None

    # Only a non-empty list can be indexed with [0] below; anything else
    # (e.g. an error object) is routed to the fallback.
    if isinstance(search_json, list) and search_json:
        pic_url = f"https://api.arasaac.org/api/pictograms/{search_json[0]['_id']}?download=false"
        pic_response = requests.get(pic_url, timeout=request_timeout)
        img = Image.open(BytesIO(pic_response.content))
        ax.imshow(img)
        ax.set_title(text_to_search)
    else:
        try:
            response = openai.Image.create(
                prompt=text_to_search,
                n=1,  # only the first image is used, so request just one
                size="512x512"
            )
            image_url = response['data'][0]['url']
            image_response = requests.get(image_url, timeout=request_timeout)
            img = Image.open(BytesIO(image_response.content))
            ax.imshow(img)
            ax.set_title(f"/{text_to_search}/")
        except Exception as err:
            # Best effort: keep the plot usable, but report why it failed
            # instead of swallowing the error silently.
            print(f"Image generation failed for {text_to_search!r}: {err}")
            ax.set_title("Error!")
    ax.axes.xaxis.set_visible(False)
    ax.axes.yaxis.set_visible(False)

# Gradio user interface: record a dream, pick a Whisper model, press the
# button, and view four images with their matching story texts.
with gr.Blocks(title="The Dream Steamer") as demo:
    gr.Markdown("# The Dream Steamer")
    gr.Markdown("This Application transforms your dreams into really cool pictures and makes it a more memorable experience.")
    gr.Markdown("With this application you can save your dreams and share them with your friends and family.")
    with gr.Row():
            # Microphone recording, configured with type="filepath".
            audio = gr.Audio(label="Record your dream here",source="microphone", type="filepath")
    with gr.Row():    
            # Whisper model selector; defaults to 'base'.
            dropdown = gr.Dropdown(label="Whisper Model", choices=WhisperModels, value='base')
    with gr.Row():    
            btn1 = gr.Button("Show me my dream!")
    with gr.Column():
            # Two rows, each holding two image/text pairs (sections 1-2, 3-4).
            with gr.Row():
                image1 = gr.Image(label="1", shape=(200,200))
                text1= gr.Text(label="1")
                image2 = gr.Image(label="2", shape=(200,200))
                text2= gr.Text(label="2")
            with gr.Row():
                image3 = gr.Image(label="3", shape=(200,200))
                text3= gr.Text(label="3")
                image4 = gr.Image(label="4", shape=(200,200))
                text4= gr.Text(label="4")

            # The outputs list follows the order of SpeechToText's return
            # value: the four images first, then the four texts.
            btn1.click(SpeechToText, inputs=[audio, dropdown], outputs=[image1, image2, image3, image4, text1, text2, text3, text4])


    gr.Markdown("Made by the Dreamers [Alireza](https://github.com/golali) [Erfan](https://github.com/golchini) and [Omidreza](https://github.com/omidreza-amrollahi)")

# Start the app as soon as this module is executed.
demo.launch()