import base64
import json
import os
import random
from io import BytesIO

import gradio as gr
import openai
import requests
import whisper
from PIL import Image

import matplotlib
matplotlib.use("AGG")  # headless backend; generate_pic() below draws onto matplotlib axes
import matplotlib.pyplot as plt

# Whisper checkpoints the user can pick from in the UI
WhisperModels = ['tiny', 'base', 'small', 'medium', 'large']

# OpenAI credentials are read from the environment
openai.organization = os.getenv('organization')
openai.api_key = os.getenv('api_key')


def get_story(dream):
    """Ask the model to expand the transcribed dream into a four-section story."""
    response = openai.Completion.create(
        model="text-davinci-003",
        prompt=(
            "I'm going to tell you my dream and I want you to turn it into a "
            "better, more detailed story, returned as one JSON array so I can "
            "create a booklet with image generation. Split it into 4 sections "
            "and give each one 3 keys: section = number of the section, "
            "story = the story text, alt_text = the alt text (make sure the "
            "alt text is consistent overall and map each person in it to a "
            f"known movie character): {dream}"
        ),
        temperature=0.7,
        max_tokens=2048,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
    )
    return response["choices"][0]["text"]
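
# For reference, get_story() is prompted to return text containing a JSON
# array of four objects shaped roughly as below; get_array() further down
# extracts and parses it. This sample is illustrative, not real model output:
#
# [
#   {"section": 1, "story": "...", "alt_text": "..."},
#   {"section": 2, "story": "...", "alt_text": "..."},
#   {"section": 3, "story": "...", "alt_text": "..."},
#   {"section": 4, "story": "...", "alt_text": "..."}
# ]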

def get_image(text):
    """Generate one image for a story section via the Stability text-to-image API."""
    engine_id = "stable-diffusion-xl-beta-v2-2-2"
    api_host = "https://api.stability.ai"
    stability_key = os.getenv('stability_key')
    if stability_key is None:
        raise Exception("Missing Stability API key.")
    response = requests.post(
        f"{api_host}/v1/generation/{engine_id}/text-to-image",
        headers={
            "Content-Type": "application/json",
            "Accept": "application/json",
            "Authorization": f"Bearer {stability_key}",
        },
        json={
            "text_prompts": [
                {
                    "text": f"animated surreal with colors and creepy faces everything detailed, {text}"
                }
            ],
            "cfg_scale": 25,
            "clip_guidance_preset": "FAST_BLUE",
            "height": 512,
            "width": 512,
            "samples": 1,
            "steps": 50,
            "seed": 4294967295,
        },
    )
    if response.status_code != 200:
        raise Exception("Non-200 response: " + str(response.text))
    data = response.json()
    # TODO: replace the random filename with a proper temporary file
    number = random.randint(0, 1000)
    with open(f"{number}.png", "wb") as f:
        f.write(base64.b64decode(data["artifacts"][0]["base64"]))
    return f"{number}.png"


def get_array(dream):
    """Extract the JSON array embedded in the completion text and parse it."""
    # The completion may contain prose around the array, so slice from the
    # first "[" to the last "]" before parsing.
    json_start_index = dream.find("[")
    json_end_index = dream.rfind("]") + 1
    json_string = dream[json_start_index:json_end_index]
    return json.loads(json_string)


def SpeechToText(audio, SelectedModel):
    """Full pipeline: transcribe the recording, expand it into a four-section
    story, and generate one image per section."""
    print('Loading model...')
    model = whisper.load_model(SelectedModel)
    print('Loading audio...')
    audio = whisper.load_audio(audio)
    audio = whisper.pad_or_trim(audio)  # Whisper decodes a fixed 30-second window
    print('Creating log-mel spectrogram...')
    mel = whisper.log_mel_spectrogram(audio).to(model.device)
    print('Detecting language...')
    _, probs = model.detect_language(mel)
    print(f"Language: {max(probs, key=probs.get)}")
    print('Decoding audio to text...')
    options = whisper.DecodingOptions(fp16=False)
    result = whisper.decode(model, mel, options)

    text = get_story(result.text)
    print("Text: " + text)
    sections = get_array(text)

    img1 = get_image(sections[0]["alt_text"])
    text1 = sections[0]["story"]
    print('image added')
    img2 = get_image(sections[1]["alt_text"])
    text2 = sections[1]["story"]
    print('image added')
    img3 = get_image(sections[2]["alt_text"])
    text3 = sections[2]["story"]
    print('image added')
    img4 = get_image(sections[3]["alt_text"])
    text4 = sections[3]["story"]
    print('image added')
    return img1, img2, img3, img4, text1, text2, text3, text4


def clean_text(text):
    """Strip punctuation and normalise a sentence.

    More characters (e.g. "!", "...") may need handling in the future.

    Args:
        text (str): the raw sentence.

    Returns:
        tuple[list[str], str]: the cleaned words and their joined string.
    """
    print("cleaning text: ", text)
    text = text.lower()
    text = text.replace(",", " ")
    text = text.replace(".", " ")
    text = text.replace("?", " ")
    text = text.replace("-", " ")
    words = text.split()
    new_string = []
    for temp in words:
        if temp:
            if temp == "i":
                temp = "I"
            new_string.append(temp)
    concatString = ' '.join(new_string)
    return new_string, concatString


# NLTK setup for the part-of-speech helpers below
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.data.path.append('/root/nltk_data')
from nltk import pos_tag, word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer


class POS_tagging:
    """Reduce a sentence to content words, lemmatising any verbs."""

    def __init__(self, concatString):
        self.concatString = concatString

    def handle_conjugation(self, tags):
        # Drop determiners, prepositions and "to"; lemmatise verbs back to
        # their base form, skipping forms of "be".
        new_sentence = []
        for index, item in enumerate(tags):
            if item[1] not in ['VBP', 'DT', 'IN', 'TO', 'VBG', 'VBD', 'VBN', 'VBZ']:
                new_sentence.append(item[0])
            elif item[1] in ['VBP', 'VBG', 'VBD', 'VBN', 'VBZ']:
                new_verb = WordNetLemmatizer().lemmatize(item[0], 'v')
                if new_verb != "be":
                    new_sentence.append(new_verb)
        return new_sentence

    def make_predictions(self):
        tags = pos_tag(word_tokenize(self.concatString))
        return self.handle_conjugation(tags)


def generate_pic(text_to_search, ax):
    """Draw a pictogram for a word onto a matplotlib axis.

    Uses the ARASAAC API (ref: https://arasaac.org/developers/api) and falls
    back to OpenAI image generation when no pictogram is found.

    Args:
        text_to_search (str): the word to look up.
        ax (matplotlib.axes.Axes): the axis to draw onto.
    """
    search_url = f"https://api.arasaac.org/api/pictograms/en/bestsearch/{text_to_search}"
    search_response = requests.get(search_url)
    search_json = search_response.json()
    if search_json:
        pic_url = f"https://api.arasaac.org/api/pictograms/{search_json[0]['_id']}?download=false"
        pic_response = requests.get(pic_url)
        img = Image.open(BytesIO(pic_response.content))
        ax.imshow(img)
        ax.set_title(text_to_search)
    else:
        try:
            response = openai.Image.create(
                prompt=text_to_search,
                n=2,
                size="512x512"
            )
            image_url = response['data'][0]['url']
            image_response = requests.get(image_url)
            img = Image.open(BytesIO(image_response.content))
            ax.imshow(img)
            ax.set_title(f"/{text_to_search}/")
        except Exception:
            ax.set_title("Error!")
    ax.axes.xaxis.set_visible(False)
    ax.axes.yaxis.set_visible(False)


with gr.Blocks(title="The Dream Steamer") as demo:
    gr.Markdown("# The Dream Steamer")
    gr.Markdown("This application transforms your dreams into really cool pictures and makes them a more memorable experience.")
    gr.Markdown("With this application you can save your dreams and share them with your friends and family.")
    with gr.Row():
        audio = gr.Audio(label="Record your dream here", source="microphone", type="filepath")
    with gr.Row():
        dropdown = gr.Dropdown(label="Whisper Model", choices=WhisperModels, value='base')
    with gr.Row():
        btn1 = gr.Button("Show me my dream!")
    with gr.Column():
        with gr.Row():
            image1 = gr.Image(label="1", shape=(200, 200))
            text1 = gr.Text(label="1")
            image2 = gr.Image(label="2", shape=(200, 200))
            text2 = gr.Text(label="2")
        with gr.Row():
            image3 = gr.Image(label="3", shape=(200, 200))
            text3 = gr.Text(label="3")
            image4 = gr.Image(label="4", shape=(200, 200))
            text4 = gr.Text(label="4")
    btn1.click(SpeechToText, inputs=[audio, dropdown],
               outputs=[image1, image2, image3, image4, text1, text2, text3, text4])
    gr.Markdown("Made by the Dreamers [Alireza](https://github.com/golali), [Erfan](https://github.com/golchini) and [Omidreza](https://github.com/omidreza-amrollahi)")

demo.launch()
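
# Sketch of how the AAC helpers above (clean_text, POS_tagging, generate_pic)
# could be combined; hypothetical, as the Gradio UI never calls them: clean a
# transcript, keep the content words, and render one pictogram per word.
#
#     _, concat = clean_text("I was walking in the forest.")
#     words = POS_tagging(concat).make_predictions()
#     fig, axes = plt.subplots(1, len(words), squeeze=False)
#     for word, ax in zip(words, axes[0]):
#         generate_pic(word, ax)
#     fig.savefig("pictograms.png")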