# dreamsteam/app.py
import base64
import json
import os
import random
from io import BytesIO

import gradio as gr
import matplotlib
import nltk
import openai
import requests
import whisper
from nltk import pos_tag, word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from PIL import Image

# Use a non-interactive matplotlib backend, since images are rendered
# server-side.
matplotlib.use('AGG')

# Download the NLTK data needed for tokenization, POS tagging and
# lemmatization.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.data.path.append('/root/nltk_data')

# Whisper checkpoints the user can pick from in the UI.
WhisperModels = ['tiny', 'base', 'small', 'medium', 'large']

# OpenAI credentials are read from environment variables.
openai.organization = os.getenv('organization')
openai.api_key = os.getenv('api_key')
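# Note: a third secret, 'stability_key', is read inside get_image below.
# All three environment variable names match the os.getenv calls in this
# file; set them in the Space/host configuration before launching.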
def get_story(dream):
    # Ask the model to expand the dream into a four-section illustrated
    # story, returned as a JSON array.
    response = openai.Completion.create(
        model="text-davinci-003",
        prompt=f"I'm going to tell you my dream and I want you to make a better, more detailed story out of it, in one JSON array so I can create a booklet with image generation. Can you split it into 4 sections and give each 3 keys: section = nr of section, story = containing the story, alt_text = the alt text (make sure that the alt text is overall consistent and map each person in it to a known movie character): {dream}",
        temperature=0.7,
        max_tokens=2048,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0
    )
    return response["choices"][0]["text"]
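# The prompt above asks for a JSON array shaped roughly like the following
# (an expectation, not a format the model is guaranteed to honour):
# [
#   {"section": 1, "story": "...", "alt_text": "..."},
#   ...
#   {"section": 4, "story": "...", "alt_text": "..."}
# ]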
def get_image(text):
    # Generate one 512x512 image for the given alt text via the Stability
    # AI REST API.
    engine_id = "stable-diffusion-xl-beta-v2-2-2"
    api_host = "https://api.stability.ai"
    stability_key = os.getenv('stability_key')
    if stability_key is None:
        raise Exception("Missing Stability API key.")
    response = requests.post(
        f"{api_host}/v1/generation/{engine_id}/text-to-image",
        headers={
            "Content-Type": "application/json",
            "Accept": "application/json",
            "Authorization": f"Bearer {stability_key}"
        },
        json={
            "text_prompts": [
                {
                    "text": f"animated surreal with colors and creepy faces everything detailed, {text}"
                }
            ],
            "cfg_scale": 25,
            "clip_guidance_preset": "FAST_BLUE",
            "height": 512,
            "width": 512,
            "samples": 1,
            "steps": 50,
            "seed": 4294967295,
        },
    )
    if response.status_code != 200:
        raise Exception("Non-200 response: " + str(response.text))
    data = response.json()
    # TODO: a random filename can collide; see the tempfile sketch below.
    number = random.randint(0, 1000)
    with open(f"{number}.png", "wb") as f:
        f.write(base64.b64decode(data["artifacts"][0]["base64"]))
    return f"{number}.png"
def get_array(dream):
    # The model may emit text before the JSON array, so locate where the
    # array starts.
    json_start_index = dream.find("[")
    if json_start_index == -1:
        raise ValueError("No JSON array found in the model response.")
    # Extract the JSON-formatted string from the original string.
    json_string = dream[json_start_index:]
    # Parse the JSON-formatted string and convert it to a Python object.
    my_object = json.loads(json_string)
    # Return the resulting list of story sections.
    return my_object
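# Example: get_array('Sure! [{"section": 1, "story": "a", "alt_text": "b"}]')
# returns [{'section': 1, 'story': 'a', 'alt_text': 'b'}].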
def SpeechToText(audio, SelectedModel):
    print('Loading model...')
    model = whisper.load_model(SelectedModel)
    print('Loading audio...')
    audio = whisper.load_audio(audio)
    audio = whisper.pad_or_trim(audio)
    print('Creating log-mel spectrogram...')
    mel = whisper.log_mel_spectrogram(audio).to(model.device)
    print('Detecting language...')
    _, probs = model.detect_language(mel)
    print(f"Language: {max(probs, key=probs.get)}")
    print('Decoding audio to text...')
    options = whisper.DecodingOptions(fp16=False)
    result = whisper.decode(model, mel, options)
    # Turn the transcript into a four-section story, then render one image
    # per section from its alt text.
    text = get_story(result.text)
    print("Text: " + text)
    sections = get_array(text)
    images, stories = [], []
    for i in range(4):
        images.append(get_image(sections[i]["alt_text"]))
        stories.append(sections[i]["story"])
        print('image added')
    return (*images, *stories)
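# Example direct call outside the Gradio UI (a sketch; 'dream.wav' is a
# hypothetical local recording):
#   img1, img2, img3, img4, t1, t2, t3, t4 = SpeechToText("dream.wav", "base")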
def clean_text(text):
    """
    Strip punctuation (commas, dots, question marks and hyphens) from a
    sentence; more characters such as '!' may need handling in the future.

    Args:
        text (str): the raw sentence to clean.

    Returns:
        tuple: the list of cleaned words and those words re-joined into a
        single string.
    """
    print("cleaning text: ", text)
    text = text.lower()
    text = text.replace(",", " ")
    text = text.replace(".", " ")
    text = text.replace("?", " ")
    text = text.replace("-", " ")
    new_string = []
    for temp in text.split():
        # Lowercasing turned "I" into "i"; restore it.
        if temp == "i":
            temp = "I"
        new_string.append(temp)
    concatString = ' '.join(new_string)
    return new_string, concatString
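# Example: clean_text("I walked, alone.") returns
# (['I', 'walked', 'alone'], 'I walked alone').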
class POS_tagging():
    def __init__(self, concatString):
        self.concatString = concatString

    def handle_conjugation(self, tags):
        # Drop determiners and prepositions, and reduce conjugated verbs to
        # their base form.
        new_sentence = []
        for item in tags:
            if item[1] not in ['VBP', 'DT', 'IN', 'TO', 'VBG', 'VBD', 'VBN', 'VBZ']:
                new_sentence.append(item[0])
            elif item[1] in ['VBP', 'VBG', 'VBD', 'VBN', 'VBZ']:
                new_verb = WordNetLemmatizer().lemmatize(item[0], 'v')
                # Forms of "to be" are dropped entirely.
                if new_verb != "be":
                    new_sentence.append(new_verb)
        return new_sentence

    def make_predictions(self):
        tags = pos_tag(word_tokenize(self.concatString))
        return self.handle_conjugation(tags)
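# Example (sketch): POS_tagging("I walked to the park").make_predictions()
# should yield something like ['I', 'walk', 'park'], with "to" and "the"
# dropped and "walked" reduced to its base form.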
def generate_pic(text_to_search, ax):
    """
    Query the ARASAAC pictogram API for the given text and draw the best
    match on the given axes; if no pictogram is found, fall back to OpenAI
    image generation.
    ref: https://arasaac.org/developers/api

    Args:
        text_to_search (str): the word or phrase to find a pictogram for.
        ax (matplotlib.axes.Axes): the axes to draw the image on.
    """
    search_url = f"https://api.arasaac.org/api/pictograms/en/bestsearch/{text_to_search}"
    search_response = requests.get(search_url)
    search_json = search_response.json()
    if search_json:
        pic_url = f"https://api.arasaac.org/api/pictograms/{search_json[0]['_id']}?download=false"
        pic_response = requests.get(pic_url)
        img = Image.open(BytesIO(pic_response.content))
        ax.imshow(img)
        ax.set_title(text_to_search)
    else:
        try:
            response = openai.Image.create(
                prompt=text_to_search,
                n=2,
                size="512x512"
            )
            image_url = response['data'][0]['url']
            image_response = requests.get(image_url)
            img = Image.open(BytesIO(image_response.content))
            ax.imshow(img)
            ax.set_title(f"/{text_to_search}/")
        except Exception:
            ax.set_title("Error!")
    ax.axes.xaxis.set_visible(False)
    ax.axes.yaxis.set_visible(False)
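# Example usage (a sketch; this helper is not called elsewhere in this
# file, and matplotlib.pyplot is imported locally just for the demo):
#   import matplotlib.pyplot as plt
#   fig, ax = plt.subplots()
#   generate_pic("park", ax)
#   fig.savefig("park.png")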
with gr.Blocks(title="The Dream Steamer") as demo:
    gr.Markdown("# The Dream Steamer")
    gr.Markdown("This application turns your dreams into striking pictures, making them a more memorable experience.")
    gr.Markdown("You can save your dreams and share them with your friends and family.")
    with gr.Row():
        audio = gr.Audio(label="Record your dream here", source="microphone", type="filepath")
    with gr.Row():
        dropdown = gr.Dropdown(label="Whisper Model", choices=WhisperModels, value='base')
    with gr.Row():
        btn1 = gr.Button("Show me my dream!")
    with gr.Column():
        with gr.Row():
            image1 = gr.Image(label="1", shape=(200, 200))
            text1 = gr.Text(label="1")
            image2 = gr.Image(label="2", shape=(200, 200))
            text2 = gr.Text(label="2")
        with gr.Row():
            image3 = gr.Image(label="3", shape=(200, 200))
            text3 = gr.Text(label="3")
            image4 = gr.Image(label="4", shape=(200, 200))
            text4 = gr.Text(label="4")
    btn1.click(SpeechToText, inputs=[audio, dropdown],
               outputs=[image1, image2, image3, image4, text1, text2, text3, text4])
    gr.Markdown("Made by the Dreamers [Alireza](https://github.com/golali), [Erfan](https://github.com/golchini) and [Omidreza](https://github.com/omidreza-amrollahi)")

demo.launch()