Spaces:
Build error
Build error
init commit
Browse files- README.md +4 -12
- app.py +243 -0
- packages.txt +1 -0
- requirements.txt +11 -0
README.md
CHANGED
@@ -1,13 +1,5 @@
|
|
1 |
-
|
2 |
-
title: Dreamsteam
|
3 |
-
emoji: 📊
|
4 |
-
colorFrom: indigo
|
5 |
-
colorTo: yellow
|
6 |
-
sdk: gradio
|
7 |
-
sdk_version: 3.27.0
|
8 |
-
app_file: app.py
|
9 |
-
pinned: false
|
10 |
-
license: cc-by-nc-nd-4.0
|
11 |
-
---
|
12 |
|
13 |
-
|
|
|
|
|
|
1 |
+
# Speech To Pictogram
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
|
3 |
+
|
4 |
+
## Deployed to:
|
5 |
+
https://huggingface.co/spaces/omidreza/speechtopictogram
|
app.py
ADDED
@@ -0,0 +1,243 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import base64
|
2 |
+
import json
|
3 |
+
import random
|
4 |
+
import whisper
|
5 |
+
import gradio as gr
|
6 |
+
WhisperModels = ['tiny', 'base', 'small', 'medium', 'large']
|
7 |
+
import matplotlib.pyplot as plt
|
8 |
+
import matplotlib
|
9 |
+
import requests
|
10 |
+
matplotlib.use('AGG')
|
11 |
+
import io
|
12 |
+
from PIL import Image
|
13 |
+
import PIL
|
14 |
+
from io import BytesIO
|
15 |
+
import openai
|
16 |
+
import os
|
17 |
+
openai.organization = os.getenv('organization')
|
18 |
+
openai.api_key = os.getenv('api_key')
|
19 |
+
|
20 |
+
def get_story(dream):
    """Expand a raw dream transcript into a 4-section illustrated story.

    Asks the OpenAI completion endpoint to rewrite *dream* as a JSON array
    of 4 objects with keys ``section``, ``story`` and ``alt_text``; the
    alt text is later fed to the image generator.

    Args:
        dream: Free-form dream text (the Whisper transcript).

    Returns:
        Raw completion text, expected to contain a JSON array that
        ``get_array`` can parse.
    """
    response = openai.Completion.create(
        model="text-davinci-003",
        # Fixed prompt typos ("m going to" -> "I'm going to",
        # "out of in one" -> "out of it in one") so the instruction reads
        # cleanly for the model.
        prompt=f"I'm going to tell you of my dream and i want you to make a better more and more detailed story out of it in one json array so i can create a booklet with image generation. Can you split it into 4 sections and give it 3 keys: section= nr of section, story= containing the story, alt_text= the alt text(make sure that the alt text is overall consistent and map each person in it to a known movie character):{dream}",
        temperature=0.7,
        max_tokens=2048,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0
    )
    return response["choices"][0]["text"]
|
31 |
+
|
32 |
+
def get_image(text):
    """Render *text* as a 512x512 PNG via the Stability text-to-image API.

    Args:
        text: Alt-text / scene description to illustrate.

    Returns:
        Path of the PNG file written to the working directory.

    Raises:
        Exception: if the API key is missing or the API returns non-200.
    """
    engine_id = "stable-diffusion-xl-beta-v2-2-2"
    api_host = "https://api.stability.ai"
    stability_key = os.getenv('stability_key')

    if stability_key is None:
        raise Exception("Missing Stability API key.")

    response = requests.post(
        f"{api_host}/v1/generation/{engine_id}/text-to-image",
        headers={
            "Content-Type": "application/json",
            "Accept": "application/json",
            "Authorization": f"Bearer {stability_key}"
        },
        json={
            "text_prompts": [
                {
                    "text": f"animated surreal with colors and creepy faces everything detailed, {text}"
                }
            ],
            "cfg_scale": 25,
            "clip_guidance_preset": "FAST_BLUE",
            "height": 512,
            "width": 512,
            "samples": 1,
            "steps": 50,
            # Fixed seed keeps generations reproducible for the same prompt.
            "seed": 4294967295,
        },
    )

    if response.status_code != 200:
        raise Exception("Non-200 response: " + str(response.text))

    data = response.json()

    # Fixed: random.randint(0, 1000) could collide between the 4 images of
    # one dream and silently overwrite files from earlier runs (the old
    # "#To change:" note). A uuid4 hex name is effectively unique.
    import uuid
    filename = f"{uuid.uuid4().hex}.png"

    with open(filename, "wb") as f:
        f.write(base64.b64decode(data["artifacts"][0]["base64"]))

    return filename
|
75 |
+
|
76 |
+
def get_array(dream):
    """Extract and parse the JSON array embedded in a completion string.

    The model reply usually has chatter before (and sometimes after) the
    JSON payload, so we slice from the first ``[`` to the matching last
    ``]`` before parsing.

    Args:
        dream: Raw completion text from ``get_story``.

    Returns:
        The parsed list of section objects.

    Raises:
        ValueError: if the string contains no JSON array at all.
    """
    start = dream.find("[")
    end = dream.rfind("]")
    # Fixed: find() returning -1 used to feed garbage into json.loads;
    # trailing chatter after the closing bracket also used to break parsing.
    if start == -1 or end < start:
        raise ValueError("No JSON array found in model response.")
    return json.loads(dream[start:end + 1])
|
87 |
+
|
88 |
+
def SpeechToText(audio, SelectedModel):
    """Transcribe a dream recording and turn it into a 4-part picture story.

    Pipeline: Whisper transcription -> GPT story expansion (``get_story``)
    -> JSON parsing (``get_array``) -> one Stability image per section
    (``get_image``).

    Args:
        audio: Path of the recorded audio file (gradio mic widget).
        SelectedModel: Whisper model size, one of ``WhisperModels``.

    Returns:
        img1..img4 file paths followed by text1..text4 story strings, in
        the order the gradio outputs expect.
    """
    print('Loading model...')
    model = whisper.load_model(SelectedModel)

    print('Loading audio...')
    audio = whisper.load_audio(audio)
    audio = whisper.pad_or_trim(audio)

    print('Creating log-mel spectrogram...')
    mel = whisper.log_mel_spectrogram(audio).to(model.device)

    print('Detecting language...')
    _, probs = model.detect_language(mel)
    lang = f"Language: {max(probs, key=probs.get)}"  # currently unused; kept for debugging

    print('Decoding audio to text...')
    options = whisper.DecodingOptions(fp16=False)
    result = whisper.decode(model, mel, options)

    story = get_story(result.text)
    print("Text: " + story)
    sections = get_array(story)

    # Generate one image + caption per section instead of the copy-pasted
    # img1/img2/img3/img4 blocks; only the first 4 sections are used.
    images = []
    texts = []
    for section in sections[:4]:
        images.append(get_image(section["alt_text"]))
        texts.append(section["story"])
        print('image added')

    return images[0], images[1], images[2], images[3], texts[0], texts[1], texts[2], texts[3]
|
128 |
+
|
129 |
+
def clean_text(text):
    """Lower-case *text*, strip basic punctuation and re-capitalise "i".

    Commas, dots, question marks and hyphens become spaces, the text is
    split into words, and the standalone pronoun "i" is restored to "I".
    More punctuation (e.g. "!") may need handling in the future.

    Args:
        text: Raw sentence, e.g. a Whisper transcript.

    Returns:
        Tuple of (list of cleaned words, the words joined by single spaces).
    """
    print("cleaning text: ", text)
    # One C-level translate pass instead of four chained .replace() calls;
    # split() with no argument drops the resulting empty runs.
    words = text.lower().translate(str.maketrans(",.?-", "    ")).split()
    cleaned = ["I" if word == "i" else word for word in words]
    return cleaned, ' '.join(cleaned)
|
152 |
+
|
153 |
+
# NLP setup: NLTK data needed by POS_tagging below. Downloads run at
# import time so the Space has the corpora before the first request.
import nltk
nltk.download('punkt')  # tokenizer models for word_tokenize
nltk.download('averaged_perceptron_tagger')  # tagger used by pos_tag
nltk.download('wordnet')  # lemmatizer dictionary
nltk.download('omw-1.4')  # open multilingual wordnet data
# Also search the path where the Docker/Spaces image stores NLTK data.
nltk.data.path.append('/root/nltk_data')
from nltk import pos_tag, word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
|
161 |
+
|
162 |
+
class POS_tagging():
    """POS-tag a sentence and strip/normalise function words.

    Determiners (DT), prepositions (IN) and "to" (TO) are dropped; verbs
    are reduced to their lemma (except "be", which is dropped) so each
    remaining word maps more reliably to a pictogram.
    """

    def __init__(self, concatString):
        # Sentence to process, as produced by clean_text().
        self.concatString = concatString

    def handle_conjugation(self, tags):
        """Filter (word, tag) pairs into a list of pictogram-friendly words.

        Args:
            tags: list of (token, Penn Treebank tag) tuples from pos_tag.

        Returns:
            Kept and lemmatised words, in original order.
        """
        new_sentence = []
        # Fixed: the original used enumerate() but never read the index.
        for word, tag in tags:
            if tag not in ['VBP', 'DT', 'IN', 'TO', 'VBG', 'VBD', 'VBN', 'VBZ']:
                new_sentence.append(word)
            elif tag in ['VBP', 'VBG', 'VBD', 'VBN', 'VBZ']:
                # Conjugated verb -> base form; drop the copula "be".
                new_verb = WordNetLemmatizer().lemmatize(word, 'v')
                if new_verb != "be":
                    new_sentence.append(new_verb)
        return new_sentence

    def make_predictions(self):
        """Tokenize + tag self.concatString and return the filtered words."""
        tags = pos_tag(word_tokenize(self.concatString))
        return self.handle_conjugation(tags)
|
179 |
+
|
180 |
+
def generate_pic(text_to_search, ax):
    """Draw a pictogram for *text_to_search* onto matplotlib axes *ax*.

    Tries the ARASAAC pictogram API first (ref:
    https://arasaac.org/developers/api); when no pictogram matches, falls
    back to the OpenAI image API. The image is drawn on *ax* with the
    search text as title ("/text/" marks the fallback, "Error!" marks a
    failed fallback), and the axis ticks are hidden either way.

    Args:
        text_to_search: Single word/phrase to look up.
        ax: matplotlib Axes to draw on (mutated in place).
    """
    search_url = f"https://api.arasaac.org/api/pictograms/en/bestsearch/{text_to_search}"
    search_response = requests.get(search_url)
    search_json = search_response.json()
    if search_json:
        pic_url = f"https://api.arasaac.org/api/pictograms/{search_json[0]['_id']}?download=false"
        pic_response = requests.get(pic_url)
        img = Image.open(BytesIO(pic_response.content))
        ax.imshow(img)
        ax.set_title(text_to_search)
    else:
        try:
            response = openai.Image.create(
                prompt=text_to_search,
                n=2,
                size="512x512"
            )
            image_url = response['data'][0]['url']
            image_response = requests.get(image_url)
            img = Image.open(BytesIO(image_response.content))
            ax.imshow(img)
            ax.set_title(f"/{text_to_search}/")
        # Fixed: bare `except:` also swallowed KeyboardInterrupt/SystemExit.
        except Exception:
            ax.set_title("Error!")
    ax.axes.xaxis.set_visible(False)
    ax.axes.yaxis.set_visible(False)
|
215 |
+
|
216 |
+
# Gradio UI: record a dream, pick a Whisper model size, and get back a
# 4-panel illustrated story (one image + one caption per section).
# NOTE(review): `source=` on Audio and `shape=` on Image are gradio 3.x
# kwargs — confirm against the pinned gradio version before upgrading.
with gr.Blocks(title="The Dream Steamer") as demo:
    gr.Markdown("# The Dream Steamer")
    gr.Markdown("This Application transforms your dreams into really cool pictures and makes it a more memorable experience.")
    gr.Markdown("With this application you can save your dreams and share them with your friends and family.")
    with gr.Row():
        # Microphone input; type="filepath" so SpeechToText receives a path.
        audio = gr.Audio(label="Record your dream here",source="microphone", type="filepath")
    with gr.Row():
        dropdown = gr.Dropdown(label="Whisper Model", choices=WhisperModels, value='base')
    with gr.Row():
        btn1 = gr.Button("Show me my dream!")
    with gr.Column():
        # 2x2 grid: each row holds two (image, caption) pairs.
        with gr.Row():
            image1 = gr.Image(label="1", shape=(200,200))
            text1= gr.Text(label="1")
            image2 = gr.Image(label="2", shape=(200,200))
            text2= gr.Text(label="2")
        with gr.Row():
            image3 = gr.Image(label="3", shape=(200,200))
            text3= gr.Text(label="3")
            image4 = gr.Image(label="4", shape=(200,200))
            text4= gr.Text(label="4")

    # Wire the button to the full pipeline; the outputs list must match
    # SpeechToText's return order (four images, then four texts).
    btn1.click(SpeechToText, inputs=[audio, dropdown], outputs=[image1, image2, image3, image4, text1, text2, text3, text4])


    gr.Markdown("Made by the Dreamers [Alireza](https://github.com/golali) [Erfan](https://github.com/golchini) and [Omidreza](https://github.com/omidreza-amrollahi)")

demo.launch()
|
packages.txt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
ffmpeg
|
requirements.txt
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
matplotlib==3.5.2
|
2 |
+
Pillow==9.2.0
|
3 |
+
git+https://github.com/openai/whisper.git
|
4 |
+
nltk==3.7
|
5 |
+
requests==2.28.1
|
6 |
+
streamlit==1.14
|
7 |
+
ffmpeg-python
|
8 |
+
pydub==0.25.1
|
9 |
+
setuptools-rust==1.5.2
|
10 |
+
openai
|
11 |
+
gradio==3.9.1
|