Golali committed on
Commit 2470687
1 Parent(s): e63b09e

init commit

Files changed (4)
  1. README.md +4 -12
  2. app.py +243 -0
  3. packages.txt +1 -0
  4. requirements.txt +11 -0
README.md CHANGED
@@ -1,13 +1,5 @@
- ---
- title: Dreamsteam
- emoji: 📊
- colorFrom: indigo
- colorTo: yellow
- sdk: gradio
- sdk_version: 3.27.0
- app_file: app.py
- pinned: false
- license: cc-by-nc-nd-4.0
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # Speech To Pictogram
+
+ ## deployed to:
+ https://huggingface.co/spaces/omidreza/speechtopictogram
app.py ADDED
@@ -0,0 +1,243 @@
+ import base64
+ import io
+ import json
+ import os
+ import random
+
+ import gradio as gr
+ import matplotlib
+ import matplotlib.pyplot as plt
+ import openai
+ import PIL
+ import requests
+ import whisper
+ from io import BytesIO
+ from PIL import Image
+
+ matplotlib.use('AGG')
+
+ WhisperModels = ['tiny', 'base', 'small', 'medium', 'large']
+
+ openai.organization = os.getenv('organization')
+ openai.api_key = os.getenv('api_key')
+
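+ # get_story: ask the OpenAI completion endpoint to expand the transcribed
+ # dream into a four-section story, returned as a string containing a JSON array.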
+ def get_story(dream):
+     response = openai.Completion.create(
+         model="text-davinci-003",
+         prompt=f"I'm going to tell you my dream and I want you to turn it into a better, more detailed story in one JSON array so I can create a booklet with image generation. Split it into 4 sections and give each one 3 keys: section = number of the section, story = the story text, alt_text = the alt text (make sure the alt text is consistent overall and map each person in it to a known movie character): {dream}",
+         temperature=0.7,
+         max_tokens=2048,
+         top_p=1,
+         frequency_penalty=0,
+         presence_penalty=0
+     )
+     return response["choices"][0]["text"]
+
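+ # get_image: send one prompt to the Stability AI text-to-image endpoint and
+ # write the returned base64 PNG to disk, returning the file path for Gradio.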
+ def get_image(text):
+     engine_id = "stable-diffusion-xl-beta-v2-2-2"
+     api_host = "https://api.stability.ai"
+     stability_key = os.getenv('stability_key')
+
+     if stability_key is None:
+         raise Exception("Missing Stability API key.")
+
+     response = requests.post(
+         f"{api_host}/v1/generation/{engine_id}/text-to-image",
+         headers={
+             "Content-Type": "application/json",
+             "Accept": "application/json",
+             "Authorization": f"Bearer {stability_key}"
+         },
+         json={
+             "text_prompts": [
+                 {
+                     "text": f"animated surreal with colors and creepy faces everything detailed, {text}"
+                 }
+             ],
+             "cfg_scale": 25,
+             "clip_guidance_preset": "FAST_BLUE",
+             "height": 512,
+             "width": 512,
+             "samples": 1,
+             "steps": 50,
+             "seed": 4294967295,
+         },
+     )
+
+     if response.status_code != 200:
+         raise Exception("Non-200 response: " + str(response.text))
+
+     data = response.json()
+
+     # TODO: replace the random filename with something deterministic
+     number = random.randint(0, 1000)
+
+     with open(f"{number}.png", "wb") as f:
+         f.write(base64.b64decode(data["artifacts"][0]["base64"]))
+
+     return f"{number}.png"
+
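+ # get_array: the model reply often includes prose before the JSON payload,
+ # so slice from the first "[" and parse the remainder.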
+ def get_array(dream):
+     json_start_index = dream.find("[")
+
+     # Extract the JSON-formatted string from the original string
+     json_string = dream[json_start_index:]
+
+     # Parse the JSON-formatted string and convert it to a Python object
+     my_object = json.loads(json_string)
+
+     # Extract the JSON array from the Python object
+     return my_object
+
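+ # SpeechToText: the main pipeline behind the button. Transcribe the recording
+ # with Whisper, expand it into a four-part story with get_story, then render
+ # one Stability image per section.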
+ def SpeechToText(audio, SelectedModel):
+     print('Loading model...')
+     model = whisper.load_model(SelectedModel)
+
+     print('Loading audio...')
+     audio = whisper.load_audio(audio)
+     audio = whisper.pad_or_trim(audio)
+
+     print('Creating log-mel spectrogram...')
+     mel = whisper.log_mel_spectrogram(audio).to(model.device)
+
+     print('Detecting language...')
+     _, probs = model.detect_language(mel)
+     lang = f"Language: {max(probs, key=probs.get)}"
+
+     print('Decoding audio to text...')
+     options = whisper.DecodingOptions(fp16=False)
+     result = whisper.decode(model, mel, options)
+
+     text = get_story(result.text)
+     print("Text: " + text)
+     text = get_array(text)
+     print(type(text))
+
+     img1 = get_image(text[0]["alt_text"])
+     text1 = text[0]["story"]
+     print('image added')
+     img2 = get_image(text[1]["alt_text"])
+     text2 = text[1]["story"]
+     print('image added')
+     img3 = get_image(text[2]["alt_text"])
+     text3 = text[2]["story"]
+     print('image added')
+     img4 = get_image(text[3]["alt_text"])
+     text4 = text[3]["story"]
+     print('image added')
+
+     return img1, img2, img3, img4, text1, text2, text3, text4
+
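+ # Note: clean_text, POS_tagging and generate_pic below come from the
+ # pictogram lookup pipeline; they are not called by the Gradio app further
+ # down in this file.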
+ def clean_text(text):
+     """
+     Get rid of commas and dots; in the future there may be more characters
+     to strip from a sentence, like ! or ?.
+     Args:
+         text (str): the raw sentence to clean
+     Returns:
+         tuple: the list of cleaned words and the re-joined string
+     """
+     print("cleaning text: ", text)
+     text = text.lower()
+     text = text.replace(",", " ")
+     text = text.replace(".", " ")
+     text = text.replace("?", " ")
+     text = text.replace("-", " ")
+     text = text.split()
+     new_string = []
+     for temp in text:
+         if temp:
+             if temp == "i":
+                 temp = "I"
+             new_string.append(temp)
+     concatString = ' '.join(new_string)
+     return new_string, concatString
+
+ import nltk
+ nltk.download('punkt')
+ nltk.download('averaged_perceptron_tagger')
+ nltk.download('wordnet')
+ nltk.download('omw-1.4')
+ nltk.data.path.append('/root/nltk_data')
+ from nltk import pos_tag, word_tokenize
+ from nltk.stem.wordnet import WordNetLemmatizer
+
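+ # POS_tagging: drop determiners and prepositions and lemmatize verbs so each
+ # remaining word can be looked up as a single pictogram.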
+ class POS_tagging():
+     def __init__(self, concatString):
+         self.concatString = concatString
+
+     def handle_conjugation(self, tags):
+         # drop function words and reduce conjugated verbs to their base form
+         new_sentence = []
+         for item in tags:
+             if item[1] not in ['VBP', 'DT', 'IN', 'TO', 'VBG', 'VBD', 'VBN', 'VBZ']:
+                 new_sentence.append(item[0])
+             elif item[1] in ['VBP', 'VBG', 'VBD', 'VBN', 'VBZ']:
+                 new_verb = WordNetLemmatizer().lemmatize(item[0], 'v')
+                 if new_verb != "be":
+                     new_sentence.append(new_verb)
+         return new_sentence
+
+     def make_predictions(self):
+         tags = pos_tag(word_tokenize(self.concatString))
+         return self.handle_conjugation(tags)
+
+ def generate_pic(text_to_search, ax):
+     """
+     Use the ARASAAC API to fetch the pictogram matching the searched text and
+     draw it on the given axes; fall back to an OpenAI-generated image when no
+     pictogram is found.
+     ref: https://arasaac.org/developers/api
+     Args:
+         text_to_search (str): the word to look up
+         ax (matplotlib.axes.Axes): the axes to draw the image on
+     """
+     search_url = f"https://api.arasaac.org/api/pictograms/en/bestsearch/{text_to_search}"
+     search_response = requests.get(search_url)
+     search_json = search_response.json()
+     if search_json:
+         pic_url = f"https://api.arasaac.org/api/pictograms/{search_json[0]['_id']}?download=false"
+         pic_response = requests.get(pic_url)
+         img = Image.open(BytesIO(pic_response.content))
+         ax.imshow(img)
+         ax.set_title(text_to_search)
+     else:
+         try:
+             response = openai.Image.create(
+                 prompt=text_to_search,
+                 n=2,
+                 size="512x512"
+             )
+             image_url = response['data'][0]['url']
+             image_response = requests.get(image_url)
+             img = Image.open(BytesIO(image_response.content))
+             ax.imshow(img)
+             ax.set_title(f"/{text_to_search}/")
+         except Exception:
+             ax.set_title("Error!")
+     ax.axes.xaxis.set_visible(False)
+     ax.axes.yaxis.set_visible(False)
+
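+ # Gradio UI: a microphone input and a Whisper model picker feed SpeechToText,
+ # which fills a 2x2 grid of image/story pairs.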
+ with gr.Blocks(title="The Dream Steamer") as demo:
+     gr.Markdown("# The Dream Steamer")
+     gr.Markdown("This application transforms your dreams into really cool pictures and makes them a more memorable experience.")
+     gr.Markdown("With this application you can save your dreams and share them with your friends and family.")
+     with gr.Row():
+         audio = gr.Audio(label="Record your dream here", source="microphone", type="filepath")
+     with gr.Row():
+         dropdown = gr.Dropdown(label="Whisper Model", choices=WhisperModels, value='base')
+     with gr.Row():
+         btn1 = gr.Button("Show me my dream!")
+     with gr.Column():
+         with gr.Row():
+             image1 = gr.Image(label="1", shape=(200, 200))
+             text1 = gr.Text(label="1")
+             image2 = gr.Image(label="2", shape=(200, 200))
+             text2 = gr.Text(label="2")
+         with gr.Row():
+             image3 = gr.Image(label="3", shape=(200, 200))
+             text3 = gr.Text(label="3")
+             image4 = gr.Image(label="4", shape=(200, 200))
+             text4 = gr.Text(label="4")
+
+     btn1.click(SpeechToText, inputs=[audio, dropdown], outputs=[image1, image2, image3, image4, text1, text2, text3, text4])
+
+     gr.Markdown("Made by the Dreamers [Alireza](https://github.com/golali), [Erfan](https://github.com/golchini) and [Omidreza](https://github.com/omidreza-amrollahi)")
+
+ demo.launch()
packages.txt ADDED
@@ -0,0 +1 @@
+ ffmpeg
requirements.txt ADDED
@@ -0,0 +1,11 @@
+ matplotlib==3.5.2
+ Pillow==9.2.0
+ git+https://github.com/openai/whisper.git
+ nltk==3.7
+ requests==2.28.1
+ streamlit==1.14
+ ffmpeg-python
+ pydub==0.25.1
+ setuptools-rust==1.5.2
+ openai
+ gradio==3.9.1