Spaces:
Runtime error
Runtime error
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import io, os, base64
|
2 |
+
from PIL import Image
|
3 |
+
import gradio as gr
|
4 |
+
import shortuuid
|
5 |
+
from transformers import pipeline
|
6 |
+
|
7 |
+
# Pipeline plan:
#   1. take input as voice or text
#   2. feed the text to the latent-diffusion (DALL-E-style) image generator
#   3. run zero-shot classification on the generated output
#   4. TTS the result as "your output looks like <zero-shot label>"
|
11 |
+
|
12 |
+
asr = pipeline("automatic-speech-recognition", "facebook/wav2vec2-base-960h")
|
13 |
+
latent = gr.Interface.load("spaces/multimodalart/latentdiffusion")
|
14 |
+
zero = pipeline("zero-shot-image-classification", model="openai/clip-vit-base-patch32")
|
15 |
+
tts = gr.Interface.load("spaces/osanseviero/tortoisse-tts")
|
16 |
+
|
17 |
+
def text2image_latent(text, steps, width, height, images, diversity):
|
18 |
+
print(text)
|
19 |
+
results = latent(text, steps, width, height, images, diversity)
|
20 |
+
image_paths = []
|
21 |
+
for image in results[1]:
|
22 |
+
image_str = image[0]
|
23 |
+
image_str = image_str.replace("data:image/png;base64,","")
|
24 |
+
decoded_bytes = base64.decodebytes(bytes(image_str, "utf-8"))
|
25 |
+
img = Image.open(io.BytesIO(decoded_bytes))
|
26 |
+
url = shortuuid.uuid()
|
27 |
+
temp_dir = './tmp'
|
28 |
+
if not os.path.exists(temp_dir):
|
29 |
+
os.makedirs(temp_dir, exist_ok=True)
|
30 |
+
image_path = f'{temp_dir}/{url}.png'
|
31 |
+
img.save(f'{temp_dir}/{url}.png')
|
32 |
+
image_paths.append(image_path)
|
33 |
+
return(image_paths)
|
34 |
+
|
35 |
+
|
36 |
+
def speech_to_text(mic=None, file=None):
|
37 |
+
if mic is not None:
|
38 |
+
audio = mic
|
39 |
+
elif file is not None:
|
40 |
+
audio = file
|
41 |
+
else:
|
42 |
+
return "You must either provide a mic recording or a file"
|
43 |
+
transcription = asr(audio)["text"]
|
44 |
+
return transcription
|
45 |
+
|
46 |
+
|
47 |
+
with gr.Blocks() as demo:
|
48 |
+
with gr.Row():
|
49 |
+
with gr.Column():
|
50 |
+
audio_file =[
|
51 |
+
gr.Audio(source="microphone", type="filepath", optional=True),
|
52 |
+
gr.Audio(source="upload", type="filepath", optional=True)]
|
53 |
+
text = gr.Textbox()
|
54 |
+
with gr.Row():
|
55 |
+
speech_to_text = gr.Button("Speech to text go brrr")
|
56 |
+
with gr.Column():
|
57 |
+
steps = gr.inputs.Slider(label="Steps - more steps can increase quality but will take longer to generate",default=50,maximum=50,minimum=1,step=1)
|
58 |
+
width = gr.inputs.Slider(label="Width", default=256, step=32, maximum=256, minimum=32)
|
59 |
+
height = gr.inputs.Slider(label="Height", default=256, step=32, maximum = 256, minimum=32)
|
60 |
+
images = gr.inputs.Slider(label="Images - How many images you wish to generate", default=4, step=1, minimum=1, maximum=4)
|
61 |
+
diversity = gr.inputs.Slider(label="Diversity scale - How different from one another you wish the images to be",default=15.0, minimum=1.0, maximum=15.0)
|
62 |
+
with gr.Column():
|
63 |
+
gallery = gr.Gallery(label="Individual images")
|
64 |
+
with gr.Row():
|
65 |
+
get_image_latent = gr.Button("Generate Image", css={"margin-top": "1em"})
|
66 |
+
|
67 |
+
speech_to_text.click(speech_to_text, inputs=audio_file, outputs=text)
|
68 |
+
get_image_latent.click(text2image_latent, inputs=[text,steps,width,height,images,diversity], outputs=gallery)
|
69 |
+
|
70 |
+
|
71 |
+
demo.launch(enable_queue=False)
|