muhtasham committed on
Commit
ed93d68
1 Parent(s): b80c362

Create app.py

Files changed (1): app.py +71 -0
app.py ADDED
@@ -0,0 +1,71 @@
+ import io, os, base64
+ from PIL import Image
+ import gradio as gr
+ import shortuuid
+ from transformers import pipeline
+
+ # Plan: take voice or text input,
+ # feed the text to latent diffusion (or DALL-E),
+ # run zero-shot classification on the generated output,
+ # and have TTS say: your output looks like "<zero-shot label>".
+
+ asr = pipeline("automatic-speech-recognition", "facebook/wav2vec2-base-960h")
+ latent = gr.Interface.load("spaces/multimodalart/latentdiffusion")
+ # Loaded for the planned classification/TTS steps; not yet wired into the UI below.
+ zero = pipeline("zero-shot-image-classification", model="openai/clip-vit-base-patch32")
+ tts = gr.Interface.load("spaces/osanseviero/tortoisse-tts")
+
+ def text2image_latent(text, steps, width, height, images, diversity):
+     print(text)  # log the prompt sent to the diffusion Space
+     results = latent(text, steps, width, height, images, diversity)
+     image_paths = []
+     # The Space returns base64-encoded PNGs in results[1]; decode and save each one.
+     for image in results[1]:
+         image_str = image[0].replace("data:image/png;base64,", "")
+         decoded_bytes = base64.decodebytes(bytes(image_str, "utf-8"))
+         img = Image.open(io.BytesIO(decoded_bytes))
+         temp_dir = './tmp'
+         os.makedirs(temp_dir, exist_ok=True)
+         image_path = f'{temp_dir}/{shortuuid.uuid()}.png'
+         img.save(image_path)
+         image_paths.append(image_path)
+     return image_paths
+
+
+ def speech_to_text(mic=None, file=None):
+     # Prefer the microphone recording, falling back to an uploaded file.
+     if mic is not None:
+         audio = mic
+     elif file is not None:
+         audio = file
+     else:
+         return "You must either provide a mic recording or a file"
+     transcription = asr(audio)["text"]
+     return transcription
+
+
+ with gr.Blocks() as demo:
+     with gr.Row():
+         with gr.Column():
+             audio_file = [
+                 gr.Audio(source="microphone", type="filepath", optional=True),
+                 gr.Audio(source="upload", type="filepath", optional=True)]
+             text = gr.Textbox()
+             with gr.Row():
+                 speech_to_text_button = gr.Button("Speech to text go brrr")
+         with gr.Column():
+             steps = gr.Slider(label="Steps - more steps can increase quality but will take longer to generate", value=50, maximum=50, minimum=1, step=1)
+             width = gr.Slider(label="Width", value=256, step=32, maximum=256, minimum=32)
+             height = gr.Slider(label="Height", value=256, step=32, maximum=256, minimum=32)
+             images = gr.Slider(label="Images - How many images you wish to generate", value=4, step=1, minimum=1, maximum=4)
+             diversity = gr.Slider(label="Diversity scale - How different from one another you wish the images to be", value=15.0, minimum=1.0, maximum=15.0)
+         with gr.Column():
+             gallery = gr.Gallery(label="Individual images")
+             with gr.Row():
+                 get_image_latent = gr.Button("Generate Image")
+
+     # Wire the buttons to their callbacks; the button is named *_button so it
+     # does not shadow the speech_to_text function it calls.
+     speech_to_text_button.click(speech_to_text, inputs=audio_file, outputs=text)
+     get_image_latent.click(text2image_latent, inputs=[text, steps, width, height, images, diversity], outputs=gallery)
+
+
+ demo.launch(enable_queue=False)