muhtasham committed on
Commit
0283c36
1 Parent(s): c2377c4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +14 -8
app.py CHANGED
@@ -5,14 +5,14 @@ import shortuuid
5
  from transformers import pipeline
6
 
7
  #input voice/text
8
- #input text to latent/dalle
9
- #do zero-shot classification of the output
10
- #tts your output looks like "label of zero-shot"
11
 
12
  asr = pipeline("automatic-speech-recognition")
13
  latent = gr.Interface.load("spaces/multimodalart/latentdiffusion")
14
- #zero = pipeline("zero-shot-image-classification", model="openai/clip-vit-base-patch32")
15
- #tts = gr.Interface.load("spaces/osanseviero/tortoisse-tts")
16
 
17
  def text2image_latent(text, steps, width, height, images, diversity):
18
  print(text)
@@ -43,6 +43,9 @@ def speech_to_text(mic=None, file=None):
43
  transcription = asr(audio)["text"]
44
  return transcription
45
 
 
 
 
46
 
47
  with gr.Blocks() as demo:
48
  with gr.Row():
@@ -50,7 +53,7 @@ with gr.Blocks() as demo:
50
  audio_file =[
51
  gr.Audio(source="microphone", type="filepath", optional=True),
52
  gr.Audio(source="upload", type="filepath", optional=True)]
53
- text = gr.Textbox()
54
  with gr.Row():
55
  speech_to_text = gr.Button("Speech to text go brrr")
56
  with gr.Column():
@@ -59,11 +62,14 @@ with gr.Blocks() as demo:
59
  height = gr.inputs.Slider(label="Height", default=256, step=32, maximum = 256, minimum=32)
60
  images = gr.inputs.Slider(label="Images - How many images you wish to generate", default=4, step=1, minimum=1, maximum=4)
61
  diversity = gr.inputs.Slider(label="Diversity scale - How different from one another you wish the images to be",default=15.0, minimum=1.0, maximum=15.0)
62
- with gr.Column():
63
- gallery = gr.Gallery(label="Individual images")
64
  with gr.Row():
65
  get_image_latent = gr.Button("Generate Image", css={"margin-top": "1em"})
 
 
 
66
 
 
67
  speech_to_text.click(speech_to_text, inputs=audio_file, outputs=text)
68
  get_image_latent.click(text2image_latent, inputs=[text,steps,width,height,images,diversity], outputs=gallery)
69
 
 
5
  from transformers import pipeline
6
 
7
  #input voice/text
8
+ #convert text to image via dalle
9
+ #given list of labels and a selected image from gallery do zero-shot classification
10
+ #tts your output label as: Your output looks like "label of zero-shot"
11
 
12
  asr = pipeline("automatic-speech-recognition")
13
  latent = gr.Interface.load("spaces/multimodalart/latentdiffusion")
14
+ zero = pipeline("zero-shot-image-classification")
15
+ tts = gr.Interface.load("spaces/osanseviero/tortoisse-tts")
16
 
17
  def text2image_latent(text, steps, width, height, images, diversity):
18
  print(text)
 
43
  transcription = asr(audio)["text"]
44
  return transcription
45
 
46
+ #def zero_shot(image, labels_text):
47
+
48
+
49
 
50
  with gr.Blocks() as demo:
51
  with gr.Row():
 
53
  audio_file =[
54
  gr.Audio(source="microphone", type="filepath", optional=True),
55
  gr.Audio(source="upload", type="filepath", optional=True)]
56
+ text = gr.Textbox(default="If you dont want to record or upload your voice you can input text here")
57
  with gr.Row():
58
  speech_to_text = gr.Button("Speech to text go brrr")
59
  with gr.Column():
 
62
  height = gr.inputs.Slider(label="Height", default=256, step=32, maximum = 256, minimum=32)
63
  images = gr.inputs.Slider(label="Images - How many images you wish to generate", default=4, step=1, minimum=1, maximum=4)
64
  diversity = gr.inputs.Slider(label="Diversity scale - How different from one another you wish the images to be",default=15.0, minimum=1.0, maximum=15.0)
65
+ gallery = gr.Gallery(label="Individual images", show_label=True)
 
66
  with gr.Row():
67
  get_image_latent = gr.Button("Generate Image", css={"margin-top": "1em"})
68
+ #with gr.Column():
69
+
70
+ #with gr.Row():
71
 
72
+
73
  speech_to_text.click(speech_to_text, inputs=audio_file, outputs=text)
74
  get_image_latent.click(text2image_latent, inputs=[text,steps,width,height,images,diversity], outputs=gallery)
75