muhtasham committed on
Commit 2adde1f • 1 Parent(s): affaf34

Update app.py

Files changed (1)
  1. app.py +9 -34
app.py CHANGED
@@ -7,7 +7,7 @@ from transformers import pipeline
 
 asr = pipeline("automatic-speech-recognition")
 latent = gr.Interface.load("spaces/multimodalart/latentdiffusion")
-zero = pipeline("zero-shot-image-classification")
+#zero = pipeline("zero-shot-image-classification")
 #zero = gr.Interface.load("spaces/Datatrooper/zero-shot-image-classification")
 #tts = gr.Interface.load("spaces/osanseviero/tortoisse-tts")
 
@@ -38,59 +38,34 @@ def speech_to_text(mic=None, file=None, state=""):
     else:
         return "You must either provide a mic recording or a file"
     transcription = asr(audio)["text"]
-    #state += text + " "
     return state
 
-def zero_shot(image, text_input):
-    PIL_image = Image.fromarray(np.uint8(image)).convert('RGB')
-    labels = labels_text.split(",")
-    res = pipe(images=PIL_image,
-               candidate_labels=labels,
-               hypothesis_template="This is a photo of a {}")
-    return {dic["label"]: dic["score"] for dic in res}
-
-def shot(image, labels_text):
-    PIL_image = Image.fromarray(np.uint8(image)).convert('RGB')
-    labels = labels_text.split(",")
-    res = pipe(images=PIL_image,
-               candidate_labels=labels,
-               hypothesis_template="This is a photo of a {}")
-    return {dic["label"]: dic["score"] for dic in res}
 
 with gr.Blocks() as demo:
     gr.Markdown("""
-    - 🎤 Input voice/text
-    - ✨ Convert voice/text to image via Latent Diffusion
-    - 🤖 Given list of labels and a selected image from gallery do zero-shot classification
-    - 🎛️ Coming soon: TTS(audio) your output label as: Your output looks like "label of zero-shot"
+    # 🎤 Sing or tell your story and let this Space ✨ visualize your story along
+    - Soon to be added
+    - Near real time(streaming option)
+    - Also allow playback of you audio relayed with video
     """)
     with gr.Row():
         with gr.Column():
             audio_file = [
-                gr.Audio(source="microphone", type="filepath", optional=True),
-                gr.Audio(source="upload", type="filepath", optional=True)]
-            text = gr.Textbox(label="Text", placeholder="If you dont want to record or upload your voice you can input text here")
+                gr.Audio(source="microphone", type="filepath")]
+            speech_to_text = gr.Button("Speech to text go brrr")
         with gr.Row():
-            speech_to_text = gr.Button("Speech to text go brrr", css={"margin-top": "1em"})
+            text = gr.Textbox(label="Text", placeholder="If you dont want to record or upload your voice you can input text here")
         with gr.Column():
-            steps = gr.inputs.Slider(label="Steps - more steps can increase quality but will take longer to generate", default=50, maximum=50, minimum=1, step=1)
+            steps = gr.inputs.Slider(label="Steps - more steps can increase quality but will take longer to generate", default=1, maximum=50, minimum=1, step=1)
             width = gr.inputs.Slider(label="Width", default=256, step=32, maximum=256, minimum=32)
             height = gr.inputs.Slider(label="Height", default=256, step=32, maximum=256, minimum=32)
             images = gr.inputs.Slider(label="Images - How many images you wish to generate", default=1, step=1, minimum=1, maximum=4)
             diversity = gr.inputs.Slider(label="Diversity scale - How different from one another you wish the images to be", default=15.0, minimum=1.0, maximum=15.0)
-            #gallery = [gr.outputs.Image(type="pil"), gr.outputs.Textbox(label="Error")]
             gallery = gr.Gallery(label="Individual images")
         with gr.Row():
             get_image_latent = gr.Button("Generate Image go brr")
-        with gr.Column():
-            text_input = gr.Textbox(label="Candidate labels", placeholder="input a list of labels separated by commas")
-            label = gr.Label()
-        with gr.Row():
-            zero_shot_clf = gr.Button("Classify Image go brr")
-
 
     speech_to_text.click(speech_to_text, inputs=audio_file, outputs=text)
     get_image_latent.click(text2image_latent, inputs=[text, steps, width, height, images, diversity], outputs=gallery)
-    zero_shot_clf.click(zero_shot, inputs=[gallery, text_input], outputs=label)
 
 demo.launch()
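
For reference, the zero_shot helper deleted above was broken before this commit ever touched it: its parameter is named text_input while the body splits an undefined labels_text, and it calls an undefined pipe instead of the zero pipeline bound at the top of the file (the duplicate shot fixed the signature but kept the pipe bug). A minimal working sketch, assuming the same transformers zero-shot pipeline, could look like this:

from transformers import pipeline
from PIL import Image
import numpy as np

zero = pipeline("zero-shot-image-classification")

def zero_shot(image, labels_text):
    # Gradio hands the image over as a numpy array; convert it for the pipeline.
    pil_image = Image.fromarray(np.uint8(image)).convert("RGB")
    labels = [label.strip() for label in labels_text.split(",")]
    # Score each candidate label against the image; the pipeline returns a
    # list of {"label": ..., "score": ...} dicts.
    results = zero(pil_image,
                   candidate_labels=labels,
                   hypothesis_template="This is a photo of a {}")
    return {result["label"]: result["score"] for result in results}

Returning a plain {label: score} dict matches what a gr.Label output component expects, which is presumably why the original returned that shape.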
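
Two caveats survive into the resulting file. First, speech_to_text computes transcription but still returns the untouched state argument, so the output textbox stays empty. Second, inside the gr.Blocks context, speech_to_text = gr.Button(...) rebinds the name of the module-level speech_to_text function, so speech_to_text.click(speech_to_text, ...) registers the Button object itself as the callback rather than the transcription function. A minimal sketch of the fixed wiring, against the same Gradio 3.x API the app already uses (transcribe_btn is a hypothetical rename):

import gradio as gr
from transformers import pipeline

asr = pipeline("automatic-speech-recognition")

def speech_to_text(mic=None):
    # With only the microphone input left in the UI, one argument suffices.
    if mic is None:
        return "You must either provide a mic recording or a file"
    # Return the transcription itself instead of an unused state argument.
    return asr(mic)["text"]

with gr.Blocks() as demo:
    audio_file = [gr.Audio(source="microphone", type="filepath")]
    text = gr.Textbox(label="Text")
    # The renamed button no longer shadows the callback function above.
    transcribe_btn = gr.Button("Speech to text go brrr")
    transcribe_btn.click(speech_to_text, inputs=audio_file, outputs=text)

demo.launch()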