Spaces:
Runtime error
Runtime error
File size: 3,736 Bytes
ed93d68 c2377c4 ed93d68 ac022fd 974534b ed93d68 ac022fd 4581829 0283c36 5a9e78d ed93d68 a16eddc ac022fd ed93d68 ac022fd ed93d68 f1bb426 ed93d68 08b33e5 ed93d68 4581829 ac022fd 4581829 ed93d68 0283c36 ed93d68 08b33e5 4a1b464 ed93d68 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 |
import io, os, base64
from PIL import Image
import gradio as gr
import shortuuid
from transformers import pipeline
# Default-model ASR pipeline; used by speech_to_text() below.
asr = pipeline("automatic-speech-recognition")
# Remote latent-diffusion Space loaded as a callable; used by text2image_latent().
latent = gr.Interface.load("spaces/multimodalart/latentdiffusion")
# Default-model zero-shot image classifier; used by zero_shot() below.
zero = pipeline("zero-shot-image-classification")
#tts = gr.Interface.load("spaces/osanseviero/tortoisse-tts")
def text2image_latent(text, steps, width, height, images, diversity):
    """Generate images from *text* via the remote latent-diffusion Space.

    Args mirror the Space's API: diffusion ``steps``, output ``width`` /
    ``height``, number of ``images``, and ``diversity`` scale.

    Returns:
        list[str]: paths of the decoded PNG files (suitable for a gr.Gallery).
    """
    print(text)
    results = latent(text, steps, width, height, images, diversity)

    temp_dir = './tmp'
    # exist_ok makes the call idempotent -- no race-prone exists() pre-check needed.
    os.makedirs(temp_dir, exist_ok=True)

    image_paths = []
    # results[1] is a list of single-element lists holding base64 data URIs
    # (assumed from the stripping below -- TODO confirm against the Space).
    for image in results[1]:
        image_str = image[0].replace("data:image/png;base64,", "")
        img = Image.open(io.BytesIO(base64.b64decode(image_str)))
        image_path = f'{temp_dir}/{shortuuid.uuid()}.png'
        img.save(image_path)  # reuse the computed path instead of re-formatting it
        image_paths.append(image_path)
    return image_paths
def speech_to_text(mic=None, file=None):
    """Transcribe audio with the ASR pipeline.

    Prefers the microphone recording when both sources are given; returns
    an instructional message when neither is provided.
    """
    audio = mic if mic is not None else file
    if audio is None:
        return "You must either provide a mic recording or a file"
    return asr(audio)["text"]
def zero_shot(image, text_input):
    """Run zero-shot classification of *image* against candidate labels.

    Returns a {label: score} mapping in the shape gr.Label expects.
    """
    predictions = zero(image, text_input)
    label_scores = {}
    for prediction in predictions:
        label_scores[prediction["label"]] = prediction["score"]
    return label_scores
# UI layout + event wiring for the demo.
with gr.Blocks() as demo:
    gr.Markdown( """
#input voice/text
#convert text to image via dalle
#given list of labels and a selected image from gallery do zero-shot classification
#Coming soon: tts your output label as: Your output looks like "label of zero-shot"
""")
    with gr.Row():
        with gr.Column():
            audio_file = [
                gr.Audio(source="microphone", type="filepath", optional=True),
                gr.Audio(source="upload", type="filepath", optional=True)]
            text = gr.Textbox(placeholder="If you dont want to record or upload your voice you can input text here")
            with gr.Row():
                # Named *_btn so it does not shadow the speech_to_text()
                # function -- a same-named variable here would make .click()
                # receive the Button object as its callback.
                speech_to_text_btn = gr.Button("Speech to text go brrr", css={"margin-top": "1em"})
        with gr.Column():
            steps = gr.inputs.Slider(label="Steps - more steps can increase quality but will take longer to generate",default=50,maximum=50,minimum=1,step=1)
            width = gr.inputs.Slider(label="Width", default=256, step=32, maximum=256, minimum=32)
            height = gr.inputs.Slider(label="Height", default=256, step=32, maximum = 256, minimum=32)
            images = gr.inputs.Slider(label="Images - How many images you wish to generate", default=1, step=1, minimum=1, maximum=4)
            diversity = gr.inputs.Slider(label="Diversity scale - How different from one another you wish the images to be",default=15.0, minimum=1.0, maximum=15.0)
            #gallery = [gr.outputs.Image(type="pil"),gr.outputs.Textbox(label="Error")]
            gallery = gr.Gallery(label="Individual images")
            with gr.Row():
                get_image_latent = gr.Button("Generate Image go brr")
        with gr.Column():
            text_input = gr.Textbox(placeholder="input a list of labels separated by commas")
            label = gr.Label()
            with gr.Row():
                zero_shot_clf = gr.Button("Classify Image go brr")
    # Event wiring: each button drives one of the module-level functions.
    speech_to_text_btn.click(speech_to_text, inputs=audio_file, outputs=text)
    get_image_latent.click(text2image_latent, inputs=[text,steps,width,height,images,diversity], outputs=gallery)
    zero_shot_clf.click(zero_shot, inputs=[gallery,text_input], outputs=label)

demo.launch(enable_queue=False)