# Spaces:
# Runtime error
# Runtime error
import base64
import io
import os

import gradio as gr
import numpy as np
import shortuuid
from PIL import Image
from transformers import pipeline

# Speech-recognition pipeline; no checkpoint is named here, so the library's
# default for the "automatic-speech-recognition" task is used.
asr = pipeline("automatic-speech-recognition")

# Text-to-image generation is delegated to a hosted Space, loaded as a callable.
latent = gr.Interface.load("spaces/multimodalart/latentdiffusion")

# Zero-shot image classifier backed by the CLIP ViT-B/32 checkpoint.
zero = pipeline("zero-shot-image-classification", model="openai/clip-vit-base-patch32")

# Alternatives that were left disabled by the author:
# zero = gr.Interface.load("spaces/Datatrooper/zero-shot-image-classification")
# tts = gr.Interface.load("spaces/osanseviero/tortoisse-tts")
def text2image_latent(text, steps, width, height, images, diversity):
    """Generate images for *text* through the remote latent-diffusion Space.

    All arguments are forwarded unchanged to ``latent``. The second element of
    its result is iterated; the first item of each entry is treated as a
    base64 PNG data URL, decoded, and written to ``./tmp/<uuid>.png``.

    Returns:
        A list with the paths of the saved PNG files, in the order received.
    """
    print(text)
    results = latent(text, steps, width, height, images, diversity)

    # Loop-invariant: make sure the output directory exists once, up front.
    # ``exist_ok=True`` already tolerates an existing directory, so no
    # separate existence check is needed.
    temp_dir = './tmp'
    os.makedirs(temp_dir, exist_ok=True)

    image_paths = []
    for image in results[1]:
        # Drop the data-URL header so only the base64 payload remains.
        image_str = image[0].replace("data:image/png;base64,", "")
        decoded_bytes = base64.decodebytes(image_str.encode("utf-8"))
        image_path = f'{temp_dir}/{shortuuid.uuid()}.png'
        # The context manager releases the decoded image once it is written.
        with Image.open(io.BytesIO(decoded_bytes)) as img:
            img.save(image_path)
        image_paths.append(image_path)
    return image_paths
def speech_to_text(mic=None, file=None) -> str:
    """Transcribe an audio input to text with the ``asr`` pipeline.

    Args:
        mic: Audio passed to ``asr`` when given; takes precedence over *file*.
        file: Audio used only when *mic* is ``None``.

    Returns:
        The ``"text"`` field of the ``asr`` result, or a prompt message when
        neither input was supplied.
    """
    audio = mic if mic is not None else file
    if audio is None:
        # Nothing to transcribe: the message is returned as the output text.
        return "You must either provide a mic recording or a file"
    return asr(audio)["text"]
def zero_shot(image, text_input):
    """Score *image* against a comma-separated list of candidate labels.

    Args:
        image: Pixel data; it is cast to ``uint8`` and converted to an RGB
            PIL image before classification.
        text_input: Candidate labels separated by commas.

    Returns:
        A dict mapping each label to the score reported by the ``zero``
        zero-shot image-classification pipeline.
    """
    PIL_image = Image.fromarray(np.uint8(image)).convert('RGB')
    # The labels come from this function's own ``text_input`` parameter
    # (the previous code read an undefined ``labels_text``). Surrounding
    # whitespace is stripped and empty entries dropped so "cat, dog" yields
    # "cat" and "dog" rather than "cat" and " dog".
    labels = [label.strip() for label in text_input.split(",") if label.strip()]
    # ``zero`` is the module-level pipeline; the name ``pipe`` used before
    # is not defined anywhere in this file.
    res = zero(
        images=PIL_image,
        candidate_labels=labels,
        hypothesis_template="This is a photo of a {}",
    )
    return {dic["label"]: dic["score"] for dic in res}
def shot(image, labels_text):
    """Classify *image* against comma-separated labels with the CLIP pipeline.

    Args:
        image: Pixel data; it is cast to ``uint8`` and converted to an RGB
            PIL image before classification.
        labels_text: Candidate labels separated by commas.

    Returns:
        A dict mapping each label to the score reported by the ``zero``
        zero-shot image-classification pipeline.
    """
    PIL_image = Image.fromarray(np.uint8(image)).convert('RGB')
    # Strip whitespace around each label and drop empty entries.
    labels = [label.strip() for label in labels_text.split(",") if label.strip()]
    # ``zero`` is the module-level pipeline; the name ``pipe`` used before
    # is not defined anywhere in this file.
    res = zero(
        images=PIL_image,
        candidate_labels=labels,
        hypothesis_template="This is a photo of a {}",
    )
    return {dic["label"]: dic["score"] for dic in res}
# UI layout: three columns side by side (speech/text input, image generation,
# zero-shot classification), each with its own action button.
# NOTE(review): the nesting below is reconstructed from the statement order;
# confirm it matches the intended layout.
# NOTE(review): ``gr.inputs.Slider``, ``optional=`` and ``css=`` are kept as
# written; check them against the installed Gradio version.
with gr.Blocks() as demo:
    gr.Markdown("""
- Input voice/text
- Convert voice/text to image via Latent Diffusion
- Given list of labels and a selected image from gallery do zero-shot classification
- Coming soon: TTS(audio) your output label as: Your output looks like "label of zero-shot"
""")
    with gr.Row():
        with gr.Column():
            # Two audio sources; both are passed to ``speech_to_text`` as
            # its ``mic`` and ``file`` arguments, in that order.
            audio_file = [
                gr.Audio(source="microphone", type="filepath", optional=True),
                gr.Audio(source="upload", type="filepath", optional=True),
            ]
            text = gr.Textbox(placeholder="If you dont want to record or upload your voice you can input text here")
            with gr.Row():
                # Named ``*_btn`` so the button does not shadow the
                # ``speech_to_text`` function it triggers.
                speech_to_text_btn = gr.Button("Speech to text go brrr", css={"margin-top": "1em"})
        with gr.Column():
            steps = gr.inputs.Slider(label="Steps - more steps can increase quality but will take longer to generate", default=50, maximum=50, minimum=1, step=1)
            width = gr.inputs.Slider(label="Width", default=256, step=32, maximum=256, minimum=32)
            height = gr.inputs.Slider(label="Height", default=256, step=32, maximum=256, minimum=32)
            images = gr.inputs.Slider(label="Images - How many images you wish to generate", default=1, step=1, minimum=1, maximum=4)
            diversity = gr.inputs.Slider(label="Diversity scale - How different from one another you wish the images to be", default=15.0, minimum=1.0, maximum=15.0)
            # gallery = [gr.outputs.Image(type="pil"),gr.outputs.Textbox(label="Error")]
            gallery = gr.Gallery(label="Individual images")
            with gr.Row():
                get_image_latent = gr.Button("Generate Image go brr")
        with gr.Column():
            text_input = gr.Textbox(placeholder="input a list of labels separated by commas")
            label = gr.Label()
            with gr.Row():
                zero_shot_clf = gr.Button("Classify Image go brr")

    # Event wiring: each button runs its handler and writes to one output.
    speech_to_text_btn.click(speech_to_text, inputs=audio_file, outputs=text)
    get_image_latent.click(text2image_latent, inputs=[text, steps, width, height, images, diversity], outputs=gallery)
    zero_shot_clf.click(zero_shot, inputs=[gallery, text_input], outputs=label)

demo.launch()