import io, os, base64
from PIL import Image
import gradio as gr
import shortuuid
from transformers import pipeline
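# Hosted models: a default speech-recognition pipeline, a zero-shot
# image-classification pipeline, and the multimodalart latent-diffusion
# Space loaded as a callable.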
asr = pipeline("automatic-speech-recognition")
latent = gr.Interface.load("spaces/multimodalart/latentdiffusion")
zero = pipeline("zero-shot-image-classification")
#tts = gr.Interface.load("spaces/osanseviero/tortoisse-tts")
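# Text -> image: sends the prompt and sampling settings to the latent
# diffusion Space, which returns base64-encoded PNGs.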
def text2image_latent(text, steps, width, height, images, diversity):
    print(text)
    results = latent(text, steps, width, height, images, diversity)
    image_paths = []
    for image in results[1]:
        # Strip the data-URI prefix and decode the base64 payload into a PIL image.
        image_str = image[0]
        image_str = image_str.replace("data:image/png;base64,", "")
        decoded_bytes = base64.decodebytes(bytes(image_str, "utf-8"))
        img = Image.open(io.BytesIO(decoded_bytes))
        # Save under a short random name so the gallery can load it by path.
        url = shortuuid.uuid()
        temp_dir = './tmp'
        os.makedirs(temp_dir, exist_ok=True)
        image_path = f'{temp_dir}/{url}.png'
        img.save(image_path)
        image_paths.append(image_path)
    return image_paths
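# Speech -> text: prefer the microphone recording, fall back to an uploaded file.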
def speech_to_text(mic=None, file=None):
    if mic is not None:
        audio = mic
    elif file is not None:
        audio = file
    else:
        return "You must either provide a mic recording or a file"
    transcription = asr(audio)["text"]
    return transcription
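# Zero-shot classification: score an image against a comma-separated
# list of user-supplied labels.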
def zero_shot(image, text_input):
    # The pipeline expects a list of candidate labels, not a raw string.
    labels = text_input.split(",")
    results = zero(image, candidate_labels=labels)
    return {dic["label"]: dic["score"] for dic in results}
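# UI: three columns - record/type a prompt, generate images, classify one.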
with gr.Blocks() as demo:
    gr.Markdown(
        """
        - Input voice/text
        - Convert text to image via latent diffusion
        - Given a list of labels and a selected image from the gallery, do zero-shot classification
        - Coming soon: TTS of your output label as: Your output looks like "label of zero-shot"
        """
    )
    with gr.Row():
        with gr.Column():
            audio_file = [
                gr.Audio(source="microphone", type="filepath", optional=True),
                gr.Audio(source="upload", type="filepath", optional=True),
            ]
            text = gr.Textbox(placeholder="If you don't want to record or upload your voice you can input text here")
            with gr.Row():
                # Renamed so the button does not shadow the speech_to_text function.
                speech_to_text_btn = gr.Button("Speech to text go brrr")
        with gr.Column():
            steps = gr.Slider(label="Steps - more steps can increase quality but will take longer to generate", value=50, minimum=1, maximum=50, step=1)
            width = gr.Slider(label="Width", value=256, minimum=32, maximum=256, step=32)
            height = gr.Slider(label="Height", value=256, minimum=32, maximum=256, step=32)
            images = gr.Slider(label="Images - How many images you wish to generate", value=1, minimum=1, maximum=4, step=1)
            diversity = gr.Slider(label="Diversity scale - How different from one another you wish the images to be", value=15.0, minimum=1.0, maximum=15.0)
            #gallery = [gr.outputs.Image(type="pil"), gr.outputs.Textbox(label="Error")]
            gallery = gr.Gallery(label="Individual images")
            with gr.Row():
                get_image_latent = gr.Button("Generate Image go brr")
        with gr.Column():
            text_input = gr.Textbox(placeholder="Input a list of labels separated by commas")
            label = gr.Label()
            with gr.Row():
                zero_shot_clf = gr.Button("Classify Image go brr")
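    # Wire each button to its handler.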
    speech_to_text_btn.click(speech_to_text, inputs=audio_file, outputs=text)
    get_image_latent.click(text2image_latent, inputs=[text, steps, width, height, images, diversity], outputs=gallery)
    zero_shot_clf.click(zero_shot, inputs=[gallery, text_input], outputs=label)
demo.launch(enable_queue=False)