File size: 1,351 Bytes
a1711e2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
from diffusers import StableDiffusionPipeline, EulerDiscreteScheduler
import torch

# Checkpoint used for text-to-image generation.
model_id = "stabilityai/stable-diffusion-2"

# Replace the pipeline's scheduler with the Euler discrete scheduler, loaded
# from the "scheduler" subfolder of the same checkpoint.
scheduler = EulerDiscreteScheduler.from_pretrained(model_id, subfolder="scheduler")

# Load the pipeline with float16 weights and move it to the CUDA device.
pipe = StableDiffusionPipeline.from_pretrained(
    model_id,
    scheduler=scheduler,
    torch_dtype=torch.float16,
).to("cuda")

def text_to_image(prompt):
    """Generate a single image from a text prompt.

    Args:
        prompt: Text handed to the module-level ``pipe`` pipeline.

    Returns:
        The first element of the ``images`` attribute of the pipeline result.
    """
    result = pipe(prompt)
    return result.images[0]

from transformers import pipeline
import gradio as gr

# Tell the pipeline factory which task this pipeline is being created for
# (ASR, automatic speech recognition) and which checkpoint to load.
model = pipeline(
    task="automatic-speech-recognition",
    model="jonatasgrosman/wav2vec2-large-xlsr-53-spanish",
)

def transcribe_audio(mic=None, file=None):
    """Transcribe an audio clip and generate an image from the transcription.

    The microphone recording takes precedence; the uploaded file is used only
    when no recording is supplied.

    Args:
        mic: Audio from the microphone input, or None when nothing was
            recorded. The UI components are configured with
            ``type="filepath"``.
        file: Audio from the upload input, or None when nothing was uploaded.

    Returns:
        A two-item list ``[text, image]``. On success ``text`` is the
        transcription and ``image`` is the picture generated from it. When
        neither input is provided, ``text`` is an explanatory message and
        ``image`` is None.
    """
    if mic is not None:
        audio = mic
    elif file is not None:
        audio = file
    else:
        # The interface declares two output components (textbox and image),
        # so this path must also return two values; None leaves the image
        # output empty instead of returning a bare string.
        return ["You must either provide a mic recording or a file", None]

    transcription = model(audio)["text"]
    image = text_to_image(transcription)
    return [transcription, image]

# Two audio inputs, both delivered to the callback as file paths: one records
# from the microphone, the other accepts an uploaded file.
mic_input = gr.Audio(sources=["microphone"], type="filepath", label="Speak here...")
upload_input = gr.Audio(sources=["upload"], type="filepath", label="Upload file here...")

# transcribe_audio's two return values fill the textbox and the image, in
# that order.
demo = gr.Interface(
    fn=transcribe_audio,
    inputs=[mic_input, upload_input],
    outputs=[
        gr.Textbox(label="Transcription"),
        gr.Image(label="Generated Image"),
    ],
)

# Start the app with Gradio's debug mode switched on.
demo.launch(debug=True)