# Spaces: Sleeping  (page-header residue from the hosting site; not part of the program)
import gradio as gr
from transformers import pipeline

from helper import (
    load_image_from_url,
    render_results_in_image,
    summarize_predictions_natural_language,
)

# Both pipelines are built once at module import so that every request
# handled by the UI reuses the same loaded models.

# Object detector used to find instances in the uploaded image.
od_pipe = pipeline(task="object-detection", model="facebook/detr-resnet-50")

# Speech synthesizer used to narrate the textual summary.
tts_pipe = pipeline(task="text-to-speech", model="kakao-enterprise/vits-ljs")
def get_pipeline_prediction(pil_image):
    """Detect objects in an image, summarize them in text, and narrate the text.

    Args:
        pil_image: The input picture handed over by the Gradio image input
            (configured with ``type="pil"``). ``None`` when the user submits
            without providing an image.

    Returns:
        A 3-tuple ``(processed_image, text, (rate, waveform))``:
        the image returned by ``render_results_in_image``, the summary
        returned by ``summarize_predictions_natural_language``, and the
        ``(sampling_rate, audio)`` pair produced by the text-to-speech
        pipeline, in the order expected by the three output components.

    Raises:
        gr.Error: If no image was provided.
    """
    # Fail early with a readable UI message instead of an opaque pipeline
    # error when the form is submitted empty.
    if pil_image is None:
        raise gr.Error("Please provide an input image.")

    pipeline_output = od_pipe(pil_image)
    text = summarize_predictions_natural_language(pipeline_output)
    gen_audio = tts_pipe(text)
    processed_image = render_results_in_image(pil_image, pipeline_output)

    rate = gen_audio["sampling_rate"]
    waveform = gen_audio["audio"]
    # The original always took element [0], which presumes a batched
    # (rows x samples) array. Keep that for multi-dimensional output, but do
    # not index into an already 1-D waveform (that would return one sample).
    # NOTE(review): output rank may differ across transformers versions --
    # confirm against the installed release.
    if getattr(waveform, "ndim", 2) > 1:
        waveform = waveform[0]

    return processed_image, text, (rate, waveform)
# Web UI: a single image goes in; the annotated image, the text summary and
# the synthesized speech come out, in the same order as the tuple returned
# by get_pipeline_prediction.
demo = gr.Interface(
    fn=get_pipeline_prediction,
    inputs=gr.Image(label="Input image", type="pil"),
    outputs=[
        gr.Image(label="Output image with predicted instances", type="pil"),
        gr.Textbox(label="Prediction Summary"),
        gr.Audio(label="Generated Speech"),
    ],
)

# Start serving the interface.
demo.launch()
# --- Commented-out earlier draft; none of the lines below are executed. ---
#text = itt_pipe(input)
#tts_pipe = pipeline("text-to-speech",
#                    model="kakao-enterprise/vits-ljs")
#narrated_text = tts_pipe(tts_pipe[0]['generated_text'])
#def launch(text):
#    out = tts_pipe(text)
#    audio = IPythonAudio(out["audio"][0],
#                         rate=out["sampling_rate"])
#    return audio
#iface = gr.Interface(launch,
#                     inputs=gr.Image(type='pil'),
#                     outputs="text")
#iface.launch()