Spaces:

rathapech
/

object-detection

Sleeping

File size: 1,591 Bytes

42ef26d
 
92615bf
 
42ef26d
722f574
 
 
42ef26d
50a2f63
 
722f574
 
 
 
50a2f63
 
722f574
 
 
 
 
 
 
 
 
 
 
c88106c
722f574
 
c88106c
ac026ae
 
5ba51aa
 
42ef26d
ac026ae
5ba51aa
ac026ae
5ba51aa
 
 
 
 
ac026ae
5ba51aa
c714d05
 
42ef26d
2f90508
c714d05
2f90508

import gradio as gr
from transformers import pipeline
from helper import load_image_from_url, render_results_in_image
from helper import summarize_predictions_natural_language

# Object-detection pipeline backed by the facebook/detr-resnet-50 checkpoint.
od_pipe = pipeline(task="object-detection", model="facebook/detr-resnet-50")

# Text-to-speech pipeline backed by the kakao-enterprise/vits-ljs checkpoint.
tts_pipe = pipeline(task="text-to-speech", model="kakao-enterprise/vits-ljs")

def get_pipeline_prediction(pil_image):
    """Detect objects in an image, summarize them as text, and narrate it.

    Args:
        pil_image: The image handed over by the Gradio input component.
            Gradio passes ``None`` when the user submits without an image.

    Returns:
        A 3-tuple ``(processed_image, text, (rate, audio))``:

        * ``processed_image`` -- result of ``render_results_in_image`` for
          the input image and the detector output.
        * ``text`` -- result of ``summarize_predictions_natural_language``
          for the detector output.
        * ``(rate, audio)`` -- the TTS sampling rate and the first element
          of the generated ``"audio"`` entry.

    Raises:
        gr.Error: If no image was provided.
    """
    if pil_image is None:
        # Show a readable message in the UI instead of letting the detector
        # fail on a missing input.
        raise gr.Error("Please provide an image before submitting.")

    pipeline_output = od_pipe(pil_image)
    text = summarize_predictions_natural_language(pipeline_output)
    gen_audio = tts_pipe(text)
    processed_image = render_results_in_image(pil_image, pipeline_output)

    rate = gen_audio["sampling_rate"]
    # NOTE(review): assumes gen_audio["audio"] has a leading batch axis, so
    # [0] selects the single waveform -- confirm for the installed
    # transformers version.
    waveform = gen_audio["audio"][0]
    return processed_image, text, (rate, waveform)

# Gradio UI: one image input; annotated image, text summary and audio outputs
# (same order as the tuple returned by get_pipeline_prediction).
demo = gr.Interface(
    fn=get_pipeline_prediction,
    inputs=gr.Image(type="pil", label="Input image"),
    outputs=[
        gr.Image(type="pil", label="Output image with predicted instances"),
        gr.Textbox(label="Prediction Summary"),
        gr.Audio(label="Generated Speech"),
    ],
)

# Start serving the interface.
demo.launch()
#text = itt_pipe(input)


#tts_pipe = pipeline("text-to-speech",
#                    model="kakao-enterprise/vits-ljs")


#narrated_text = tts_pipe(tts_pipe[0]['generated_text'])

#def launch(text):
#    out = tts_pipe(text)
#    audio = IPythonAudio(out["audio"][0],
#             rate=out["sampling_rate"])
#    return audio
    
#iface = gr.Interface(launch,
#                     inputs=gr.Image(type='pil'),
#                     outputs="text")

    
#iface.launch()