File size: 1,255 Bytes
7e27a4c
 
43ca502
84ee176
43ca502
 
 
 
 
 
 
 
 
7e27a4c
84ee176
 
24ed47c
7e27a4c
 
 
84ee176
 
7e27a4c
 
 
c6326c0
7e27a4c
 
 
 
 
252da83
 
 
 
 
 
 
7e27a4c
 
8b7b39f
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
import os

import gradio as gr
from transformers import pipeline
from transformers.utils import logging

from helper import ignore_warnings
from helper import load_image_from_url, render_results_in_image
from helper import summarize_predictions_natural_language

# Configure log/warning suppression BEFORE any model is loaded; doing it after
# the first pipeline is built leaves that pipeline's load-time messages
# unsilenced.
logging.set_verbosity_error()
# Project helper from helper.py; presumably filters Python warnings --
# TODO confirm against its definition.
ignore_warnings()

# Object-detection pipeline applied to the uploaded image.
od_pipe = pipeline("object-detection", model="facebook/detr-resnet-50")

# Text-to-speech pipeline, kept at module level for code that narrates the
# generated summary.
tts_pipe = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs")

def get_pipeline_prediction(pil_image):
    """Detect objects in an image and describe the detections in text.

    The image is passed through ``od_pipe``; the raw predictions are turned
    into a sentence by ``summarize_predictions_natural_language`` and drawn
    onto the image by ``render_results_in_image``.

    No text-to-speech inference is run here: the function returns only the
    image and the text, so a synthesized waveform would be computed on every
    request and then thrown away.

    Args:
        pil_image: The input image. The Gradio interface in this file is
            declared with ``type="pil"``, so it is expected to be a PIL image.

    Returns:
        A two-item list ``[processed_image, text]``: whatever
        ``render_results_in_image`` returns for the image and predictions,
        followed by the natural-language summary string.
    """
    pipeline_output = od_pipe(pil_image)
    text = summarize_predictions_natural_language(pipeline_output)
    processed_image = render_results_in_image(pil_image, pipeline_output)
    return [processed_image, text]

# UI components: one image in; the annotated image plus a text summary out.
# NOTE: an earlier variant used an "audio" output in place of the textbox;
# the current interface shows text only.
input_image = gr.Image(label="Input image", type="pil")
annotated_image = gr.Image(
    label="Output image with predicted instances",
    type="pil",
)
summary_box = gr.Textbox(label="Transcription", lines=3)

demo = gr.Interface(
    fn=get_pipeline_prediction,
    inputs=input_image,
    outputs=[annotated_image, summary_box],
)

# Start the Gradio server for this app.
demo.launch()