import os
import gradio as gr
from helper import load_image_from_url, render_results_in_image
from helper import summarize_predictions_natural_language
from transformers import pipeline

# Object-detection pipeline built from the "facebook/detr-resnet-50"
# checkpoint. Created once at import time so the model is not reloaded on
# every call to get_pipeline_prediction.
od_pipe = pipeline("object-detection", model="facebook/detr-resnet-50")

# Restrict transformers' own logging to errors from this point on.
# NOTE(review): this runs after od_pipe has been built, so anything logged
# while loading the detection model is not affected — confirm that ordering
# is intended.
from transformers.utils import logging
logging.set_verbosity_error()

# ignore_warnings comes from the project-local helper module; presumably it
# silences Python warnings — verify against helper.py.
from helper import ignore_warnings
ignore_warnings()

# Text-to-speech pipeline built from the "kakao-enterprise/vits-ljs"
# checkpoint, also created once at import time.
tts_pipe = pipeline("text-to-speech",
                    model="kakao-enterprise/vits-ljs")

def get_pipeline_prediction(pil_image):
    """Detect objects in an image and describe the detections in text.

    Runs the object-detection pipeline on ``pil_image`` and derives both
    returned values from that single detection result.

    Args:
        pil_image: The image to analyse, passed unchanged to ``od_pipe``
            and ``render_results_in_image``. May be ``None`` when the
            form is submitted without an image.

    Returns:
        A two-item list ``[processed_image, text]`` where
        ``processed_image`` is the result of ``render_results_in_image``
        and ``text`` is the result of
        ``summarize_predictions_natural_language``. When ``pil_image`` is
        ``None`` the list is ``[None, "No image provided."]``.
    """
    # Without an image there is nothing to detect; return empty outputs
    # instead of letting the detection pipeline raise on None.
    if pil_image is None:
        return [None, "No image provided."]

    pipeline_output = od_pipe(pil_image)
    text = summarize_predictions_natural_language(pipeline_output)

    # The text-to-speech pipeline is intentionally not invoked here: its
    # output was never part of the return value, so calling it only added
    # inference latency to every request.
    processed_image = render_results_in_image(pil_image, pipeline_output)
    return [processed_image, text]

# Input widget: the uploaded image is handed to the callback as a PIL image.
_input_image = gr.Image(label="Input image", type="pil")

# Output widgets, in the same order as the list returned by
# get_pipeline_prediction: the rendered image first, then the text summary.
# (An "audio" output in place of the Textbox is a possible alternative, but
# the callback would then have to return audio data as its second item.)
_output_components = [
    gr.Image(label="Output image with predicted instances", type="pil"),
    gr.Textbox(label="Transcription", lines=3),
]

demo = gr.Interface(
    fn=get_pipeline_prediction,
    inputs=_input_image,
    outputs=_output_components,
)

# Start the Gradio server for the interface defined above.
demo.launch()