import gradio as gr
import numpy as np
from base64 import b64encode
from io import BytesIO

from gtts import gTTS
from PIL import Image
from sahi.prediction import ObjectPrediction
from sahi.utils.cv import read_image_as_pil, visualize_object_predictions
from ultralyticsplus import YOLO

model = YOLO('ultralyticsplus/yolov8s')
CLASS = model.model.names


def tts(text: str, language: str = "ja") -> str:
    """Converts text into an autoplaying HTML audio snippet.

    Args:
        text (str): generated answer of bot
        language (str): gTTS language code

    Returns:
        str: HTML with an autoplaying, base64-embedded mp3 audio element
    """
    tts_object = gTTS(text=text, lang=language, slow=False)
    bytes_object = BytesIO()
    tts_object.write_to_fp(bytes_object)
    bytes_object.seek(0)
    b64 = b64encode(bytes_object.getvalue()).decode()
    # embed the mp3 bytes directly so the browser autoplays the greeting
    html = f"""
    <audio autoplay="autoplay">
        <source src="data:audio/mp3;base64,{b64}" type="audio/mp3">
    </audio>
    """
    return html


def yolov8_inference(
    image,
    area_thres: float = 0.35,
    default_bot_voice: str = "おはようございます",  # "Good morning" in Japanese
):
    """YOLOv8 inference function.

    Args:
        image: input image (filepath, URL, PIL image, or numpy array)
        area_thres: minimum ratio of person-box area to frame area that
            triggers the greeting voice
        default_bot_voice: greeting text spoken when a person is close enough

    Returns:
        Tuple of (crop of the closest detected person, autoplay HTML for the
        greeting audio; the HTML is empty if nobody is close enough)
    """
    # set model parameters
    model.overrides['conf'] = 0.25  # NMS confidence threshold
    model.overrides['iou'] = 0.45  # NMS IoU threshold
    model.overrides['agnostic_nms'] = False  # NMS class-agnostic
    model.overrides['max_det'] = 1000  # maximum number of detections per image

    results = model.predict(image, show=False)[0]
    image = read_image_as_pil(image)
    np_image = np.ascontiguousarray(image)
    boxes = results.boxes
    area_image = image.width * image.height

    object_predictions = []
    html_bot_voice = ""
    most_close = 0
    out_img = image  # fall back to the full frame if no person is detected
    if boxes is not None:
        for xyxy, _conf, cls in zip(boxes.xyxy, boxes.conf, boxes.cls):
            # keep only class 0 ("person" in the COCO label set)
            if int(cls) != 0:
                continue
            box = xyxy.tolist()
            # ratio of box area to frame area, used as a proxy for how close
            # the person is to the camera
            area_rate = (box[2] - box[0]) * (box[3] - box[1]) / area_image
            object_prediction = ObjectPrediction(
                bbox=box,
                category_name=CLASS[int(cls)],
                category_id=int(cls),
                score=area_rate,  # store the area ratio, not model confidence
            )
            object_predictions.append(object_prediction)
            if area_rate >= most_close:
                # crop the closest person and scale the crop back to frame size
                out_img = image.crop(tuple(box)).resize((image.width, image.height))
                most_close = area_rate
        # greet once, after the loop, to avoid redundant gTTS network calls
        if most_close >= area_thres:
            html_bot_voice = tts(default_bot_voice, language="ja")

    # Alternative output: draw all person boxes on the full frame instead of
    # cropping the closest one.
    # result = visualize_object_predictions(
    #     image=np_image,
    #     object_prediction_list=object_predictions,
    #     rect_th=2,
    #     text_th=2,
    # )
    # return Image.fromarray(result["image"]), html_bot_voice
    return out_img, html_bot_voice


outputs = [gr.Image(type="pil", label="Robot View"), gr.HTML()]
title = "Detomo Aisatsu Robot"
demo_app = gr.Interface(
    fn=yolov8_inference,
    inputs=gr.Image(source="webcam", streaming=True, label="Input Image"),
    outputs=outputs,
    title=title,
    live=True,
)
demo_app.launch(debug=True)
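
# A minimal offline smoke test for yolov8_inference, assuming a local image
# file named "sample.jpg" (hypothetical path) that contains at least one
# person. To try it without a webcam, comment out the launch() call above
# and run:
#
#     out_img, voice_html = yolov8_inference("sample.jpg", area_thres=0.35)
#     out_img.save("closest_person.jpg")
#     print("greeting triggered:", bool(voice_html))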