# Aisatsu-robot / app.py
import gradio as gr
import numpy as np
from base64 import b64encode
from io import BytesIO

from gtts import gTTS
from PIL import Image
from sahi.prediction import ObjectPrediction
from sahi.utils.cv import read_image_as_pil, visualize_object_predictions
from ultralyticsplus import YOLO
model = YOLO('ultralyticsplus/yolov8s')
CLASS = model.model.names
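
# CLASS maps class ids to names from the model config; with these
# COCO-pretrained weights, CLASS[0] == "person", the only class the
# inference loop below reacts to.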
def tts(text: str, language: str = "ja") -> str:
    """Convert text into an autoplaying HTML audio snippet.

    Args:
        text (str): text for the bot to speak
        language (str): gTTS language code (defaults to Japanese)

    Returns:
        str: HTML string containing an autoplaying <audio> element
    """
tts_object = gTTS(text=text, lang=language, slow=False)
bytes_object = BytesIO()
tts_object.write_to_fp(bytes_object)
bytes_object.seek(0)
b64 = b64encode(bytes_object.getvalue()).decode()
    # gTTS writes MP3 audio, so label the data URI as audio/mpeg.
    html = f"""
    <audio controls autoplay>
        <source src="data:audio/mpeg;base64,{b64}" type="audio/mpeg">
    </audio>
    """
return html
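
# Minimal usage sketch: tts() returns an HTML string that the gr.HTML output
# below renders, and the browser autoplays the embedded audio, e.g.:
#   html = tts("こんにちは")  # "hello"
#   html.strip().startswith("<audio")  # -> True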
def yolov8_inference(
    image,
    area_thres: float = 0.35,
    default_bot_voice: str = "おはようございます",  # "good morning"
):
    """Run YOLOv8 person detection and greet the closest person.

    Args:
        image: input image (filepath or array accepted by the model)
        area_thres (float): box-to-frame area ratio above which the
            greeting audio is triggered
        default_bot_voice (str): greeting text to synthesize with tts()

    Returns:
        A crop of the closest detected person, resized to the frame size,
        and the greeting audio HTML (empty string if nobody is close enough)
    """
    # set model parameters
    model.overrides['conf'] = 0.25  # NMS confidence threshold
    model.overrides['iou'] = 0.45  # NMS IoU threshold
    model.overrides['agnostic_nms'] = False  # NMS class-agnostic
    model.overrides['max_det'] = 1000  # maximum number of detections per image
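    # Note: `model` is module-level, so these overrides persist across calls.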
    results = model.predict(image, show=False)[0]
    image = read_image_as_pil(image)
    np_image = np.ascontiguousarray(image)
    boxes = results.boxes
    area_image = image.width * image.height
    object_predictions = []
    html_bot_voice = ""
    most_close = 0
    out_img = image  # fall back to the full frame if no person is detected
    if boxes is not None:
        det_ind = 0
        for xyxy, conf, cls in zip(boxes.xyxy, boxes.conf, boxes.cls):
            # Only react to the "person" class (id 0).
            if int(cls) != 0:
                continue
            box = xyxy.tolist()
            # Box-to-frame area ratio: a rough proxy for how close the person is.
            area_rate = (box[2] - box[0]) * (box[3] - box[1]) / area_image
            object_prediction = ObjectPrediction(
                bbox=box,
                category_name=CLASS[int(cls)],
                category_id=int(cls),
                score=area_rate,  # store the area ratio, not the model confidence
            )
            object_predictions.append(object_prediction)
            det_ind += 1
            # Keep a crop of the closest person seen so far.
            if area_rate >= most_close:
                out_img = image.crop(tuple(box)).resize((image.width, image.height))
                most_close = area_rate
            # Play the greeting once someone is close enough.
            if area_rate >= area_thres:
                html_bot_voice = tts(default_bot_voice, language="ja")
    # Alternative output: the full frame annotated with all detections,
    # instead of the cropped closest person.
    # result = visualize_object_predictions(
    #     image=np_image,
    #     object_prediction_list=object_predictions,
    #     rect_th=2,
    #     text_th=2,
    # )
    # return Image.fromarray(result["image"]), html_bot_voice
return out_img, html_bot_voice
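
# A quick standalone check (a sketch; "person.jpg" is a hypothetical local image):
#   cropped, audio_html = yolov8_inference("person.jpg")
#   cropped.save("closest_person.jpg")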
outputs = [
    gr.Image(type="pil", label="Robot View"),  # the function returns a PIL image
    gr.HTML(),
]
title = "Detomo Aisatsu Robot"
demo_app = gr.Interface(
fn=yolov8_inference,
inputs=gr.Image(source="webcam", streaming=True, label="Input Image"),
outputs=outputs,
title=title,
live=True,
)
demo_app.launch(debug=True)