import gradio as gr
import numpy as np
from base64 import b64encode
from io import BytesIO

from gtts import gTTS
from PIL import Image
from sahi.prediction import ObjectPrediction
from sahi.utils.cv import read_image_as_pil, visualize_object_predictions
from ultralyticsplus import YOLO

model = YOLO('ultralyticsplus/yolov8s')
CLASS = model.model.names


def tts(text: str, language: str = "ja") -> str:
    """Converts text into an autoplaying HTML audio snippet.

    Args:
        text (str): generated answer of bot
        language (str): gTTS language code

    Returns:
        str: HTML with an autoplaying, base64-embedded mp3 audio element
    """
    tts_object = gTTS(text=text, lang=language, slow=False)
    bytes_object = BytesIO()
    tts_object.write_to_fp(bytes_object)
    bytes_object.seek(0)
    b64 = b64encode(bytes_object.getvalue()).decode()
    # embed the mp3 bytes directly so the browser autoplays the greeting
    html = f"""
    <audio autoplay="autoplay">
        <source src="data:audio/mp3;base64,{b64}" type="audio/mp3">
    </audio>
    """
    return html


def yolov8_inference(
    image,
    area_thres: float = 0.35,
    default_bot_voice: str = "おはようございます",  # "Good morning" in Japanese
):
    """YOLOv8 inference function.

    Args:
        image: input image (filepath, URL, PIL image, or numpy array)
        area_thres: minimum ratio of person-box area to frame area that
            triggers the greeting voice
        default_bot_voice: greeting text spoken when a person is close enough

    Returns:
        Tuple of (crop of the closest detected person, autoplay HTML for the
        greeting audio; the HTML is empty if nobody is close enough)
    """
    # set model parameters
    model.overrides['conf'] = 0.25  # NMS confidence threshold
    model.overrides['iou'] = 0.45  # NMS IoU threshold
    model.overrides['agnostic_nms'] = False  # NMS class-agnostic
    model.overrides['max_det'] = 1000  # maximum number of detections per image

    results = model.predict(image, show=False)[0]
    image = read_image_as_pil(image)
    np_image = np.ascontiguousarray(image)
    boxes = results.boxes
    area_image = image.width * image.height

    object_predictions = []
    html_bot_voice = ""
    most_close = 0
    out_img = image  # fall back to the full frame if no person is detected
    if boxes is not None:
        for xyxy, _conf, cls in zip(boxes.xyxy, boxes.conf, boxes.cls):
            # keep only class 0 ("person" in the COCO label set)
            if int(cls) != 0:
                continue
            box = xyxy.tolist()
            # ratio of box area to frame area, used as a proxy for how close
            # the person is to the camera
            area_rate = (box[2] - box[0]) * (box[3] - box[1]) / area_image
            object_prediction = ObjectPrediction(
                bbox=box,
                category_name=CLASS[int(cls)],
                category_id=int(cls),
                score=area_rate,  # store the area ratio, not model confidence
            )
            object_predictions.append(object_prediction)
            if area_rate >= most_close:
                # crop the closest person and scale the crop back to frame size
                out_img = image.crop(tuple(box)).resize((image.width, image.height))
                most_close = area_rate
        # greet once, after the loop, to avoid redundant gTTS network calls
        if most_close >= area_thres:
            html_bot_voice = tts(default_bot_voice, language="ja")

    # Alternative output: draw all person boxes on the full frame instead of
    # cropping the closest one.
    # result = visualize_object_predictions(
    #     image=np_image,
    #     object_prediction_list=object_predictions,
    #     rect_th=2,
    #     text_th=2,
    # )
    # return Image.fromarray(result["image"]), html_bot_voice
    return out_img, html_bot_voice


outputs = [gr.Image(type="pil", label="Robot View"), gr.HTML()]
title = "Detomo Aisatsu Robot"
demo_app = gr.Interface(
    fn=yolov8_inference,
    inputs=gr.Image(source="webcam", streaming=True, label="Input Image"),
    outputs=outputs,
    title=title,
    live=True,
)
demo_app.launch(debug=True)
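
# A minimal offline smoke test for yolov8_inference, assuming a local image
# file named "sample.jpg" (hypothetical path) that contains at least one
# person. To try it without a webcam, comment out the launch() call above
# and run:
#
#     out_img, voice_html = yolov8_inference("sample.jpg", area_thres=0.35)
#     out_img.save("closest_person.jpg")
#     print("greeting triggered:", bool(voice_html))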