import gradio as gr
from scipy.spatial import distance as dist
from ultralyticsplus import YOLO

from utils import tts, base64_to_pil, get_hist

# Person detector; CLASS maps class ids to names (class 0 is "person").
model = YOLO('ultralyticsplus/yolov8s')
CLASS = model.model.names

default_bot_voice = "おはようございます"  # "Good morning" in Japanese
area_thres = 0.3  # minimum fraction of the frame a person must occupy
diff_thres = 0.5  # minimum histogram distance to count as a new person


def infer(image, last_seen):
    results = model.predict(image, show=False)[0]
    boxes = results.boxes
    area_image = image.width * image.height
    voice_bot = None
    most_close = 0
    out_img = None
    diff_value = diff_thres
    if boxes is not None:
        # Keep a crop of the largest detected person (class 0).
        for xyxy, conf, cls in zip(boxes.xyxy, boxes.conf, boxes.cls):
            if int(cls) != 0:
                continue
            box = xyxy.tolist()
            area_rate = (box[2] - box[0]) * (box[3] - box[1]) / area_image
            if area_rate >= most_close:
                out_img = image.crop(tuple(box)).resize((64, 64))
                most_close = area_rate
    if last_seen != "":
        # Compare color histograms with the previously seen person so the
        # bot does not greet the same person twice in a row.
        last_seen = base64_to_pil(last_seen)
        if out_img is not None:
            diff_value = dist.euclidean(get_hist(out_img), get_hist(last_seen))
    print(most_close, diff_value)
    # Greet only when the person is close enough and different from last time.
    if most_close >= area_thres and diff_value >= diff_thres:
        voice_bot = tts(default_bot_voice, language="ja")
    return out_img, voice_bot


iface = gr.Interface(
    fn=infer,
    title="aisatsu api",
    inputs=[
        gr.Image(label="image", type="pil", shape=(320, 320)),
        gr.Textbox(label="last seen", value=""),
    ],
    outputs=[
        gr.Image(label="output image"),
        gr.Textbox(label="output voice"),
    ],
    article="Author: Vu Minh Chien.",
)
iface.launch(enable_queue=True, debug=True)
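
# ---------------------------------------------------------------------------
# The `utils` module imported above is not included in this file. A minimal
# sketch of the three helpers this script relies on follows, kept commented
# out so it never shadows the real module. It assumes `get_hist` returns a
# normalized color histogram, `base64_to_pil` decodes a base64 string into a
# PIL image, and `tts` returns gTTS audio as a base64 string; the actual
# implementations in `utils` may differ.
#
# import base64
# from io import BytesIO
#
# import numpy as np
# from gtts import gTTS
# from PIL import Image
#
# def get_hist(image):
#     # Flattened per-channel histogram, normalized so that crops of
#     # different sizes are comparable with a Euclidean distance.
#     hist = np.asarray(image.histogram(), dtype=np.float64)
#     return hist / (hist.sum() + 1e-8)
#
# def base64_to_pil(data):
#     # Decode a base64-encoded image string back into a PIL image.
#     return Image.open(BytesIO(base64.b64decode(data)))
#
# def tts(text, language="ja"):
#     # Synthesize speech with gTTS and return the MP3 bytes base64-encoded,
#     # so the audio can travel through the Gradio Textbox output.
#     buffer = BytesIO()
#     gTTS(text=text, lang=language).write_to_fp(buffer)
#     return base64.b64encode(buffer.getvalue()).decode("utf-8")
# ---------------------------------------------------------------------------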