import gradio as gr
import numpy as np
from ultralyticsplus import YOLO
from PIL import Image
from base64 import b64encode
from io import BytesIO
from gtts import gTTS
from sahi.prediction import ObjectPrediction
from sahi.utils.cv import read_image_as_pil, visualize_object_predictions
model = YOLO('ultralyticsplus/yolov8s')
CLASS = model.model.names
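# `CLASS` maps class indices to names; for these COCO-trained weights,
# index 0 is expected to be "person", which the inference loop below relies on.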
def tts(text: str, language: str = "ja") -> str:
    """Convert text into an autoplay HTML audio snippet.

    Args:
        text (str): Text for the bot to speak.
        language (str): gTTS language code (default: Japanese).

    Returns:
        str: HTML string containing a base64-encoded autoplay <audio> element.
    """
    tts_object = gTTS(text=text, lang=language, slow=False)
    bytes_object = BytesIO()
    tts_object.write_to_fp(bytes_object)
    bytes_object.seek(0)
    b64 = b64encode(bytes_object.getvalue()).decode()
    # gTTS produces MP3 audio, so serve it with the matching MIME type.
    html = f"""
    <audio controls autoplay>
        <source src="data:audio/mpeg;base64,{b64}" type="audio/mpeg">
    </audio>
    """
    return html
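# Illustrative usage (not part of the app flow): the returned snippet can be
# passed straight to a gr.HTML output, e.g.
#   greeting_html = tts("おはようございます", language="ja")  # "Good morning"
# which renders an <audio> tag that autoplays the greeting in the browser.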
def yolov8_inference(
    image,
    area_thres: float = 0.35,
    default_bot_voice: str = "おはようございます",  # "Good morning" in Japanese
):
    """Run YOLOv8 person detection and greet when someone is close enough.

    Args:
        image: Input image (file path or array from the webcam stream).
        area_thres (float): Minimum ratio of bounding-box area to image area
            that triggers the spoken greeting.
        default_bot_voice (str): Text spoken via TTS when the threshold is met.

    Returns:
        Tuple of the rendered PIL image and the autoplay audio HTML
        (an empty string if no greeting was triggered).
    """
    # Set model parameters.
    model.overrides['conf'] = 0.25  # NMS confidence threshold
    model.overrides['iou'] = 0.45  # NMS IoU threshold
    model.overrides['agnostic_nms'] = False  # NMS class-agnostic
    model.overrides['max_det'] = 1000  # maximum number of detections per image
    results = model.predict(image, show=False)[0]
    image = read_image_as_pil(image)
    np_image = np.ascontiguousarray(image)
    boxes = results.boxes
    area_image = image.width * image.height
    object_predictions = []
    html_bot_voice = ""
    if boxes is not None:
        for xyxy, conf, cls in zip(boxes.xyxy, boxes.conf, boxes.cls):
            # Only react to the "person" class (index 0 in COCO).
            if int(cls) != 0:
                continue
            box = xyxy.tolist()
            # Ratio of the bounding-box area to the full image area; a large
            # ratio means the person is close to the camera.
            area_rate = (box[2] - box[0]) * (box[3] - box[1]) / area_image
            object_prediction = ObjectPrediction(
                bbox=box,
                category_name=CLASS[int(cls)],
                category_id=int(cls),
                score=area_rate,  # display the area ratio instead of confidence
            )
            object_predictions.append(object_prediction)
            if area_rate >= area_thres:
                html_bot_voice = tts(default_bot_voice, language="ja")
    result = visualize_object_predictions(
        image=np_image,
        object_prediction_list=object_predictions,
        rect_th=2,
        text_th=2,
    )
    return Image.fromarray(result["image"]), html_bot_voice
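# Quick local check (hypothetical image path, for illustration only):
#   rendered, audio_html = yolov8_inference("sample.jpg", area_thres=0.35)
#   rendered.show()  # boxes drawn over detected people, labeled by area ratio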
outputs = [gr.Image(type="pil", label="Robot View"),
           gr.HTML()]
title = "Detomo Aisatsu Robot"
demo_app = gr.Interface(
fn=yolov8_inference,
inputs=gr.Image(source="webcam", streaming=True, label="Input Image"),
outputs=outputs,
title=title,
live=True,
)
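# With streaming=True on the webcam input and live=True on the interface,
# Gradio re-runs yolov8_inference on each incoming frame, so the greeting
# fires as soon as a person occupies at least `area_thres` of the frame.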
demo_app.launch(debug=True)