# Aisatsu-robot / app.py
import gradio as gr
import numpy as np
from base64 import b64encode
from io import BytesIO

from gtts import gTTS
from PIL import Image
from sahi.prediction import ObjectPrediction
from sahi.utils.cv import read_image_as_pil, visualize_object_predictions
from ultralyticsplus import YOLO
model = YOLO('ultralyticsplus/yolov8s')
CLASS = model.model.names
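
# CLASS maps class ids to names from the model config; with these
# COCO-pretrained weights, CLASS[0] == "person", the only class the
# inference loop below reacts to.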
def tts(text: str, language: str = "ja") -> str:
    """Convert text into an autoplaying HTML audio snippet.

    Args:
        text (str): text for the bot to speak
        language (str): gTTS language code (defaults to Japanese)

    Returns:
        str: HTML string containing an autoplaying <audio> element
    """
tts_object = gTTS(text=text, lang=language, slow=False)
bytes_object = BytesIO()
tts_object.write_to_fp(bytes_object)
bytes_object.seek(0)
b64 = b64encode(bytes_object.getvalue()).decode()
    # gTTS writes MP3 audio, so label the data URI as audio/mpeg.
    html = f"""
    <audio controls autoplay>
        <source src="data:audio/mpeg;base64,{b64}" type="audio/mpeg">
    </audio>
    """
return html
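
# Minimal usage sketch: tts() returns an HTML string that the gr.HTML output
# below renders, and the browser autoplays the embedded audio, e.g.:
#   html = tts("こんにちは")  # "hello"
#   html.strip().startswith("<audio")  # -> True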
def yolov8_inference(
    image,
    area_thres: float = 0.35,
    default_bot_voice: str = "おはようございます",  # "good morning"
):
    """Run YOLOv8 person detection and greet the closest person.

    Args:
        image: input image (filepath or array accepted by the model)
        area_thres (float): box-to-frame area ratio above which the
            greeting audio is triggered
        default_bot_voice (str): greeting text to synthesize with tts()

    Returns:
        A crop of the closest detected person, resized to the frame size,
        and the greeting audio HTML (empty string if nobody is close enough)
    """
    # set model parameters
    model.overrides['conf'] = 0.25  # NMS confidence threshold
    model.overrides['iou'] = 0.45  # NMS IoU threshold
    model.overrides['agnostic_nms'] = False  # NMS class-agnostic
    model.overrides['max_det'] = 1000  # maximum number of detections per image
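    # Note: `model` is module-level, so these overrides persist across calls.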
    results = model.predict(image, show=False)[0]
    image = read_image_as_pil(image)
    np_image = np.ascontiguousarray(image)
    boxes = results.boxes
    area_image = image.width * image.height
    object_predictions = []
    html_bot_voice = ""
    most_close = 0
    out_img = image  # fall back to the full frame if no person is detected
    if boxes is not None:
        det_ind = 0
        for xyxy, conf, cls in zip(boxes.xyxy, boxes.conf, boxes.cls):
            # Only react to the "person" class (id 0).
            if int(cls) != 0:
                continue
            box = xyxy.tolist()
            # Box-to-frame area ratio: a rough proxy for how close the person is.
            area_rate = (box[2] - box[0]) * (box[3] - box[1]) / area_image
            object_prediction = ObjectPrediction(
                bbox=box,
                category_name=CLASS[int(cls)],
                category_id=int(cls),
                score=area_rate,  # store the area ratio, not the model confidence
            )
            object_predictions.append(object_prediction)
            det_ind += 1
            # Keep a crop of the closest person seen so far.
            if area_rate >= most_close:
                out_img = image.crop(tuple(box)).resize((image.width, image.height))
                most_close = area_rate
            # Play the greeting once someone is close enough.
            if area_rate >= area_thres:
                html_bot_voice = tts(default_bot_voice, language="ja")
    # Alternative output: the full frame annotated with all detections,
    # instead of the cropped closest person.
    # result = visualize_object_predictions(
    #     image=np_image,
    #     object_prediction_list=object_predictions,
    #     rect_th=2,
    #     text_th=2,
    # )
    # return Image.fromarray(result["image"]), html_bot_voice
return out_img, html_bot_voice
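
# A quick standalone check (a sketch; "person.jpg" is a hypothetical local image):
#   cropped, audio_html = yolov8_inference("person.jpg")
#   cropped.save("closest_person.jpg")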
outputs = [
    gr.Image(type="pil", label="Robot View"),  # the function returns a PIL image
    gr.HTML(),
]
title = "Detomo Aisatsu Robot"
demo_app = gr.Interface(
fn=yolov8_inference,
inputs=gr.Image(source="webcam", streaming=True, label="Input Image"),
outputs=outputs,
title=title,
live=True,
)
demo_app.launch(debug=True)