vumichien committed
Commit 1352988
1 Parent(s): cd09ca8

Update main.py

Files changed (1):
  1. main.py +76 -34
main.py CHANGED
@@ -1,29 +1,31 @@
-import time
-
-from ultralytics import YOLO
-from base64 import b64encode
-from speech_recognition import AudioFile, Recognizer
-import numpy as np
+from ultralyticsplus import YOLO
+from typing import Optional, Union
+
 from scipy.spatial import distance as dist
-from typing import Union, Optional
-
+import time
 from fastapi import FastAPI, File, UploadFile, Form
 from fastapi.responses import StreamingResponse
 from fastapi.middleware.gzip import GZipMiddleware
-
-from utils import tts, read_image_file, pil_to_base64, base64_to_pil, get_hist
-from huggingface_hub import hf_hub_download
-
 from io import BytesIO
+from utils import tts, stt, read_image_file, pil_to_base64, base64_to_pil, get_hist, ffmpeg_read
 import zipfile
+import soundfile as sf
+import openai
+import os  # needed for os.environ below

-model_path = hf_hub_download(repo_id="ultralyticsplus/yolov8s", filename='yolov8s.pt')
-model = YOLO(model_path)
-
+# Config for camera picture
+model = YOLO('ultralyticsplus/yolov8s')
 CLASS = model.model.names
+ZIP = False
 default_bot_voice = "おはいようございます"
 area_threshold = 0.3
-ZIP = False
+
+# Config for human input
+prompt_template = "私はあなたに、Detomo社が作ったロボットのように振る舞ってほしいです。あなたの名前はアイサツです。" \
+                  "あなたのミッションは、子供たちが他の子供たちに挨拶する自信を持ち、幸せになることを助けることです。" \
+                  "質問には簡単な方法でしか答えないようにし、明示的に要求されない限り、追加情報を提供しないでください。"
+system_prompt = [{"role": "system", "content": prompt_template}]
+openai.api_key = os.environ["OPENAI_API_KEY"]

 app = FastAPI()
 app.add_middleware(GZipMiddleware, minimum_size=1000)
@@ -34,15 +36,18 @@ def read_root():
     return {"Message": "Application startup complete"}


-@app.post("/human_detect/")
-async def predict_api(
+@app.get("/client_settings/")
+def client_settings_api():
+    return {"camera_picture_period": 5}
+
+
+@app.post("/camera_picture/")
+async def camera_picture_api(
     file: UploadFile = File(...),
-    # last_seen: Union[UploadFile, None] = File(None),
-    last_seen: Optional[str] = Form(None),
+    last_seen: Optional[Union[str, UploadFile]] = Form(None),
 ):
     # parameters
     total_time = time.time()
-    start_time = time.time()
     most_close = 0
     out_img = None
     diff_value = 0.5
@@ -63,27 +68,34 @@ async def predict_api(
         if area_rate >= most_close:
             out_img = image.crop(tuple(box)).resize((64, 64))
             most_close = area_rate
-    print("Get face time", time.time() - start_time)
+
+    # check detect people or not
+    if out_img is None:
+        return {
+            "status": "No face detected",
+            "text": None,
+            "voice": None,
+            "image": None
+        }
+    else:
+        if ZIP:
+            image_bot_path = pil_to_base64(out_img, encode=False)
+        else:
+            image_bot_path = pil_to_base64(out_img, encode=True)

     # check with previous image if have
-    start_time = time.time()
     if last_seen is not None:
         if type(last_seen) == str:
             last_seen = base64_to_pil(last_seen)
         else:
             last_seen = read_image_file(await last_seen.read())
-        if out_img is not None:
-            diff_value = dist.euclidean(get_hist(out_img), get_hist(last_seen))
-        print("Hist time", time.time() - start_time)
-
+        diff_value = dist.euclidean(get_hist(out_img), get_hist(last_seen))
+        print(f"Distance: {most_close}. Different value: {diff_value}")
+
     # return results
-    start_time = time.time()
-    print(f"Distance: {most_close}. Different value: {diff_value}")
     if most_close >= area_threshold and diff_value >= 0.5:
         if ZIP:
-            voice_bot_path = tts(default_bot_voice, language="ja")
-            image_bot_path = pil_to_base64(out_img)
-            print("Voice time", time.time() - start_time)
+            voice_bot_path = tts(default_bot_voice, language="ja", encode=False)
             io = BytesIO()
             zip_filename = "final_archive.zip"
             with zipfile.ZipFile(io, mode='w', compression=zipfile.ZIP_DEFLATED) as zf:
@@ -98,12 +110,43 @@ async def predict_api(
             )
         else:
             voice_bot_path = tts(default_bot_voice, language="ja", encode=True)
-            image_bot_path = pil_to_base64(out_img, encode=True)
-            print("Voice time", time.time() - start_time)
         print("Total time", time.time() - total_time)
         return {
+            "status": "New people",
+            "text": default_bot_voice,
             "voice": voice_bot_path,
             "image": image_bot_path
         }
     else:
-        return {"message": "No face detected"}
+        print("Total time", time.time() - total_time)
+        return {
+            "status": "Old people",
+            "text": None,
+            "voice": None,
+            "image": image_bot_path,
+        }
+
+
+@app.post("/human_input/")
+async def human_input_api(
+    input_data: Union[str, bytes],
+    temperature: float = 0.7,
+    max_tokens: int = 1000,
+):
+    print("Input data type", type(input_data))
+    if type(input_data) != str:
+        upload_audio = ffmpeg_read(input_data, sampling_rate=24000)
+        sf.write('temp.wav', upload_audio, 24000, subtype='PCM_16')
+        text = stt('temp.wav')
+    else:
+        text = input_data
+    prompt_msg = {"role": "user", "content": text}
+    messages = system_prompt + [prompt_msg]
+    completion = openai.ChatCompletion.create(model="gpt-3.5-turbo", messages=messages, temperature=temperature,
+                                              max_tokens=max_tokens)
+    print(completion['usage']['total_tokens'])
+    return {
+        "human_text": str(text),
+        "robot_text": completion.choices[0].message.content,
+        "robot_voice": tts(completion.choices[0].message.content, language="ja", encode=True)
+    }
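
For reference, a minimal client sketch for the renamed endpoint. It is an illustration only: the base URL, frame filename, and polling loop are assumptions, while the field names (file, last_seen), the /client_settings/ period, and the response keys (status, text, voice, image) come from the diff above. Note that default_bot_voice is Japanese for "good morning", so a "New people" response carries that greeting as text plus its TTS audio.

    # Hypothetical client for /camera_picture/ (not part of this commit).
    # Assumes the app is served at localhost:8000 with ZIP = False, so the
    # endpoint answers with JSON rather than a zip archive.
    import time
    from typing import Optional

    import requests

    BASE_URL = "http://localhost:8000"  # assumed deployment URL


    def poll_camera(frame_path: str, last_seen: Optional[str] = None) -> Optional[str]:
        """Send one camera frame; return the cropped face as base64, if any."""
        with open(frame_path, "rb") as f:
            resp = requests.post(
                f"{BASE_URL}/camera_picture/",
                files={"file": f},
                # the previous crop goes back as a base64 form field
                data={"last_seen": last_seen} if last_seen else None,
            )
        resp.raise_for_status()
        body = resp.json()
        print(body["status"])  # "No face detected" | "New people" | "Old people"
        if body["status"] == "New people":
            print("bot says:", body["text"])  # the default_bot_voice greeting
        return body["image"]


    if __name__ == "__main__":
        period = requests.get(f"{BASE_URL}/client_settings/").json()["camera_picture_period"]
        last = None
        while True:
            last = poll_camera("frame.jpg", last) or last  # keep old crop on "No face"
            time.sleep(period)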
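
The new-vs-old decision rides on get_hist from the local utils module, which this diff imports but does not show. A plausible stand-in is sketched below so the comparison logic is readable; the binning and scaling are guesses, and they determine how the hard-coded diff_value >= 0.5 threshold behaves in practice.

    # Hypothetical stand-in for utils.get_hist -- NOT the repo's actual helper.
    # Produces a flat per-channel color histogram; camera_picture_api feeds two
    # of these vectors to scipy's dist.euclidean and compares against 0.5.
    import numpy as np
    from PIL import Image


    def get_hist(image: Image.Image, bins: int = 16) -> np.ndarray:
        arr = np.asarray(image.convert("RGB"))
        # one histogram per RGB channel, concatenated into a single vector
        hist = np.concatenate(
            [np.histogram(arr[..., c], bins=bins, range=(0, 255))[0] for c in range(3)]
        ).astype(np.float64)
        # the scale is a guess; whatever utils does, it must leave distances
        # between "same person" crops below 0.5 and "new person" crops above it
        return hist / hist.max()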
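
ZIP stays False in this commit, but when flipped the endpoint streams final_archive.zip through StreamingResponse (with encode=False payloads) instead of returning JSON. A client-side sketch for that mode, with the usual caveats: the URL is assumed, and the archive's member names are not visible in these hunks, so they are enumerated rather than assumed.

    # Hypothetical handling of the ZIP = True branch (not part of this commit).
    import io
    import zipfile

    import requests

    with open("frame.jpg", "rb") as f:
        resp = requests.post(
            "http://localhost:8000/camera_picture/",  # assumed URL
            files={"file": f},
        )
    resp.raise_for_status()

    # GZipMiddleware is transparent here: requests decompresses automatically
    archive = zipfile.ZipFile(io.BytesIO(resp.content))
    for name in archive.namelist():
        print(f"{name}: {len(archive.read(name))} bytes")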
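
On the /human_input/ side, the Japanese system prompt tells the model to act as "Aisatsu", a robot made by Detomo whose mission is to help children become confident and happy greeting other children, answering only simply and adding nothing unless explicitly asked. Audio input is resampled to 24 kHz, written to temp.wav, and transcribed by stt from utils, which this diff also does not show. Since the file already targets the pre-1.0 openai SDK (openai.ChatCompletion.create), one plausible guess is Whisper through that same SDK:

    # Hypothetical stand-in for utils.stt -- NOT the repo's actual helper.
    # Matches the pre-1.0 openai SDK style used by human_input_api above.
    import openai


    def stt(audio_path: str, language: str = "ja") -> str:
        """Transcribe a WAV file (e.g. the temp.wav written by human_input_api)."""
        with open(audio_path, "rb") as f:
            result = openai.Audio.transcribe("whisper-1", f, language=language)
        return result["text"]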