IliaLarchenko committed on
Commit 3a5dbe6
1 Parent(s): 43d5e00

Added STT streaming

Files changed (2)
  1. api/audio.py +86 -24
  2. app.py +34 -5
api/audio.py CHANGED
@@ -10,31 +10,87 @@ from openai import OpenAI
from utils.errors import APIError, AudioConversionError


-def numpy_audio_to_bytes(audio_data):
-    sample_rate = 44100
-    num_channels = 1
-    sampwidth = 2
-
-    buffer = io.BytesIO()
-    try:
-        with wave.open(buffer, "wb") as wf:
-            wf.setnchannels(num_channels)
-            wf.setsampwidth(sampwidth)
-            wf.setframerate(sample_rate)
-            wf.writeframes(audio_data.tobytes())
-    except Exception as e:
-        raise AudioConversionError(f"Error converting numpy array to audio bytes: {e}")
-    return buffer.getvalue()
-
-
class STTManager:
    def __init__(self, config):
+        self.SAMPLE_RATE = 48000
+        self.CHUNK_LENGTH = 5
+        self.STEP_LENGTH = 3
+        self.MAX_RELIABILITY_CUTOFF = self.CHUNK_LENGTH - 1
+
        self.config = config
        self.status = self.test_stt()
-        self.streaming = False
+        self.streaming = self.test_streaming()
+
+    def numpy_audio_to_bytes(self, audio_data):
+        num_channels = 1
+        sampwidth = 2

-    def speech_to_text(self, audio):
-        audio = numpy_audio_to_bytes(audio[1])
+        buffer = io.BytesIO()
+        try:
+            with wave.open(buffer, "wb") as wf:
+                wf.setnchannels(num_channels)
+                wf.setsampwidth(sampwidth)
+                wf.setframerate(self.SAMPLE_RATE)
+                wf.writeframes(audio_data.tobytes())
+        except Exception as e:
+            raise AudioConversionError(f"Error converting numpy array to audio bytes: {e}")
+        return buffer.getvalue()
+
+    def process_audio_chunk(self, audio, audio_buffer, transcript):
+        """Process streamed audio data to accumulate and transcribe with overlapping segments."""
+        audio_buffer = np.concatenate((audio_buffer, audio[1]))
+
+        if len(audio_buffer) >= self.SAMPLE_RATE * self.CHUNK_LENGTH or len(audio_buffer) % (self.SAMPLE_RATE // 2) != 0:
+            audio_bytes = self.numpy_audio_to_bytes(audio_buffer[: self.SAMPLE_RATE * self.CHUNK_LENGTH])
+            audio_buffer = audio_buffer[self.SAMPLE_RATE * self.STEP_LENGTH :]
+
+            new_transcript = self.speech_to_text_stream(audio_bytes)
+            transcript = self.merge_transcript(transcript, new_transcript)
+
+        return transcript, audio_buffer, transcript["text"]
+
+    def speech_to_text_stream(self, audio):
+        if self.config.stt.type == "HF_API":
+            raise APIError("STT Error: Streaming not supported for this STT type")
+        try:
+            data = ("temp.wav", audio, "audio/wav")
+            client = OpenAI(base_url=self.config.stt.url, api_key=self.config.stt.key)
+            transcription = client.audio.transcriptions.create(
+                model=self.config.stt.name, file=data, response_format="verbose_json", timestamp_granularities=["word"]
+            )
+        except APIError as e:
+            raise
+        except Exception as e:
+            raise APIError(f"STT Error: Unexpected error: {e}")
+        return transcription.words
+
+    def merge_transcript(self, transcript, new_transcript):
+        cut_off = transcript["last_cutoff"]
+        transcript["last_cutoff"] = self.MAX_RELIABILITY_CUTOFF - self.STEP_LENGTH
+
+        transcript["words"] = transcript["words"][: len(transcript["words"]) - transcript["not_confirmed"]]
+
+        transcript["not_confirmed"] = 0
+        first_word = True
+
+        for word_dict in new_transcript:
+            if word_dict["start"] >= cut_off:
+                if first_word:
+                    if len(transcript["words"]) > 0 and transcript["words"][-1] == word_dict["word"]:
+                        continue
+                first_word = False
+                transcript["words"].append(word_dict["word"])
+                if word_dict["start"] > self.MAX_RELIABILITY_CUTOFF:
+                    transcript["not_confirmed"] += 1
+                else:
+                    transcript["last_cutoff"] = max(1.0, word_dict["end"] - self.STEP_LENGTH)
+
+        transcript["text"] = " ".join(transcript["words"])
+
+        return transcript
+
+    def speech_to_text_full(self, audio):
+        audio = self.numpy_audio_to_bytes(audio[1])
        try:
            if self.config.stt.type == "OPENAI_API":
                data = ("temp.wav", audio, "audio/wav")
@@ -58,14 +114,20 @@ class STTManager:

    def test_stt(self):
        try:
-            self.speech_to_text((48000, np.zeros(10000)))
+            self.speech_to_text_full((48000, np.zeros(10000)))
+            return True
+        except:
+            return False
+
+    def test_streaming(self):
+        try:
+            self.speech_to_text_stream(self.numpy_audio_to_bytes(np.zeros(10000)))
            return True
        except:
            return False

-    def add_user_message(self, audio, chat_display):
-        transcription = self.speech_to_text(audio)
-        chat_display.append([transcription, None])
+    def add_user_message(self, message, chat_display):
+        chat_display.append([message, None])
        return chat_display

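For context on the constants introduced above (an explanatory sketch, not part of the commit): with CHUNK_LENGTH = 5 and STEP_LENGTH = 3 the recognizer sees overlapping 5-second windows that advance by 3 seconds, so consecutive windows share 2 seconds of audio, and words that start after MAX_RELIABILITY_CUTOFF = 4 seconds into a window are kept only provisionally until the next window confirms or replaces them. A minimal standalone re-implementation of that merge rule, using made-up word timestamps:

# Illustrative sketch only: mirrors the merge rule from merge_transcript on a plain
# dict, outside the STTManager class. Constants match the values in the diff.
CHUNK_LENGTH = 5                            # seconds of audio per transcription request
STEP_LENGTH = 3                             # seconds the buffer advances between requests
MAX_RELIABILITY_CUTOFF = CHUNK_LENGTH - 1   # words starting later are provisional

def merge(transcript, new_words):
    cut_off = transcript["last_cutoff"]
    transcript["last_cutoff"] = MAX_RELIABILITY_CUTOFF - STEP_LENGTH

    # drop words the previous window had only tentatively confirmed
    transcript["words"] = transcript["words"][: len(transcript["words"]) - transcript["not_confirmed"]]
    transcript["not_confirmed"] = 0
    first_word = True

    for w in new_words:
        if w["start"] >= cut_off:
            if first_word:
                # skip a word that the previous window already appended
                if transcript["words"] and transcript["words"][-1] == w["word"]:
                    continue
            first_word = False
            transcript["words"].append(w["word"])
            if w["start"] > MAX_RELIABILITY_CUTOFF:
                transcript["not_confirmed"] += 1
            else:
                transcript["last_cutoff"] = max(1.0, w["end"] - STEP_LENGTH)

    transcript["text"] = " ".join(transcript["words"])
    return transcript

state = {"words": [], "not_confirmed": 0, "last_cutoff": 0, "text": ""}
# hypothetical word-level timestamps for two overlapping 5 s windows
window_1 = [{"word": "hello", "start": 0.5, "end": 0.9},
            {"word": "world", "start": 4.2, "end": 4.6}]   # provisional (> 4 s)
window_2 = [{"word": "world", "start": 1.2, "end": 1.6},   # same word, shifted by 3 s
            {"word": "again", "start": 2.0, "end": 2.4}]
state = merge(state, window_1)
state = merge(state, window_2)
print(state["text"])  # -> "hello world again"

The provisional "world" from the first window (start 4.2 s, past the 4 s cutoff) is dropped on the next merge and re-added once the second window, shifted by 3 s, sees it again at 1.2 s.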
app.py CHANGED
@@ -1,6 +1,7 @@
import os

import gradio as gr
+import numpy as np

from api.audio import STTManager, TTSManager
from api.llm import LLMManager
@@ -22,6 +23,7 @@ default_audio_params = {
    "editable": False,
    "container": False,
    "show_share_button": False,
+    "streaming": stt.streaming,
}


@@ -125,13 +127,25 @@ with gr.Blocks(title="AI Interviewer") as demo:
        code = gr.Code(
            label="Please write your code here. You can use any language, but only Python syntax highlighting is available.",
            language="python",
-            lines=35,
+            lines=46,
        )
    with gr.Column(scale=1):
        end_btn = gr.Button("Finish the interview", interactive=False)
        chat = gr.Chatbot(label="Chat", show_label=False, show_share_button=False)
+        message = gr.Textbox(
+            label="Message",
+            placeholder="Your message will appear here",
+            show_label=False,
+            lines=3,
+            max_lines=3,
+            interactive=False,
+        )
+        send_btn = gr.Button("Send", interactive=False)
        audio_input = gr.Audio(interactive=False, **default_audio_params)

+    audio_buffer = gr.State(np.array([], dtype=np.int16))
+    transcript = gr.State({"words": [], "not_confirmed": 0, "last_cutoff": 0, "text": ""})
+
    with gr.Accordion("Feedback", open=True) as feedback_acc:
        feedback = gr.Markdown()

@@ -165,14 +179,29 @@ with gr.Blocks(title="AI Interviewer") as demo:
        fn=llm.end_interview, inputs=[description, chat_history], outputs=[feedback]
    )

-    audio_input.stop_recording(fn=stt.add_user_message, inputs=[audio_input, chat], outputs=[chat]).success(
-        fn=lambda: None, outputs=[audio_input]
-    ).success(
+    send_btn.click(fn=stt.add_user_message, inputs=[message, chat], outputs=[chat]).success(fn=lambda: None, outputs=[message]).success(
        fn=llm.send_request,
        inputs=[code, previous_code, chat_history, chat],
        outputs=[chat_history, chat, previous_code],
+    ).success(fn=tts.read_last_message, inputs=[chat], outputs=[audio_output]).success(
+        fn=lambda: gr.Button("Send", interactive=False), outputs=[send_btn]
    ).success(
-        fn=tts.read_last_message, inputs=[chat], outputs=[audio_output]
+        fn=lambda: np.array([], dtype=np.int16), outputs=[audio_buffer]
+    ).success(
+        fn=lambda: {"words": [], "not_confirmed": 0, "last_cutoff": 0, "text": ""}, outputs=[transcript]
    )

+    if stt.streaming:
+        audio_input.stream(
+            stt.process_audio_chunk,
+            inputs=[audio_input, audio_buffer, transcript],
+            outputs=[transcript, audio_buffer, message],
+            show_progress="hidden",
+        )
+        audio_input.stop_recording(fn=lambda: gr.Button("Send", interactive=True), outputs=[send_btn])
+    else:
+        audio_input.stop_recording(fn=stt.speech_to_text_full, inputs=[audio_input], outputs=[message]).success(
+            fn=lambda: gr.Button("Send", interactive=True), outputs=[send_btn]
+        ).success(fn=lambda: None, outputs=[audio_input])
+
demo.launch(show_api=False)
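As a usage note (a sketch under assumptions, not part of the commit): the streaming path keeps the raw samples and the partial transcript in two gr.State values that are threaded through every stream callback, and each microphone chunk arrives as a (sample_rate, numpy array) tuple. A stripped-down version of the same wiring, assuming Gradio 4.x and a stub transcriber in place of STTManager.process_audio_chunk:

# Minimal sketch of the streaming wiring, independent of this repo.
# "transcribe" is a stub standing in for STTManager.process_audio_chunk.
import gradio as gr
import numpy as np

def transcribe(audio, audio_buffer, transcript):
    # audio arrives as (sample_rate, np.ndarray); accumulate it in the gr.State buffer
    sample_rate, chunk = audio
    audio_buffer = np.concatenate((audio_buffer, chunk))
    transcript = transcript + f" [{len(audio_buffer) / sample_rate:.1f}s buffered]"
    return transcript, audio_buffer, transcript

with gr.Blocks() as demo:
    message = gr.Textbox(label="Live transcript", interactive=False)
    audio_in = gr.Audio(sources=["microphone"], streaming=True)
    audio_buffer = gr.State(np.array([], dtype=np.int16))
    transcript = gr.State("")

    # each microphone chunk updates the two State values and the visible textbox
    audio_in.stream(
        transcribe,
        inputs=[audio_in, audio_buffer, transcript],
        outputs=[transcript, audio_buffer, message],
        show_progress="hidden",
    )

demo.launch()

The real app swaps the stub for stt.process_audio_chunk, uses the transcript dict shown in the diff instead of a plain string, and only wires the stream event when stt.streaming is True.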