IliaLarchenko committed
Commit: 6bb887d
Parent: da72dc0

Improved STT logic

Files changed (5):
  1. api/audio.py      +46 -13
  2. app.py            +1  -0
  3. requirements.txt  +1  -0
  4. ui/coding.py      +34 -31
  5. utils/ui.py       +2  -1
api/audio.py CHANGED

@@ -8,6 +8,28 @@ from openai import OpenAI
 
 from utils.errors import APIError, AudioConversionError
 from typing import List, Dict, Optional, Generator, Tuple
+import webrtcvad
+
+
+def detect_voice(audio: np.ndarray, sample_rate: int = 48000, frame_duration: int = 30) -> bool:
+    vad = webrtcvad.Vad()
+    vad.set_mode(3)  # Aggressiveness mode: 0 (least aggressive) to 3 (most aggressive)
+
+    # Convert numpy array to 16-bit PCM bytes
+    audio_bytes = audio.tobytes()
+
+    num_samples_per_frame = int(sample_rate * frame_duration / 1000)
+    frames = [audio_bytes[i : i + num_samples_per_frame * 2] for i in range(0, len(audio_bytes), num_samples_per_frame * 2)]
+
+    count_speech = 0
+    for frame in frames:
+        if len(frame) < num_samples_per_frame * 2:
+            continue
+        if vad.is_speech(frame, sample_rate):
+            count_speech += 1
+            if count_speech > 6:
+                return True
+    return False
 
 
 class STTManager:
@@ -42,9 +64,7 @@ class STTManager:
             raise AudioConversionError(f"Error converting numpy array to audio bytes: {e}")
         return buffer.getvalue()
 
-    def process_audio_chunk(
-        self, audio: Tuple[int, np.ndarray], audio_buffer: np.ndarray, transcript: Dict
-    ) -> Tuple[Dict, np.ndarray, str]:
+    def process_audio_chunk(self, audio: Tuple[int, np.ndarray], audio_buffer: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
         """
         Process streamed audio data to accumulate and transcribe with overlapping segments.
 
@@ -53,15 +73,26 @@ class STTManager:
         :param transcript: Current transcript dictionary.
         :return: Updated transcript, updated audio buffer, and transcript text.
         """
-        audio_buffer = np.concatenate((audio_buffer, audio[1]))
 
-        if len(audio_buffer) >= self.SAMPLE_RATE * self.CHUNK_LENGTH or len(audio_buffer) % (self.SAMPLE_RATE // 2) != 0:
-            audio_bytes = self.numpy_audio_to_bytes(audio_buffer[: self.SAMPLE_RATE * self.CHUNK_LENGTH])
-            audio_buffer = audio_buffer[self.SAMPLE_RATE * self.STEP_LENGTH :]
-            new_transcript = self.speech_to_text_stream(audio_bytes)
-            transcript = self.merge_transcript(transcript, new_transcript)
+        has_voice = detect_voice(audio[1])
+        ended = len(audio[1]) % 24000 != 0
+
+        if has_voice:
+            audio_buffer = np.concatenate((audio_buffer, audio[1]))
+
+        is_short = len(audio_buffer) / 48000 < 1.0
+
+        if is_short or (has_voice and not ended):
+            return audio_buffer, np.array([], dtype=np.int16)
+
+        return np.array([], dtype=np.int16), audio_buffer
 
-        return transcript, audio_buffer, transcript["text"]
+    def transcribe_audio(self, audio: np.ndarray, text) -> str:
+        if len(audio) < 500:
+            return text
+        else:
+            transcript = self.transcribe_numpy_array(audio, context=text)
+            return text + " " + transcript
 
     def speech_to_text_stream(self, audio: bytes) -> List[Dict[str, str]]:
         """
@@ -114,19 +145,21 @@ class STTManager:
         transcript["text"] = " ".join(transcript["words"])
         return transcript
 
-    def speech_to_text_full(self, audio: Tuple[int, np.ndarray]) -> str:
+    def transcribe_numpy_array(self, audio: np.ndarray, context: Optional[str] = None) -> str:
         """
         Convert speech to text from a full audio segment.
 
         :param audio: Tuple containing the sample rate and audio data as numpy array.
        :return: Transcribed text.
         """
-        audio_bytes = self.numpy_audio_to_bytes(audio[1])
+        audio_bytes = self.numpy_audio_to_bytes(audio)
         try:
             if self.config.stt.type == "OPENAI_API":
                 data = ("temp.wav", audio_bytes, "audio/wav")
                 client = OpenAI(base_url=self.config.stt.url, api_key=self.config.stt.key)
-                transcription = client.audio.transcriptions.create(model=self.config.stt.name, file=data, response_format="text")
+                transcription = client.audio.transcriptions.create(
+                    model=self.config.stt.name, file=data, response_format="text", prompt=context
+                )
             elif self.config.stt.type == "HF_API":
                 headers = {"Authorization": "Bearer " + self.config.stt.key}
                 response = requests.post(self.config.stt.url, headers=headers, data=audio_bytes)
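
A minimal sketch of how the new per-chunk contract is meant to be driven (my own illustration, not part of the commit; it assumes an already-initialized STTManager named `stt` and a hypothetical `stream_of_chunks` iterable of 48 kHz int16 mono chunks, such as a streaming gr.Audio component yields, matching the hard-coded 48000/24000 constants above):

    import numpy as np

    buffer = np.array([], dtype=np.int16)
    text = ""
    for sample_rate, chunk in stream_of_chunks:  # hypothetical source of (rate, int16 array) chunks
        # While speech is detected and the utterance has not ended, audio keeps accumulating
        # in `buffer` and the second return value comes back empty.
        buffer, to_transcribe = stt.process_audio_chunk((sample_rate, chunk), buffer)
        # Once the buffer is released (a pause, or the recording ends), the accumulated audio
        # is transcribed with the text so far passed as context, and appended to `text`.
        text = stt.transcribe_audio(to_transcribe, text)

Compared to the old fixed-size overlapping windows, the buffer is now cut at speech pauses detected by detect_voice, and the transcript so far is passed to the OpenAI transcription call as a prompt so the wording stays consistent across segments.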
app.py CHANGED

@@ -35,6 +35,7 @@ def main():
     """Main function to initialize services and launch the Gradio interface."""
     config, llm, tts, stt = initialize_services()
     demo = create_interface(llm, tts, stt, default_audio_params)
+    demo.config["dependencies"][0]["show_progress"] = "hidden"
     demo.launch(show_api=False)
 
requirements.txt CHANGED

@@ -2,3 +2,4 @@ gradio==4.29.0
 openai==1.19.0
 python-dotenv==1.0.1
 pytest==8.2.0
+webrtcvad==2.0.10
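
The new webrtcvad dependency only accepts 16-bit mono PCM at 8, 16, 32, or 48 kHz, sliced into 10, 20, or 30 ms frames, which is why detect_voice above slices 30 ms frames at 48 kHz. A quick sanity check of the installed package (a sketch, not part of the commit):

    import numpy as np
    import webrtcvad

    vad = webrtcvad.Vad(3)  # aggressiveness 0 (least) to 3 (most)
    sample_rate = 48000
    frame = np.zeros(int(sample_rate * 30 / 1000), dtype=np.int16)  # one 30 ms frame of silence
    print(vad.is_speech(frame.tobytes(), sample_rate))  # expected: False for silence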
ui/coding.py CHANGED

@@ -3,11 +3,14 @@ import numpy as np
 import os
 
 from itertools import chain
+import time
 
 from resources.data import fixed_messages, topic_lists
 from utils.ui import add_candidate_message, add_interviewer_message
 from typing import List, Dict, Generator, Optional, Tuple
 from functools import partial
+from api.llm import LLMManager
+from api.audio import TTSManager, STTManager
 
 
 def send_request(
@@ -15,8 +18,8 @@ def send_request(
     previous_code: str,
     chat_history: List[Dict[str, str]],
     chat_display: List[List[Optional[str]]],
-    llm,
-    tts,
+    llm: LLMManager,
+    tts: Optional[TTSManager],
     silent: Optional[bool] = False,
 ) -> Generator[Tuple[List[Dict[str, str]], List[List[Optional[str]]], str, bytes], None, None]:
     """
@@ -26,14 +29,19 @@ def send_request(
     if silent is None:
         silent = os.getenv("SILENT", False)
 
+    if chat_display[-1][0] is None and code == previous_code:
+        yield chat_history, chat_display, code, b""
+        return
+
     chat_history = llm.update_chat_history(code, previous_code, chat_history, chat_display)
     original_len = len(chat_display)
     chat_display.append([None, ""])
-    chat_history.append({"role": "assistant", "content": ""})
 
     text_chunks = []
     reply = llm.get_text(chat_history)
 
+    chat_history.append({"role": "assistant", "content": ""})
+
     audio_generator = iter(())
     has_text_item = True
     has_audio_item = not silent
@@ -99,7 +107,7 @@ def change_code_area(interview_type):
     )
 
 
-def get_problem_solving_ui(llm, tts, stt, default_audio_params, audio_output):
+def get_problem_solving_ui(llm: LLMManager, tts: TTSManager, stt: STTManager, default_audio_params: Dict, audio_output):
     send_request_partial = partial(send_request, llm=llm, tts=tts)
 
     with gr.Tab("Interview", render=False, elem_id=f"tab") as problem_tab:
@@ -178,20 +186,22 @@ def get_problem_solving_ui(llm, tts, stt, default_audio_params, audio_output):
             with gr.Column(scale=1):
                 end_btn = gr.Button("Finish the interview", interactive=False, variant="stop", elem_id=f"end_btn")
                 chat = gr.Chatbot(label="Chat", show_label=False, show_share_button=False, elem_id=f"chat")
+
+                # I need this message box only because chat component is flickering when I am updating it
+                # To be improved in the future
                 message = gr.Textbox(
                     label="Message",
                     show_label=False,
-                    lines=3,
-                    max_lines=3,
-                    interactive=True,
+                    lines=5,
+                    max_lines=5,
+                    interactive=False,
                     container=False,
                     elem_id=f"message",
                 )
-                send_btn = gr.Button("Send", interactive=False, elem_id=f"send_btn")
-                audio_input = gr.Audio(interactive=False, **default_audio_params, elem_id=f"audio_input")
 
+                audio_input = gr.Audio(interactive=False, **default_audio_params, elem_id=f"audio_input")
                 audio_buffer = gr.State(np.array([], dtype=np.int16))
-                transcript = gr.State({"words": [], "not_confirmed": 0, "last_cutoff": 0, "text": ""})
+                audio_to_transcribe = gr.State(np.array([], dtype=np.int16))
 
         with gr.Accordion("Feedback", open=True, visible=False) as feedback_acc:
             feedback = gr.Markdown(elem_id=f"feedback", line_breaks=True)
@@ -219,8 +229,8 @@ def get_problem_solving_ui(llm, tts, stt, default_audio_params, audio_output):
        ).success(
            fn=llm.init_bot, inputs=[description, interview_type_select], outputs=[chat_history]
        ).success(
-           fn=lambda: (gr.update(visible=True), gr.update(interactive=True), gr.update(interactive=True), gr.update(interactive=True)),
-           outputs=[solution_acc, end_btn, audio_input, send_btn],
+           fn=lambda: (gr.update(visible=True), gr.update(interactive=True), gr.update(interactive=True)),
+           outputs=[solution_acc, end_btn, audio_input],
        )
 
        end_btn.click(fn=lambda x: add_candidate_message("Let's stop here.", x), inputs=[chat], outputs=[chat]).success(
@@ -233,9 +243,8 @@ def get_problem_solving_ui(llm, tts, stt, default_audio_params, audio_output):
                gr.update(interactive=False),
                gr.update(open=False),
                gr.update(interactive=False),
-               gr.update(interactive=False),
            ),
-           outputs=[solution_acc, end_btn, problem_acc, audio_input, send_btn],
+           outputs=[solution_acc, end_btn, problem_acc, audio_input],
        ).success(
            fn=lambda: (gr.update(visible=True)),
            outputs=[feedback_acc],
@@ -243,32 +252,26 @@ def get_problem_solving_ui(llm, tts, stt, default_audio_params, audio_output):
            fn=llm.end_interview, inputs=[description, chat_history, interview_type_select], outputs=[feedback]
        )
 
-       send_btn.click(fn=add_candidate_message, inputs=[message, chat], outputs=[chat]).success(
-           fn=lambda: None, outputs=[message]
+       audio_input.stream(
+           stt.process_audio_chunk,
+           inputs=[audio_input, audio_buffer],
+           outputs=[audio_buffer, audio_to_transcribe],
+           show_progress="hidden",
+       ).success(fn=stt.transcribe_audio, inputs=[audio_to_transcribe, message], outputs=[message], show_progress="hidden")
+
+       # TODO: find a way to remove delay
+       audio_input.stop_recording(fn=lambda: time.sleep(2)).success(
+           fn=add_candidate_message, inputs=[message, chat], outputs=[chat]
        ).success(
            fn=send_request_partial,
            inputs=[code, previous_code, chat_history, chat],
            outputs=[chat_history, chat, previous_code, audio_output],
-       # ).success(
-       #     fn=tts.read_last_message, inputs=[chat], outputs=[audio_output]
        ).success(
            fn=lambda: np.array([], dtype=np.int16), outputs=[audio_buffer]
        ).success(
-           fn=lambda: {"words": [], "not_confirmed": 0, "last_cutoff": 0, "text": ""}, outputs=[transcript]
+           lambda: "", outputs=[message]
        )
 
-       if stt.streaming:
-           audio_input.stream(
-               stt.process_audio_chunk,
-               inputs=[audio_input, audio_buffer, transcript],
-               outputs=[transcript, audio_buffer, message],
-               show_progress="hidden",
-           )
-       else:
-           audio_input.stop_recording(fn=stt.speech_to_text_full, inputs=[audio_input], outputs=[message]).success(
-               fn=lambda: gr.update(interactive=True), outputs=[send_btn]
-           ).success(fn=lambda: None, outputs=[audio_input])
-
        interview_type_select.change(
            fn=lambda x: gr.update(choices=topic_lists[x], value=np.random.choice(topic_lists[x])),
            inputs=[interview_type_select],
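
For reference, the early-exit guard added to send_request above skips the LLM call when the candidate has neither dictated a new message nor changed the code. A small illustration of the data shapes involved (my own example, not part of the commit; chat_display rows are [candidate_message, interviewer_message] pairs as used by the Gradio Chatbot):

    chat_display = [["Hello", "Hi! Let's begin."], [None, "Please describe your approach."]]
    code = previous_code = "def solution(): ..."

    # chat_display[-1][0] is None  -> the last row holds no new candidate message
    # code == previous_code        -> the editor content has not changed either
    # In this state send_request yields the current state unchanged and returns
    # without calling the LLM or generating audio.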
utils/ui.py CHANGED

@@ -8,7 +8,8 @@ def add_interviewer_message(message):
 
 
 def add_candidate_message(message, chat):
-    chat.append((message, None))
+    if message and len(message) > 0:
+        chat.append((message, None))
     return chat
 
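
The guard added to add_candidate_message keeps empty transcriptions out of the chat. A minimal check (a sketch using plain asserts, not part of the commit):

    from utils.ui import add_candidate_message

    assert add_candidate_message("", []) == []                # empty message is ignored
    assert add_candidate_message("Hi", []) == [("Hi", None)]  # non-empty message is appended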