Adipta committed
Commit 27c3220 · verified · 1 Parent(s): b62e489
Files changed (7)
  1. app.py +51 -0
  2. langchain_client.py +62 -0
  3. microphone.py +105 -0
  4. packages.txt +1 -0
  5. requirements.txt +9 -0
  6. tts.py +27 -0
  7. utils.py +35 -0
app.py ADDED
@@ -0,0 +1,51 @@
+ import gradio as gr
+
+ from google.cloud import speech
+ from microphone import MicrophoneStream
+ from utils import listen_print_loop
+
+
+ # Audio recording parameters
+ RATE = 16000
+ CHUNK = int(RATE / 10)  # 100ms
+ LANGUAGE = "id-ID"
+
+ transcribe_client = speech.SpeechClient()
+ config = speech.RecognitionConfig(
+     encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
+     sample_rate_hertz=RATE,
+     language_code=LANGUAGE,
+ )
+
+ streaming_config = speech.StreamingRecognitionConfig(
+     config=config, interim_results=True
+ )
+
+ async def main(audio):
+
+     print("Streaming started ...")
+
+     with MicrophoneStream(RATE, CHUNK) as stream:
+         audio_generator = stream.generator()
+         requests = (
+             speech.StreamingRecognizeRequest(audio_content=content)
+             for content in audio_generator
+         )
+
+         responses = transcribe_client.streaming_recognize(streaming_config, requests)
+
+         return await listen_print_loop(responses)
+
+ demo = gr.Interface(
+     fn=main,
+     inputs=[
+         gr.Audio(sources="microphone", streaming=True, label="Input Speech")
+     ],
+     outputs=[
+         gr.Textbox(label="Transcription"),
+         gr.Audio(label="Audio")
+     ],
+     live=True)
+
+ if __name__ == "__main__":
+     demo.launch()
langchain_client.py ADDED
@@ -0,0 +1,62 @@
+ import os
+
+ from langchain_openai import ChatOpenAI
+ from langchain_core.output_parsers import StrOutputParser
+ from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
+ from langchain_community.chat_message_histories import ChatMessageHistory
+ from langchain_core.chat_history import BaseChatMessageHistory
+ from langchain_core.runnables.history import RunnableWithMessageHistory
+
+ OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
+
+ class LangchainClient:
+
+     def __init__(self):
+         self.llm = ChatOpenAI(
+             openai_api_key=OPENAI_API_KEY,
+             temperature=0,
+             model_name='gpt-4o'
+         )
+         self.store = {}
+
+     def create_prompt(self):
+         template_prompt = """You are a chatbot that can answer questions in English and Bahasa Indonesia.
+         answer using language from user, if user use bahasa indonesia answer in bahasa indonesia.
+         if user language is english answer in english"""
+
+         prompt = ChatPromptTemplate.from_messages(
+             [
+                 (
+                     "system",
+                     template_prompt,
+                 ),
+                 MessagesPlaceholder(variable_name="history"),
+                 ("human", "{question}"),
+             ]
+         )
+
+         return prompt
+
+     def get_session_history(self, session_id: str) -> BaseChatMessageHistory:
+         if session_id not in self.store:
+             self.store[session_id] = ChatMessageHistory()
+         return self.store[session_id]
+
+     def create_model(self):
+         prompt = self.create_prompt()
+         parser = StrOutputParser()
+         conversation_chain = prompt | self.llm | parser
+         conversation_chain_history = RunnableWithMessageHistory(
+             conversation_chain,
+             self.get_session_history,
+             input_messages_key="question",
+             history_messages_key="history",
+         )
+         return conversation_chain_history
+
+     def invoke_llm(self, model, text):
+         response = model.invoke(
+             {"question": text},
+             config={"configurable": {"session_id": "default"}}
+         )
+         return response
microphone.py ADDED
@@ -0,0 +1,105 @@
+ import queue
+ import pyaudio
+
+ # Audio recording parameters
+ RATE = 16000
+ CHUNK = int(RATE / 10)  # 100ms
+
+ class MicrophoneStream:
+     """Opens a recording stream as a generator yielding the audio chunks."""
+
+     def __init__(self: object, rate: int = RATE, chunk: int = CHUNK) -> None:
+         """The audio -- and generator -- is guaranteed to be on the main thread."""
+         self._rate = rate
+         self._chunk = chunk
+
+         # Create a thread-safe buffer of audio data
+         self._buff = queue.Queue()
+         self.closed = True
+
+     def __enter__(self: object) -> object:
+         self._audio_interface = pyaudio.PyAudio()
+         self._audio_stream = self._audio_interface.open(
+             format=pyaudio.paInt16,
+             # The API currently only supports 1-channel (mono) audio
+             # https://goo.gl/z757pE
+             channels=1,
+             rate=self._rate,
+             input=True,
+             frames_per_buffer=self._chunk,
+             # Run the audio stream asynchronously to fill the buffer object.
+             # This is necessary so that the input device's buffer doesn't
+             # overflow while the calling thread makes network requests, etc.
+             stream_callback=self._fill_buffer,
+             input_device_index=1
+         )
+
+         self.closed = False
+
+         return self
+
+     def __exit__(
+         self: object,
+         type: object,
+         value: object,
+         traceback: object,
+     ) -> None:
+         """Closes the stream, regardless of whether the connection was lost or not."""
+         self._audio_stream.stop_stream()
+         self._audio_stream.close()
+         self.closed = True
+         # Signal the generator to terminate so that the client's
+         # streaming_recognize method will not block the process termination.
+         self._buff.put(None)
+         self._audio_interface.terminate()
+
+     def _fill_buffer(
+         self: object,
+         in_data: object,
+         frame_count: int,
+         time_info: object,
+         status_flags: object,
+     ) -> object:
+         """Continuously collect data from the audio stream, into the buffer.
+
+         Args:
+             in_data: The audio data as a bytes object
+             frame_count: The number of frames captured
+             time_info: The time information
+             status_flags: The status flags
+
+         Returns:
+             The audio data as a bytes object
+         """
+         self._buff.put(in_data)
+         return None, pyaudio.paContinue
+
+     def generator(self: object) -> object:
+         """Generates audio chunks from the stream of audio data in chunks.
+
+         Args:
+             self: The MicrophoneStream object
+
+         Returns:
+             A generator that outputs audio chunks.
+         """
+         while not self.closed:
+             # Use a blocking get() to ensure there's at least one chunk of
+             # data, and stop iteration if the chunk is None, indicating the
+             # end of the audio stream.
+             chunk = self._buff.get()
+             if chunk is None:
+                 return
+             data = [chunk]
+
+             # Now consume whatever other data's still buffered.
+             while True:
+                 try:
+                     chunk = self._buff.get(block=False)
+                     if chunk is None:
+                         return
+                     data.append(chunk)
+                 except queue.Empty:
+                     break
+
+             yield b"".join(data)
packages.txt ADDED
@@ -0,0 +1 @@
+ ffmpeg
requirements.txt ADDED
@@ -0,0 +1,9 @@
+ gradio
+ asyncio
+ google-cloud-speech
+ google-cloud-texttospeech
+ PyAudio
+ simpleaudio
+ langchain_openai
+ langchain
+ langchain_community
tts.py ADDED
@@ -0,0 +1,27 @@
+ import time
+
+ import google.cloud.texttospeech as tts
+ import simpleaudio as sa
+
+ class TextToSpeech:
+     def __init__(self):
+         self.voice_params = tts.VoiceSelectionParams(
+             language_code="id-ID", name="id-ID-Standard-A"
+         )
+         self.audio_config = tts.AudioConfig(audio_encoding=tts.AudioEncoding.LINEAR16, speaking_rate=1.25)
+         self.client = tts.TextToSpeechClient()
+
+     def text_to_speech(self, text: str):
+
+         start = time.time()
+         text_input = tts.SynthesisInput(text=text)
+         response = self.client.synthesize_speech(
+             input=text_input,
+             voice=self.voice_params,
+             audio_config=self.audio_config,
+         )
+         end = time.time()
+         print(f"Time taken to synthesize speech: {end-start:.2f}s")
+
+         play_obj = sa.play_buffer(response.audio_content, num_channels=1, bytes_per_sample=2, sample_rate=24000)
+         play_obj.wait_done()
utils.py ADDED
@@ -0,0 +1,35 @@
+ import sys
+
+ from langchain_client import LangchainClient
+ from tts import TextToSpeech
+
+ langchain_client = LangchainClient()
+ conversation_chain_history = langchain_client.create_model()
+
+ tts_client = TextToSpeech()
+
+ async def listen_print_loop(responses: object):
+
+     num_chars_printed = 0
+     for response in responses:
+         if not response.results:
+             continue
+
+         result = response.results[0]
+         if not result.alternatives:
+             continue
+         transcript = result.alternatives[0].transcript
+
+         overwrite_chars = " " * (num_chars_printed - len(transcript))
+
+         if not result.is_final:
+             sys.stdout.write(transcript + overwrite_chars + "\r")
+             sys.stdout.flush()
+
+             num_chars_printed = len(transcript)
+
+         else:
+             print(transcript + overwrite_chars)
+             result = langchain_client.invoke_llm(conversation_chain_history, transcript)
+
+             return result, tts_client.text_to_speech(result)