init
- app.py +51 -0
- langchain_client.py +62 -0
- microphone.py +105 -0
- packages.txt +1 -0
- requirements.txt +9 -0
- tts.py +27 -0
- utils.py +35 -0
app.py
ADDED
@@ -0,0 +1,51 @@
import gradio as gr

from google.cloud import speech
from microphone import MicrophoneStream
from utils import listen_print_loop


# Audio recording parameters
RATE = 16000
CHUNK = int(RATE / 10)  # 100ms
LANGUAGE = "id-ID"

transcribe_client = speech.SpeechClient()
config = speech.RecognitionConfig(
    encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
    sample_rate_hertz=RATE,
    language_code=LANGUAGE,
)

streaming_config = speech.StreamingRecognitionConfig(
    config=config, interim_results=True
)


async def main(audio) -> None:

    print("Streaming started ...")

    with MicrophoneStream(RATE, CHUNK) as stream:
        audio_generator = stream.generator()
        requests = (
            speech.StreamingRecognizeRequest(audio_content=content)
            for content in audio_generator
        )

        responses = transcribe_client.streaming_recognize(streaming_config, requests)

        return await listen_print_loop(responses)


demo = gr.Interface(
    fn=main,
    inputs=[
        gr.Audio(sources="microphone", streaming=True, label="Input Speech")
    ],
    outputs=[
        gr.Textbox(label="Transcription"),
        gr.Audio(label="Audio")
    ],
    live=True)

if __name__ == "__main__":
    demo.launch()
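For context, a minimal pre-launch check is sketched below; it is illustrative only and not part of the app. GOOGLE_APPLICATION_CREDENTIALS is the standard variable the Google Cloud clients (speech.SpeechClient, tts.TextToSpeechClient) authenticate with, and OPENAI_API_KEY is the variable langchain_client.py reads.

import os

# Fail fast if either credential is missing before demo.launch() is called.
for var in ("GOOGLE_APPLICATION_CREDENTIALS", "OPENAI_API_KEY"):
    if not os.getenv(var):
        raise RuntimeError(f"{var} is not set")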
langchain_client.py
ADDED
@@ -0,0 +1,62 @@
import os

from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_core.runnables.history import RunnableWithMessageHistory

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")


class LangchainClient:

    def __init__(self):
        self.llm = ChatOpenAI(
            openai_api_key=OPENAI_API_KEY,
            temperature=0,
            model_name='gpt-4o'
        )
        self.store = {}

    def create_prompt(self):
        template_prompt = """You are a chatbot that can answer questions in English and Bahasa Indonesia.
Answer in the language the user uses: if the user writes in Bahasa Indonesia, answer in Bahasa Indonesia;
if the user writes in English, answer in English."""

        prompt = ChatPromptTemplate.from_messages(
            [
                (
                    "system",
                    template_prompt,
                ),
                MessagesPlaceholder(variable_name="history"),
                ("human", "{question}"),
            ]
        )

        return prompt

    def get_session_history(self, session_id: str) -> BaseChatMessageHistory:
        if session_id not in self.store:
            self.store[session_id] = ChatMessageHistory()
        return self.store[session_id]

    def create_model(self):
        prompt = self.create_prompt()
        parser = StrOutputParser()
        conversation_chain = prompt | self.llm | parser
        conversation_chain_history = RunnableWithMessageHistory(
            conversation_chain,
            self.get_session_history,
            input_messages_key="question",
            history_messages_key="history",
        )
        return conversation_chain_history

    def invoke_llm(self, model, text):
        response = model.invoke(
            {"question": text},
            config={"configurable": {"session_id": "default"}}
        )
        return response
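For reference, a minimal usage sketch of the class above (it assumes OPENAI_API_KEY is exported; the example question is illustrative):

from langchain_client import LangchainClient

client = LangchainClient()
chain = client.create_model()
# invoke_llm routes the question through the prompt, gpt-4o and the string parser,
# keeping per-session chat history in client.store under the "default" session id.
print(client.invoke_llm(chain, "Apa ibu kota Indonesia?"))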
microphone.py
ADDED
@@ -0,0 +1,105 @@
import queue
import pyaudio

# Audio recording parameters
RATE = 16000
CHUNK = int(RATE / 10)  # 100ms


class MicrophoneStream:
    """Opens a recording stream as a generator yielding the audio chunks."""

    def __init__(self: object, rate: int = RATE, chunk: int = CHUNK) -> None:
        """The audio -- and generator -- is guaranteed to be on the main thread."""
        self._rate = rate
        self._chunk = chunk

        # Create a thread-safe buffer of audio data
        self._buff = queue.Queue()
        self.closed = True

    def __enter__(self: object) -> object:
        self._audio_interface = pyaudio.PyAudio()
        self._audio_stream = self._audio_interface.open(
            format=pyaudio.paInt16,
            # The API currently only supports 1-channel (mono) audio
            # https://goo.gl/z757pE
            channels=1,
            rate=self._rate,
            input=True,
            frames_per_buffer=self._chunk,
            # Run the audio stream asynchronously to fill the buffer object.
            # This is necessary so that the input device's buffer doesn't
            # overflow while the calling thread makes network requests, etc.
            stream_callback=self._fill_buffer,
            input_device_index=1
        )

        self.closed = False

        return self

    def __exit__(
        self: object,
        type: object,
        value: object,
        traceback: object,
    ) -> None:
        """Closes the stream, regardless of whether the connection was lost or not."""
        self._audio_stream.stop_stream()
        self._audio_stream.close()
        self.closed = True
        # Signal the generator to terminate so that the client's
        # streaming_recognize method will not block the process termination.
        self._buff.put(None)
        self._audio_interface.terminate()

    def _fill_buffer(
        self: object,
        in_data: object,
        frame_count: int,
        time_info: object,
        status_flags: object,
    ) -> object:
        """Continuously collect data from the audio stream, into the buffer.

        Args:
            in_data: The audio data as a bytes object
            frame_count: The number of frames captured
            time_info: The time information
            status_flags: The status flags

        Returns:
            The audio data as a bytes object
        """
        self._buff.put(in_data)
        return None, pyaudio.paContinue

    def generator(self: object) -> object:
        """Generates audio chunks from the stream of audio data in chunks.

        Args:
            self: The MicrophoneStream object

        Returns:
            A generator that outputs audio chunks.
        """
        while not self.closed:
            # Use a blocking get() to ensure there's at least one chunk of
            # data, and stop iteration if the chunk is None, indicating the
            # end of the audio stream.
            chunk = self._buff.get()
            if chunk is None:
                return
            data = [chunk]

            # Now consume whatever other data's still buffered.
            while True:
                try:
                    chunk = self._buff.get(block=False)
                    if chunk is None:
                        return
                    data.append(chunk)
                except queue.Empty:
                    break

            yield b"".join(data)
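A minimal standalone sketch of how this stream is consumed (it assumes a working microphone at the device index hard-coded in __enter__; the byte counting is only for illustration, in app.py the chunks are fed to Google streaming recognition instead):

from microphone import MicrophoneStream

# Capture about a second of audio (ten yields, each at least one 100 ms chunk)
# and report how many bytes of 16 kHz mono PCM were received.
with MicrophoneStream(16000, 1600) as stream:
    total = 0
    for i, chunk in enumerate(stream.generator()):
        total += len(chunk)
        if i >= 9:
            break
    print(f"captured {total} bytes")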
packages.txt
ADDED
@@ -0,0 +1 @@
ffmpeg
requirements.txt
ADDED
@@ -0,0 +1,9 @@
gradio
asyncio
google-cloud-speech
google-cloud-texttospeech
PyAudio
simpleaudio
langchain_openai
langchain
langchain_community
tts.py
ADDED
@@ -0,0 +1,27 @@
import time

import google.cloud.texttospeech as tts
import simpleaudio as sa


class TextToSpeech:
    def __init__(self):
        self.voice_params = tts.VoiceSelectionParams(
            language_code="id-ID", name="id-ID-Standard-A"
        )
        self.audio_config = tts.AudioConfig(audio_encoding=tts.AudioEncoding.LINEAR16, speaking_rate=1.25)
        self.client = tts.TextToSpeechClient()

    def text_to_speech(self, text: str):

        start = time.time()
        text_input = tts.SynthesisInput(text=text)
        response = self.client.synthesize_speech(
            input=text_input,
            voice=self.voice_params,
            audio_config=self.audio_config,
        )
        end = time.time()
        print(f"Time taken to synthesize speech: {end-start:.2f}s")

        play_obj = sa.play_buffer(response.audio_content, num_channels=1, bytes_per_sample=2, sample_rate=24000)
        play_obj.wait_done()
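A minimal usage sketch (it assumes GOOGLE_APPLICATION_CREDENTIALS points at a service account with Text-to-Speech access; the sample sentence is illustrative and playback happens on the machine running the script):

from tts import TextToSpeech

tts_client = TextToSpeech()
# Synthesizes the Indonesian sentence with the id-ID-Standard-A voice and plays it locally.
tts_client.text_to_speech("Halo, ada yang bisa saya bantu?")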
utils.py
ADDED
@@ -0,0 +1,35 @@
import sys

from langchain_client import LangchainClient
from tts import TextToSpeech

langchain_client = LangchainClient()
conversation_chain_history = langchain_client.create_model()

tts_client = TextToSpeech()


async def listen_print_loop(responses: object) -> str:

    num_chars_printed = 0
    for response in responses:
        if not response.results:
            continue

        result = response.results[0]
        if not result.alternatives:
            continue
        transcript = result.alternatives[0].transcript

        overwrite_chars = " " * (num_chars_printed - len(transcript))

        if not result.is_final:
            sys.stdout.write(transcript + overwrite_chars + "\r")
            sys.stdout.flush()

            num_chars_printed = len(transcript)

        else:
            print(transcript + overwrite_chars)
            result = langchain_client.invoke_llm(conversation_chain_history, transcript)

            return result, tts_client.text_to_speech(result)