Adipta committed
Commit 27c3220 · verified · 1 Parent(s): b62e489
Files changed (7)
  1. app.py +51 -0
  2. langchain_client.py +62 -0
  3. microphone.py +105 -0
  4. packages.txt +1 -0
  5. requirements.txt +9 -0
  6. tts.py +27 -0
  7. utils.py +35 -0
app.py ADDED
@@ -0,0 +1,51 @@
+ import gradio as gr
+
+ from google.cloud import speech
+ from microphone import MicrophoneStream
+ from utils import listen_print_loop
+
+
+ # Audio recording parameters
+ RATE = 16000
+ CHUNK = int(RATE / 10)  # 100ms
+ LANGUAGE = "id-ID"
+
+ transcribe_client = speech.SpeechClient()
+ config = speech.RecognitionConfig(
+     encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
+     sample_rate_hertz=RATE,
+     language_code=LANGUAGE,
+ )
+
+ streaming_config = speech.StreamingRecognitionConfig(
+     config=config, interim_results=True
+ )
+
+ async def main(audio):
+
+     print("Streaming started ...")
+
+     with MicrophoneStream(RATE, CHUNK) as stream:
+         audio_generator = stream.generator()
+         requests = (
+             speech.StreamingRecognizeRequest(audio_content=content)
+             for content in audio_generator
+         )
+
+         responses = transcribe_client.streaming_recognize(streaming_config, requests)
+
+         return await listen_print_loop(responses)
+
+ demo = gr.Interface(
+     fn=main,
+     inputs=[
+         gr.Audio(sources="microphone", streaming=True, label="Input Speech")
+     ],
+     outputs=[
+         gr.Textbox(label="Transcription"),
+         gr.Audio(label="Audio")
+     ],
+     live=True)
+
+ if __name__ == "__main__":
+     demo.launch()
langchain_client.py ADDED
@@ -0,0 +1,62 @@
+ import os
+
+ from langchain_openai import ChatOpenAI
+ from langchain_core.output_parsers import StrOutputParser
+ from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
+ from langchain_community.chat_message_histories import ChatMessageHistory
+ from langchain_core.chat_history import BaseChatMessageHistory
+ from langchain_core.runnables.history import RunnableWithMessageHistory
+
+ OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
+
+ class LangchainClient:
+
+     def __init__(self):
+         self.llm = ChatOpenAI(
+             openai_api_key=OPENAI_API_KEY,
+             temperature=0,
+             model_name='gpt-4o'
+         )
+         self.store = {}
+
+     def create_prompt(self):
+         template_prompt = """You are a chatbot that can answer questions in English and Bahasa Indonesia.
+         answer using language from user, if user use bahasa indonesia answer in bahasa indonesia.
+         if user language is english answer in english"""
+
+         prompt = ChatPromptTemplate.from_messages(
+             [
+                 (
+                     "system",
+                     template_prompt,
+                 ),
+                 MessagesPlaceholder(variable_name="history"),
+                 ("human", "{question}"),
+             ]
+         )
+
+         return prompt
+
+     def get_session_history(self, session_id: str) -> BaseChatMessageHistory:
+         if session_id not in self.store:
+             self.store[session_id] = ChatMessageHistory()
+         return self.store[session_id]
+
+     def create_model(self):
+         prompt = self.create_prompt()
+         parser = StrOutputParser()
+         conversation_chain = prompt | self.llm | parser
+         conversation_chain_history = RunnableWithMessageHistory(
+             conversation_chain,
+             self.get_session_history,
+             input_messages_key="question",
+             history_messages_key="history",
+         )
+         return conversation_chain_history
+
+     def invoke_llm(self, model, text):
+         response = model.invoke(
+             {"question": text},
+             config={"configurable": {"session_id": "default"}}
+         )
+         return response
microphone.py ADDED
@@ -0,0 +1,105 @@
+ import queue
+ import pyaudio
+
+ # Audio recording parameters
+ RATE = 16000
+ CHUNK = int(RATE / 10)  # 100ms
+
+ class MicrophoneStream:
+     """Opens a recording stream as a generator yielding the audio chunks."""
+
+     def __init__(self: object, rate: int = RATE, chunk: int = CHUNK) -> None:
+         """The audio -- and generator -- is guaranteed to be on the main thread."""
+         self._rate = rate
+         self._chunk = chunk
+
+         # Create a thread-safe buffer of audio data
+         self._buff = queue.Queue()
+         self.closed = True
+
+     def __enter__(self: object) -> object:
+         self._audio_interface = pyaudio.PyAudio()
+         self._audio_stream = self._audio_interface.open(
+             format=pyaudio.paInt16,
+             # The API currently only supports 1-channel (mono) audio
+             # https://goo.gl/z757pE
+             channels=1,
+             rate=self._rate,
+             input=True,
+             frames_per_buffer=self._chunk,
+             # Run the audio stream asynchronously to fill the buffer object.
+             # This is necessary so that the input device's buffer doesn't
+             # overflow while the calling thread makes network requests, etc.
+             stream_callback=self._fill_buffer,
+             input_device_index=1
+         )
+
+         self.closed = False
+
+         return self
+
+     def __exit__(
+         self: object,
+         type: object,
+         value: object,
+         traceback: object,
+     ) -> None:
+         """Closes the stream, regardless of whether the connection was lost or not."""
+         self._audio_stream.stop_stream()
+         self._audio_stream.close()
+         self.closed = True
+         # Signal the generator to terminate so that the client's
+         # streaming_recognize method will not block the process termination.
+         self._buff.put(None)
+         self._audio_interface.terminate()
+
+     def _fill_buffer(
+         self: object,
+         in_data: object,
+         frame_count: int,
+         time_info: object,
+         status_flags: object,
+     ) -> object:
+         """Continuously collect data from the audio stream, into the buffer.
+
+         Args:
+             in_data: The audio data as a bytes object
+             frame_count: The number of frames captured
+             time_info: The time information
+             status_flags: The status flags
+
+         Returns:
+             The audio data as a bytes object
+         """
+         self._buff.put(in_data)
+         return None, pyaudio.paContinue
+
+     def generator(self: object) -> object:
+         """Generates audio chunks from the stream of audio data in chunks.
+
+         Args:
+             self: The MicrophoneStream object
+
+         Returns:
+             A generator that outputs audio chunks.
+         """
+         while not self.closed:
+             # Use a blocking get() to ensure there's at least one chunk of
+             # data, and stop iteration if the chunk is None, indicating the
+             # end of the audio stream.
+             chunk = self._buff.get()
+             if chunk is None:
+                 return
+             data = [chunk]
+
+             # Now consume whatever other data's still buffered.
+             while True:
+                 try:
+                     chunk = self._buff.get(block=False)
+                     if chunk is None:
+                         return
+                     data.append(chunk)
+                 except queue.Empty:
+                     break
+
+             yield b"".join(data)
packages.txt ADDED
@@ -0,0 +1 @@
+ ffmpeg
requirements.txt ADDED
@@ -0,0 +1,9 @@
+ gradio
+ asyncio
+ google-cloud-speech
+ google-cloud-texttospeech
+ PyAudio
+ simpleaudio
+ langchain_openai
+ langchain
+ langchain_community
tts.py ADDED
@@ -0,0 +1,27 @@
+ import time
+
+ import google.cloud.texttospeech as tts
+ import simpleaudio as sa
+
+ class TextToSpeech:
+     def __init__(self):
+         self.voice_params = tts.VoiceSelectionParams(
+             language_code="id-ID", name="id-ID-Standard-A"
+         )
+         self.audio_config = tts.AudioConfig(audio_encoding=tts.AudioEncoding.LINEAR16, speaking_rate=1.25)
+         self.client = tts.TextToSpeechClient()
+
+     def text_to_speech(self, text: str):
+
+         start = time.time()
+         text_input = tts.SynthesisInput(text=text)
+         response = self.client.synthesize_speech(
+             input=text_input,
+             voice=self.voice_params,
+             audio_config=self.audio_config,
+         )
+         end = time.time()
+         print(f"Time taken to synthesize speech: {end-start:.2f}s")
+
+         play_obj = sa.play_buffer(response.audio_content, num_channels=1, bytes_per_sample=2, sample_rate=24000)
+         play_obj.wait_done()
utils.py ADDED
@@ -0,0 +1,35 @@
+ import sys
+
+ from langchain_client import LangchainClient
+ from tts import TextToSpeech
+
+ langchain_client = LangchainClient()
+ conversation_chain_history = langchain_client.create_model()
+
+ tts_client = TextToSpeech()
+
+ async def listen_print_loop(responses: object):
+
+     num_chars_printed = 0
+     for response in responses:
+         if not response.results:
+             continue
+
+         result = response.results[0]
+         if not result.alternatives:
+             continue
+         transcript = result.alternatives[0].transcript
+
+         overwrite_chars = " " * (num_chars_printed - len(transcript))
+
+         if not result.is_final:
+             sys.stdout.write(transcript + overwrite_chars + "\r")
+             sys.stdout.flush()
+
+             num_chars_printed = len(transcript)
+
+         else:
+             print(transcript + overwrite_chars)
+             result = langchain_client.invoke_llm(conversation_chain_history, transcript)
+
+             return result, tts_client.text_to_speech(result)