sohojoe committed on
Commit
edce499
1 Parent(s): 4531ec5

add basic text to speech

Browse files
app.py ADDED
@@ -0,0 +1,151 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from collections import deque
2
+ import os
3
+ import threading
4
+ import time
5
+ import av
6
+ import numpy as np
7
+ import streamlit as st
8
+ from streamlit_webrtc import WebRtcMode, webrtc_streamer
9
+ import pydub
10
+ # import av
11
+ # import cv2
12
+ from sample_utils.turn import get_ice_servers
13
+ import json
14
+ from typing import List
15
+
16
+ from vosk import SetLogLevel, Model, KaldiRecognizer
17
+ SetLogLevel(-1) # mutes vosk verbosity
18
+
19
+ from dotenv import load_dotenv
20
+ load_dotenv()
21
+
22
# Settings for the "system one" audio pipeline.  The value stored under
# "audio_bit_rate" is handed to KaldiRecognizer and to pydub's
# set_frame_rate() elsewhere in this file.
# Other values that were tried: 32000, 48000.
system_one = dict(audio_bit_rate=16000)

# Checkbox whose value is forwarded to webrtc_streamer() as the desired
# playing state.
playing = st.checkbox("Playing", value=True)
30
+
31
def load_vosk(model='small'):
    """Create a Vosk recognizer from a model stored next to this file.

    Args:
        model: Name of the sub-directory of ``models/vosk`` (relative to the
            directory containing this file) that holds the Vosk model.

    Returns:
        A ``KaldiRecognizer`` built from that model and the value of
        ``system_one['audio_bit_rate']``.
    """
    here = os.path.dirname(os.path.abspath(__file__))
    model_dir = os.path.join(here, 'models', 'vosk', model)
    return KaldiRecognizer(Model(model_dir), system_one['audio_bit_rate'])


# Module-level recognizer instance used by do_work().
vask = load_vosk()
42
+
43
def handle_audio_frame(frame):
    """Placeholder hook for per-frame audio handling; currently does nothing.

    Args:
        frame: The audio frame to handle (ignored for now).
    """
    # A recognizer AcceptWaveform() call was sketched here but never enabled.
    return None
46
+
47
+
48
def do_work(data: bytes, recognizer=None) -> tuple[str, bool]:
    """Feed one chunk of audio to the recognizer and return its transcript.

    Args:
        data: Raw audio bytes, passed unchanged to ``AcceptWaveform``.
        recognizer: Object exposing ``AcceptWaveform``, ``Result`` and
            ``PartialResult`` (for example a Vosk ``KaldiRecognizer``).
            When omitted, the module-level ``vask`` recognizer is used.

    Returns:
        A ``(text, speaker_finished)`` tuple.  ``speaker_finished`` is True
        when ``AcceptWaveform`` reported a truthy value, in which case
        ``text`` is the ``"text"`` field of ``Result()``; otherwise ``text``
        is the ``"partial"`` field of ``PartialResult()``.
    """
    if recognizer is None:
        recognizer = vask

    speaker_finished = bool(recognizer.AcceptWaveform(data))
    if speaker_finished:
        payload, key = recognizer.Result(), 'text'
    else:
        payload, key = recognizer.PartialResult(), 'partial'

    # Fall back to an empty transcript instead of raising KeyError when the
    # recognizer's JSON lacks the expected field.
    return json.loads(payload).get(key, ''), speaker_finished
61
+
62
+
63
# Audio frames handed from the WebRTC callback to the main script loop.
# Every access to the deque is wrapped in this lock.
frames_deque_lock = threading.Lock()
frames_deque: deque = deque()


async def queued_audio_frames_callback(
    frames: List[av.AudioFrame],
) -> List[av.AudioFrame]:
    """Queue the incoming audio frames and answer with silence.

    Args:
        frames: Audio frames delivered by streamlit-webrtc.

    Returns:
        One new frame per input frame, each filled with zeros and carrying
        the same array shape, dtype, layout name and sample rate as its input.
    """
    with frames_deque_lock:
        frames_deque.extend(frames)

    # TODO: replace with the audio we want to send to the other side.
    silent_frames = []
    for frame in frames:
        samples = frame.to_ndarray()
        silent = av.AudioFrame.from_ndarray(
            np.zeros(samples.shape, dtype=samples.dtype),
            layout=frame.layout.name,
        )
        silent.sample_rate = frame.sample_rate
        silent_frames.append(silent)

    return silent_frames
86
+
87
# Start the WebRTC session; incoming audio frames are delivered to
# queued_audio_frames_callback.
webrtc_ctx = webrtc_streamer(
    key="charles",
    desired_playing_state=playing,
    # audio_receiver_size=4096,
    # audio_frame_callback=process_audio,
    queued_audio_frames_callback=queued_audio_frames_callback,
    mode=WebRtcMode.SENDRECV,
    rtc_configuration={"iceServers": get_ice_servers()},
    async_processing=True,
)

system_one_audio_status = st.empty()

if not webrtc_ctx.state.playing:
    # st.stop() halts this script run; a bare `exit` expression would be a
    # no-op and execution would simply fall through.
    system_one_audio_status.write("Stopped.")
    st.stop()

system_one_audio_status.write("Initializing...")
system_one_audio_output = st.empty()
# Only the ten most recent finished utterances are kept.
system_one_audio_history = deque(maxlen=10)
system_one_audio_history_output = st.empty()

sound_chunk = pydub.AudioSegment.empty()
while webrtc_ctx.state.playing:
    # Take everything queued so far, holding the lock only for the copy.
    with frames_deque_lock:
        audio_frames = list(frames_deque)
        frames_deque.clear()

    if not audio_frames:
        time.sleep(0.1)
        system_one_audio_status.write("No frame arrived.")
        continue

    system_one_audio_status.write("Running. Say something!")

    # Convert each frame to mono audio at the configured rate and append it
    # to the chunk that is sent to the recognizer.
    for audio_frame in audio_frames:
        sound = pydub.AudioSegment(
            data=audio_frame.to_ndarray().tobytes(),
            sample_width=audio_frame.format.bytes,
            frame_rate=audio_frame.sample_rate,
            channels=len(audio_frame.layout.channels),
        )
        sound = sound.set_channels(1)
        sound = sound.set_frame_rate(system_one['audio_bit_rate'])
        sound_chunk += sound

    if len(sound_chunk) > 0:
        buffer = np.array(sound_chunk.get_array_of_samples())
        text, speaker_finished = do_work(buffer.tobytes())
        system_one_audio_output.markdown(f"**System 1 Audio:** {text}")
        if speaker_finished and text:
            system_one_audio_history.append(text)
            # Newest utterance first.
            table_content = "| System 1 Audio History |\n| --- |\n"
            table_content += "\n".join(
                f"| {item} |" for item in reversed(system_one_audio_history)
            )
            system_one_audio_history_output.markdown(table_content)
        # The chunk has been consumed by the recognizer; start a fresh one.
        sound_chunk = pydub.AudioSegment.empty()

system_one_audio_status.write("Stopped.")
requirements.txt ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ vosk
2
+ # scipy
3
+ # sounddevice
4
+ # setuptools-rust
5
+ # git+https://github.com/openai/whisper.git
6
+
7
+ opencv-python-headless
8
+ pydub
9
+ streamlit_webrtc
10
+ twilio
11
+ python-dotenv
12
+ watchdog
13
+ pydub
sample_utils/__init__.py ADDED
File without changes
sample_utils/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (168 Bytes). View file
 
sample_utils/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (166 Bytes). View file
 
sample_utils/__pycache__/download.cpython-310.pyc ADDED
Binary file (1.32 kB). View file
 
sample_utils/__pycache__/turn.cpython-310.pyc ADDED
Binary file (1.29 kB). View file
 
sample_utils/__pycache__/turn.cpython-39.pyc ADDED
Binary file (1.29 kB). View file
 
sample_utils/download.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import urllib.request
2
+ from pathlib import Path
3
+
4
+ import streamlit as st
5
+
6
+
7
+ # This code is based on https://github.com/streamlit/demo-self-driving/blob/230245391f2dda0cb464008195a470751c01770b/streamlit_app.py#L48 # noqa: E501
8
def download_file(url, download_to: Path, expected_size=None):
    """Download *url* to *download_to*, reporting progress in the Streamlit UI.

    Args:
        url: Address passed to ``urllib.request.urlopen``.
        download_to: Destination path; missing parent directories are created.
        expected_size: Optional size in bytes.  When given and an existing
            file at *download_to* has exactly this size, nothing is downloaded.
            When not given and the file exists, the user is asked through a
            button whether to download again.

    Returns:
        None.

    Raises:
        Whatever ``urllib.request.urlopen`` or the file operations raise; the
        partially written temporary file is removed in that case.
    """
    # Don't download the file twice.
    # (If possible, verify the download using the file length.)
    if download_to.exists():
        if expected_size:
            if download_to.stat().st_size == expected_size:
                return
        else:
            st.info(f"{url} is already downloaded.")
            if not st.button("Download again?"):
                return

    download_to.parent.mkdir(parents=True, exist_ok=True)

    # Download into a sibling file and rename it on success, so an interrupted
    # transfer never leaves a truncated file at the final location.
    partial_path = download_to.with_name(download_to.name + ".part")
    MEGABYTES = 2.0 ** 20.0

    # These are handles to two visual elements to animate.
    weights_warning, progress_bar = None, None
    try:
        weights_warning = st.warning("Downloading %s..." % url)
        progress_bar = st.progress(0)
        with urllib.request.urlopen(url) as response:
            # The server may omit Content-Length (or send garbage); treat the
            # total as unknown (0) in that case.
            try:
                length = int(response.info().get("Content-Length") or 0)
            except ValueError:
                length = 0

            counter = 0.0
            with open(partial_path, "wb") as output_file:
                while True:
                    data = response.read(8192)
                    if not data:
                        break
                    counter += len(data)
                    output_file.write(data)

                    # We perform animation by overwriting the elements.
                    if length > 0:
                        weights_warning.warning(
                            "Downloading %s... (%6.2f/%6.2f MB)"
                            % (url, counter / MEGABYTES, length / MEGABYTES)
                        )
                        progress_bar.progress(min(counter / length, 1.0))
                    else:
                        # Total size unknown: show only the amount received.
                        weights_warning.warning(
                            "Downloading %s... (%6.2f MB)"
                            % (url, counter / MEGABYTES)
                        )
        partial_path.replace(download_to)
    # Finally, we remove these visual elements by calling .empty().
    finally:
        # Still present only when the download did not complete.
        if partial_path.exists():
            partial_path.unlink()
        if weights_warning is not None:
            weights_warning.empty()
        if progress_bar is not None:
            progress_bar.empty()
sample_utils/turn.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import os
3
+
4
+ import streamlit as st
5
+ from twilio.rest import Client
6
+
7
+ logger = logging.getLogger(__name__)
8
+
9
# Cache with a TTL: the Twilio credentials returned below expire
# (NOTE(review): 24 h by default per Twilio's Network Traversal Service
# docs — confirm), so an unbounded cache would eventually serve stale ones.
@st.cache_data(ttl=3600)
def get_ice_servers():
    """Return the ICE server list to use for the WebRTC connection.

    Use Twilio's TURN server because Streamlit Community Cloud has changed
    its infrastructure and WebRTC connection cannot be established without TURN server now.  # noqa: E501
    We considered Open Relay Project (https://www.metered.ca/tools/openrelay/) too,
    but it is not stable and hardly works as some people reported like https://github.com/aiortc/aiortc/issues/832#issuecomment-1482420656  # noqa: E501
    See https://github.com/whitphx/streamlit-webrtc/issues/1213

    Returns:
        The ``ice_servers`` of a freshly created Twilio token when the
        ``TWILIO_ACCOUNT_SID`` and ``TWILIO_AUTH_TOKEN`` environment variables
        are set; otherwise a single-entry list pointing at Google's public
        STUN server.
    """

    # Ref: https://www.twilio.com/docs/stun-turn/api
    try:
        account_sid = os.environ["TWILIO_ACCOUNT_SID"]
        auth_token = os.environ["TWILIO_AUTH_TOKEN"]
    except KeyError:
        logger.warning(
            "Twilio credentials are not set. Fallback to a free STUN server from Google."  # noqa: E501
        )
        return [{"urls": ["stun:stun.l.google.com:19302"]}]

    client = Client(account_sid, auth_token)

    token = client.tokens.create()

    return token.ice_servers