samarthsrivastava committed
Commit 787a546
1 Parent(s): 074e7d9

Upload folder using huggingface_hub

.gcloudignore ADDED
@@ -0,0 +1,19 @@
+# This file specifies files that are *not* uploaded to Google Cloud
+# using gcloud. It follows the same syntax as .gitignore, with the addition of
+# "#!include" directives (which insert the entries of the given .gitignore-style
+# file at that point).
+#
+# For more information, run:
+#   $ gcloud topic gcloudignore
+#
+.gcloudignore
+# If you would like to upload your .git directory, .gitignore file or files
+# from your .gitignore file, remove the corresponding line
+# below:
+.git
+.gitignore
+
+# Python pycache:
+__pycache__/
+# Ignored by the build system
+/setup.cfg
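
The "#!include" directive described in the comments above lets this file pull in another ignore list. A minimal sketch, assuming a .gitignore sits next to this file (syntax per `gcloud topic gcloudignore`):

    # additional .gcloudignore entries
    #!include:.gitignore

which makes every pattern in the project's .gitignore also apply to gcloud uploads.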
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+recasepunc/checkpoint filter=lfs diff=lfs merge=lfs -text
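
An attribute line like the one added here is what Git LFS writes when a path is tracked; assuming the git-lfs CLI is installed, the equivalent command would be:

    git lfs track "recasepunc/checkpoint"

so the ~1.3 GB recasepunc checkpoint added later in this commit is stored as an LFS pointer file instead of a regular blob.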
.ipynb_checkpoints/voice_to_text_systemdev -checkpoint-checkpoint-checkpoint.ipynb ADDED
@@ -0,0 +1,265 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "id": "5c7d8fe6-69ca-4f29-9046-0b0bc9f31911",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "99ee6b03c5154644998c23c837444e83",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "HBox(children=(Button(button_style='success', description='Record', icon='microphone', style=ButtonStyle()), B…"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "2b3e4f24da8d4c198b5d15f0f3f7399d",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Output()"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "import ipywidgets as widgets\n",
+    "from IPython.display import display, clear_output\n",
+    "from threading import Thread\n",
+    "from queue import Queue\n",
+    "import time\n",
+    "\n",
+    "messages = Queue()\n",
+    "recordings = Queue()\n",
+    "\n",
+    "record_button = widgets.Button(\n",
+    "    description=\"Record\",\n",
+    "    disabled=False,\n",
+    "    button_style=\"success\",\n",
+    "    icon=\"microphone\"\n",
+    ")\n",
+    "\n",
+    "stop_button = widgets.Button(\n",
+    "    description=\"Stop\",\n",
+    "    disabled=False,\n",
+    "    button_style=\"warning\",\n",
+    "    icon=\"stop\"\n",
+    ")\n",
+    "\n",
+    "output = widgets.Output()\n",
+    "\n",
+    "def record_microphone():\n",
+    "    while not messages.empty():\n",
+    "        time.sleep(1) # Simulate recording\n",
+    "        recordings.put(\"Audio recorded.\") # Simulated recorded audio data\n",
+    "\n",
+    "def speech_recognition(output_widget):\n",
+    "    while not messages.empty():\n",
+    "        time.sleep(2) # Simulate transcription\n",
+    "        with output_widget:\n",
+    "            clear_output(wait=True)\n",
+    "            display(\"Transcription: Hello, how are you?\") # Simulated transcription result\n",
+    "\n",
+    "def start_recording(data):\n",
+    "    if not messages.empty():\n",
+    "        return # Recording already in progress\n",
+    "\n",
+    "    messages.put(True)\n",
+    "    with output:\n",
+    "        clear_output(wait=True)\n",
+    "        display(\"Starting...\")\n",
+    "\n",
+    "    record = Thread(target=record_microphone)\n",
+    "    record.start()\n",
+    "\n",
+    "    transcribe = Thread(target=speech_recognition, args=(output,))\n",
+    "    transcribe.start()\n",
+    "\n",
+    "def stop_recording(data):\n",
+    "    if messages.empty():\n",
+    "        return # No recording in progress\n",
+    "\n",
+    "    messages.get()\n",
+    "    with output:\n",
+    "        clear_output(wait=True)\n",
+    "        display(\"Stopped.\")\n",
+    "\n",
+    "record_button.on_click(start_recording)\n",
+    "stop_button.on_click(stop_recording)\n",
+    "\n",
+    "display(widgets.HBox([record_button, stop_button]), output)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 30,
+   "id": "bdcb9097-ab31-4dcc-9e2a-4e0818fceb3f",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Requirement already satisfied: pyaudio in c:\\users\\samarth srivastava\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (0.2.14)\n"
+     ]
+    }
+   ],
+   "source": [
+    "!python -m pip install pyaudio"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 31,
+   "id": "34112777-1845-4aff-80de-099ceed52f01",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{'index': 0, 'structVersion': 2, 'name': 'Microsoft Sound Mapper - Input', 'hostApi': 0, 'maxInputChannels': 2, 'maxOutputChannels': 0, 'defaultLowInputLatency': 0.09, 'defaultLowOutputLatency': 0.09, 'defaultHighInputLatency': 0.18, 'defaultHighOutputLatency': 0.18, 'defaultSampleRate': 44100.0}\n",
+      "{'index': 1, 'structVersion': 2, 'name': 'Microphone Array (Intel® Smart ', 'hostApi': 0, 'maxInputChannels': 4, 'maxOutputChannels': 0, 'defaultLowInputLatency': 0.09, 'defaultLowOutputLatency': 0.09, 'defaultHighInputLatency': 0.18, 'defaultHighOutputLatency': 0.18, 'defaultSampleRate': 44100.0}\n",
+      "{'index': 2, 'structVersion': 2, 'name': 'Microsoft Sound Mapper - Output', 'hostApi': 0, 'maxInputChannels': 0, 'maxOutputChannels': 2, 'defaultLowInputLatency': 0.09, 'defaultLowOutputLatency': 0.09, 'defaultHighInputLatency': 0.18, 'defaultHighOutputLatency': 0.18, 'defaultSampleRate': 44100.0}\n",
+      "{'index': 3, 'structVersion': 2, 'name': 'Speakers (Realtek(R) Audio)', 'hostApi': 0, 'maxInputChannels': 0, 'maxOutputChannels': 2, 'defaultLowInputLatency': 0.09, 'defaultLowOutputLatency': 0.09, 'defaultHighInputLatency': 0.18, 'defaultHighOutputLatency': 0.18, 'defaultSampleRate': 44100.0}\n",
+      "{'index': 4, 'structVersion': 2, 'name': 'Primary Sound Capture Driver', 'hostApi': 1, 'maxInputChannels': 2, 'maxOutputChannels': 0, 'defaultLowInputLatency': 0.12, 'defaultLowOutputLatency': 0.0, 'defaultHighInputLatency': 0.24, 'defaultHighOutputLatency': 0.0, 'defaultSampleRate': 44100.0}\n",
+      "{'index': 5, 'structVersion': 2, 'name': 'Microphone Array (Intel® Smart Sound Technology (Intel® SST))', 'hostApi': 1, 'maxInputChannels': 4, 'maxOutputChannels': 0, 'defaultLowInputLatency': 0.12, 'defaultLowOutputLatency': 0.0, 'defaultHighInputLatency': 0.24, 'defaultHighOutputLatency': 0.0, 'defaultSampleRate': 44100.0}\n",
+      "{'index': 6, 'structVersion': 2, 'name': 'Primary Sound Driver', 'hostApi': 1, 'maxInputChannels': 0, 'maxOutputChannels': 8, 'defaultLowInputLatency': 0.0, 'defaultLowOutputLatency': 0.12, 'defaultHighInputLatency': 0.0, 'defaultHighOutputLatency': 0.24, 'defaultSampleRate': 44100.0}\n",
+      "{'index': 7, 'structVersion': 2, 'name': 'Speakers (Realtek(R) Audio)', 'hostApi': 1, 'maxInputChannels': 0, 'maxOutputChannels': 2, 'defaultLowInputLatency': 0.0, 'defaultLowOutputLatency': 0.12, 'defaultHighInputLatency': 0.0, 'defaultHighOutputLatency': 0.24, 'defaultSampleRate': 44100.0}\n",
+      "{'index': 8, 'structVersion': 2, 'name': 'Speakers (Realtek(R) Audio)', 'hostApi': 2, 'maxInputChannels': 0, 'maxOutputChannels': 8, 'defaultLowInputLatency': 0.0, 'defaultLowOutputLatency': 0.003, 'defaultHighInputLatency': 0.0, 'defaultHighOutputLatency': 0.01, 'defaultSampleRate': 48000.0}\n",
+      "{'index': 9, 'structVersion': 2, 'name': 'Microphone Array (Intel® Smart Sound Technology (Intel® SST))', 'hostApi': 2, 'maxInputChannels': 2, 'maxOutputChannels': 0, 'defaultLowInputLatency': 0.002, 'defaultLowOutputLatency': 0.0, 'defaultHighInputLatency': 0.01, 'defaultHighOutputLatency': 0.0, 'defaultSampleRate': 48000.0}\n",
+      "{'index': 10, 'structVersion': 2, 'name': 'Microphone Array 1 (Intel® Smart Sound Technology (Intel® SST) Microphone)', 'hostApi': 3, 'maxInputChannels': 2, 'maxOutputChannels': 0, 'defaultLowInputLatency': 0.01, 'defaultLowOutputLatency': 0.01, 'defaultHighInputLatency': 0.04, 'defaultHighOutputLatency': 0.04, 'defaultSampleRate': 48000.0}\n",
+      "{'index': 11, 'structVersion': 2, 'name': 'Microphone Array 2 (Intel® Smart Sound Technology (Intel® SST) Microphone)', 'hostApi': 3, 'maxInputChannels': 4, 'maxOutputChannels': 0, 'defaultLowInputLatency': 0.01, 'defaultLowOutputLatency': 0.01, 'defaultHighInputLatency': 0.04, 'defaultHighOutputLatency': 0.04, 'defaultSampleRate': 16000.0}\n",
+      "{'index': 12, 'structVersion': 2, 'name': 'Microphone Array 3 (Intel® Smart Sound Technology (Intel® SST) Microphone)', 'hostApi': 3, 'maxInputChannels': 4, 'maxOutputChannels': 0, 'defaultLowInputLatency': 0.01, 'defaultLowOutputLatency': 0.01, 'defaultHighInputLatency': 0.04, 'defaultHighOutputLatency': 0.04, 'defaultSampleRate': 16000.0}\n",
+      "{'index': 13, 'structVersion': 2, 'name': 'Stereo Mix (Realtek HD Audio Stereo input)', 'hostApi': 3, 'maxInputChannels': 2, 'maxOutputChannels': 0, 'defaultLowInputLatency': 0.01, 'defaultLowOutputLatency': 0.01, 'defaultHighInputLatency': 0.04, 'defaultHighOutputLatency': 0.04, 'defaultSampleRate': 48000.0}\n",
+      "{'index': 14, 'structVersion': 2, 'name': 'Headphones (Realtek HD Audio 2nd output with SST)', 'hostApi': 3, 'maxInputChannels': 0, 'maxOutputChannels': 2, 'defaultLowInputLatency': 0.01, 'defaultLowOutputLatency': 0.01, 'defaultHighInputLatency': 0.04, 'defaultHighOutputLatency': 0.04, 'defaultSampleRate': 48000.0}\n",
+      "{'index': 15, 'structVersion': 2, 'name': 'Speakers (Realtek HD Audio output with SST)', 'hostApi': 3, 'maxInputChannels': 0, 'maxOutputChannels': 2, 'defaultLowInputLatency': 0.01, 'defaultLowOutputLatency': 0.01, 'defaultHighInputLatency': 0.04, 'defaultHighOutputLatency': 0.04, 'defaultSampleRate': 48000.0}\n",
+      "{'index': 16, 'structVersion': 2, 'name': 'Microphone (Realtek HD Audio Mic input)', 'hostApi': 3, 'maxInputChannels': 2, 'maxOutputChannels': 0, 'defaultLowInputLatency': 0.01, 'defaultLowOutputLatency': 0.01, 'defaultHighInputLatency': 0.04, 'defaultHighOutputLatency': 0.04, 'defaultSampleRate': 44100.0}\n"
+     ]
+    }
+   ],
+   "source": [
+    "import pyaudio\n",
+    "\n",
+    "p = pyaudio.PyAudio()\n",
+    "for i in range(p.get_device_count()):\n",
+    "    print(p.get_device_info_by_index(i))\n",
+    "\n",
+    "p.terminate()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 32,
+   "id": "2e74dacf-1a91-4dfa-bf91-c64c72755d75",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pyaudio\n",
+    "from queue import Queue\n",
+    "\n",
+    "CHANNELS = 1\n",
+    "FRAME_RATE = 16000\n",
+    "RECORD_SECONDS = 20\n",
+    "AUDIO_FORMAT = pyaudio.paInt16\n",
+    "SAMPLE_SIZE = 2\n",
+    "\n",
+    "messages = Queue()\n",
+    "recordings = Queue()\n",
+    "\n",
+    "def record_microphone(chunk=1024):\n",
+    "    p = pyaudio.PyAudio()\n",
+    "\n",
+    "    stream = p.open(format=AUDIO_FORMAT,\n",
+    "                    channels=CHANNELS,\n",
+    "                    rate=FRAME_RATE,\n",
+    "                    input=True,\n",
+    "                    input_device_index=1,\n",
+    "                    frames_per_buffer=chunk)\n",
+    "\n",
+    "    frames = []\n",
+    "\n",
+    "    while not messages.empty():\n",
+    "        data = stream.read(chunk)\n",
+    "        frames.append(data)\n",
+    "\n",
+    "        if len(frames) >= int(FRAME_RATE * RECORD_SECONDS / chunk):\n",
+    "            recordings.put(frames.copy())\n",
+    "            frames = []\n",
+    "\n",
+    "    stream.stop_stream()\n",
+    "    stream.close()\n",
+    "    p.terminate()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 33,
+   "id": "931dc754-e034-45e7-981b-a9210c1fe6e9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import subprocess\n",
+    "import json\n",
+    "from vosk import Model, KaldiRecognizer\n",
+    "\n",
+    "model = Model(model_name=\"vosk-model-en-us-0.42-gigaspeech\")\n",
+    "rec = KaldiRecognizer(model, FRAME_RATE)\n",
+    "rec.SetWords(True)\n",
+    "\n",
+    "def speech_recognition(output):\n",
+    "    while not messages.empty():\n",
+    "        frames = recordings.get()\n",
+    "\n",
+    "        rec.AcceptWaveform(b''.join(frames))\n",
+    "        result = rec.Result()\n",
+    "        text = json.loads(result)[\"text\"]\n",
+    "\n",
+    "        cased = subprocess.check_output(\"python recasepunc/recasepunc.py predict recasepunc/checkpoint\", shell=True, text=True, input=text)\n",
+    "        output.append_stdout(cased)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a27fb138-d3a9-4e04-83fe-23aca2921d92",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
README.md CHANGED
@@ -1,12 +1,6 @@
 ---
-title: Voice To Text System
-emoji: 🌍
-colorFrom: blue
-colorTo: blue
+title: voice_to_text_system
+app_file: voice_to_text_systemdev -checkpoint-checkpoint.py
 sdk: gradio
 sdk_version: 4.36.1
-app_file: app.py
-pinned: false
 ---
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.yaml ADDED
@@ -0,0 +1,7 @@
+runtime: python38
+
+entrypoint: gunicorn -b :$PORT main:app
+
+handlers:
+- url: /.*
+  script: auto
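
The entrypoint hands the `app` object from main.py (added next) to gunicorn on App Engine's $PORT. As a sketch of how this config is typically exercised, a deploy from the project root would be:

    gcloud app deploy app.yaml

with the .gcloudignore above deciding which files get uploaded; note that gunicorn itself must appear in requirements.txt for the entrypoint to resolve (it does, below).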
main.py ADDED
@@ -0,0 +1,18 @@
+import os
+from flask import Flask, request, jsonify
+
+app = Flask(__name__)
+
+@app.route('/')
+def index():
+    return 'Hello, World!'
+
+@app.route('/input', methods=['POST'])
+def get_input():
+    data = request.json
+    # Process the input data as needed
+    return jsonify(data)
+
+if __name__ == '__main__':
+    port = int(os.environ.get('PORT', 8080))
+    app.run(host='0.0.0.0', port=port)
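
For a quick check of the /input route above, a minimal client sketch using only the standard library (the host, port, and payload key here are illustrative; the handler simply echoes back whatever JSON it receives):

    import json
    import urllib.request

    # Assumes main.py is running locally on its default port 8080.
    payload = json.dumps({"text": "hello world"}).encode("utf-8")
    req = urllib.request.Request(
        "http://localhost:8080/input",
        data=payload,
        headers={"Content-Type": "application/json"},  # required for request.json to parse the body
        method="POST",
    )
    with urllib.request.urlopen(req) as resp:
        print(resp.read().decode("utf-8"))  # the server echoes the JSON back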
recasepunc/README ADDED
@@ -0,0 +1,7 @@
+1. Install pytorch and transformers:
+
+   pip3 install transformers
+
+2. Run python3 example.py de-test.txt
+
+3. Compare with de-test.txt.orig
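
This README is the upstream recasepunc usage note; in this repository, the notebook above drives the same script through its `predict` action, piping raw text on stdin. A minimal sketch from the repository root (assuming the LFS checkpoint has been pulled; step 1 also needs pytorch, e.g. `pip3 install torch transformers`):

    echo "hello how are you" | python recasepunc/recasepunc.py predict recasepunc/checkpoint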
recasepunc/checkpoint ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9782ccd13a130feffb13609834778421ebd39e26910d25ddcf2185a0eea75935
+size 1310193349
recasepunc/example.py ADDED
@@ -0,0 +1,26 @@
+import sys
+import time
+from transformers import logging
+from recasepunc import CasePuncPredictor
+from recasepunc import WordpieceTokenizer
+from recasepunc import Config
+
+logging.set_verbosity_error()
+
+predictor = CasePuncPredictor('checkpoint', lang="en")
+
+text = " ".join(open(sys.argv[1]).readlines())
+tokens = list(enumerate(predictor.tokenize(text)))
+
+results = ""
+for token, case_label, punc_label in predictor.predict(tokens, lambda x: x[1]):
+    prediction = predictor.map_punc_label(predictor.map_case_label(token[1], case_label), punc_label)
+
+    if token[1][0] == '\'' or (len(results) > 0 and results[-1] == '\''):
+        results = results + prediction
+    elif token[1][0] != '#':
+        results = results + ' ' + prediction
+    else:
+        results = results + prediction
+
+print(results.strip())
recasepunc/recasepunc.py ADDED
@@ -0,0 +1,744 @@
+import sys
+import collections
+import os
+import regex as re
+#from mosestokenizer import *
+from tqdm import tqdm
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.optim as optim
+import random
+import unicodedata
+import numpy as np
+import argparse
+from torch.utils.data import TensorDataset, DataLoader
+
+from transformers import AutoModel, AutoTokenizer, BertTokenizer
+
+
+default_config = argparse.Namespace(
+    seed=871253,
+    lang='en',
+    #flavor='flaubert/flaubert_base_uncased',
+    flavor=None,
+    max_length=256,
+    batch_size=16,
+    updates=24000,
+    period=1000,
+    lr=1e-5,
+    dab_rate=0.1,
+    device='cuda',
+    debug=False
+)
+
+default_flavors = {
+    'fr': 'flaubert/flaubert_base_uncased',
+    'en': 'bert-base-uncased',
+    'zh': 'ckiplab/bert-base-chinese',
+    'tr': 'dbmdz/bert-base-turkish-uncased',
+    'de': 'dbmdz/bert-base-german-uncased',
+    'pt': 'neuralmind/bert-base-portuguese-cased'
+}
+
+class Config(argparse.Namespace):
+    def __init__(self, **kwargs):
+        for key, value in default_config.__dict__.items():
+            setattr(self, key, value)
+        for key, value in kwargs.items():
+            setattr(self, key, value)
+
+        assert self.lang in ['fr', 'en', 'zh', 'tr', 'pt', 'de']
+
+        if 'lang' in kwargs and ('flavor' not in kwargs or kwargs['flavor'] is None):
+            self.flavor = default_flavors[self.lang]
+
+        #print(self.lang, self.flavor)
+
+
+def init_random(seed):
+    # make sure everything is deterministic
+    os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8'
+    #torch.use_deterministic_algorithms(True)
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed_all(seed)
+    random.seed(seed)
+    np.random.seed(seed)
+
+# NOTE: it is assumed in the implementation that y[:,0] is the punctuation label, and y[:,1] is the case label!
+
+punctuation = {
+    'O': 0,
+    'COMMA': 1,
+    'PERIOD': 2,
+    'QUESTION': 3,
+    'EXCLAMATION': 4,
+}
+
+punctuation_syms = ['', ',', '.', ' ?', ' !']
+
+case = {
+    'LOWER': 0,
+    'UPPER': 1,
+    'CAPITALIZE': 2,
+    'OTHER': 3,
+}
+
+
+class Model(nn.Module):
+    def __init__(self, flavor, device):
+        super().__init__()
+        self.bert = AutoModel.from_pretrained(flavor)
+        # need a proper way of determining representation size
+        size = self.bert.dim if hasattr(self.bert, 'dim') else self.bert.config.pooler_fc_size if hasattr(self.bert.config, 'pooler_fc_size') else self.bert.config.emb_dim if hasattr(self.bert.config, 'emb_dim') else self.bert.config.hidden_size
+        self.punc = nn.Linear(size, 5)
+        self.case = nn.Linear(size, 4)
+        self.dropout = nn.Dropout(0.3)
+        self.to(device)
+
+    def forward(self, x):
+        output = self.bert(x)
+        representations = self.dropout(F.gelu(output['last_hidden_state']))
+        punc = self.punc(representations)
+        case = self.case(representations)
+        return punc, case
+
+
+# randomly create sequences that align to punctuation boundaries
+def drop_at_boundaries(rate, x, y, cls_token_id, sep_token_id, pad_token_id):
+    for i, dropped in enumerate(torch.rand((len(x),)) < rate):
+        if dropped:
+            # select all indices that are sentence endings
+            indices = (y[i,:,0] > 1).nonzero(as_tuple=True)[0]
+            if len(indices) < 2:
+                continue
+            start = indices[0] + 1
+            end = indices[random.randint(1, len(indices) - 1)] + 1
+            length = end - start
+            if length + 2 > len(x[i]):
+                continue
+            x[i, 0] = cls_token_id
+            x[i, 1: length + 1] = x[i, start: end].clone()
+            x[i, length + 1] = sep_token_id
+            x[i, length + 2:] = pad_token_id
+            y[i, 0] = 0
+            y[i, 1: length + 1] = y[i, start: end].clone()
+            y[i, length + 1:] = 0
+
+
+def compute_performance(config, model, loader):
+    device = config.device
+    criterion = nn.CrossEntropyLoss()
+    model.eval()
+    total_loss = all_correct1 = all_correct2 = num_loss = num_perf = 0
+    num_ref = collections.defaultdict(float)
+    num_hyp = collections.defaultdict(float)
+    num_correct = collections.defaultdict(float)
+    for x, y in loader:
+        x = x.long().to(device)
+        y = y.long().to(device)
+        y1 = y[:,:,0]
+        y2 = y[:,:,1]
+        with torch.no_grad():
+            y_scores1, y_scores2 = model(x.to(device))
+            loss1 = criterion(y_scores1.view(y1.size(0) * y1.size(1), -1), y1.view(y1.size(0) * y1.size(1)))
+            loss2 = criterion(y_scores2.view(y2.size(0) * y2.size(1), -1), y2.view(y2.size(0) * y2.size(1)))
+            loss = loss1 + loss2
+            y_pred1 = torch.max(y_scores1, 2)[1]
+            y_pred2 = torch.max(y_scores2, 2)[1]
+            for label in range(1, 5):
+                ref = (y1 == label)
+                hyp = (y_pred1 == label)
+                correct = (ref * hyp == 1)
+                num_ref[label] += ref.sum()
+                num_hyp[label] += hyp.sum()
+                num_correct[label] += correct.sum()
+                num_ref[0] += ref.sum()
+                num_hyp[0] += hyp.sum()
+                num_correct[0] += correct.sum()
+            all_correct1 += (y_pred1 == y1).sum()
+            all_correct2 += (y_pred2 == y2).sum()
+            total_loss += loss.item()
+            num_loss += len(y)
+            num_perf += len(y) * config.max_length
+    recall = {}
+    precision = {}
+    fscore = {}
+    for label in range(0, 5):
+        recall[label] = num_correct[label] / num_ref[label] if num_ref[label] > 0 else 0
+        precision[label] = num_correct[label] / num_hyp[label] if num_hyp[label] > 0 else 0
+        fscore[label] = (2 * recall[label] * precision[label] / (recall[label] + precision[label])).item() if recall[label] + precision[label] > 0 else 0
+    return total_loss / num_loss, all_correct2.item() / num_perf, all_correct1.item() / num_perf, fscore
+
+
+def fit(config, model, checkpoint_path, train_loader, valid_loader, iterations, valid_period=200, lr=1e-5):
+    device = config.device
+    criterion = nn.CrossEntropyLoss()
+    optimizer = optim.Adam(filter(lambda param: param.requires_grad, model.parameters()), lr=lr)
+    iteration = 0
+    while True:
+        model.train()
+        total_loss = num = 0
+        for x, y in tqdm(train_loader):
+            x = x.long().to(device)
+            y = y.long().to(device)
+            drop_at_boundaries(config.dab_rate, x, y, config.cls_token_id, config.sep_token_id, config.pad_token_id)
+            y1 = y[:,:,0]
+            y2 = y[:,:,1]
+            optimizer.zero_grad()
+            y_scores1, y_scores2 = model(x)
+            loss1 = criterion(y_scores1.view(y1.size(0) * y1.size(1), -1), y1.view(y1.size(0) * y1.size(1)))
+            loss2 = criterion(y_scores2.view(y2.size(0) * y2.size(1), -1), y2.view(y2.size(0) * y2.size(1)))
+            loss = loss1 + loss2
+            loss.backward()
+            optimizer.step()
+            total_loss += loss.item()
+            num += len(y)
+            if iteration % valid_period == valid_period - 1:
+                train_loss = total_loss / num
+                valid_loss, valid_accuracy_case, valid_accuracy_punc, valid_fscore = compute_performance(config, model, valid_loader)
+                torch.save({
+                    'iteration': iteration + 1,
+                    'model_state_dict': model.state_dict(),
+                    'optimizer_state_dict': optimizer.state_dict(),
+                    'train_loss': train_loss,
+                    'valid_loss': valid_loss,
+                    'valid_accuracy_case': valid_accuracy_case,
+                    'valid_accuracy_punc': valid_accuracy_punc,
+                    'valid_fscore': valid_fscore,
+                    'config': config.__dict__,
+                }, '%s.%d' % (checkpoint_path, iteration + 1))
+                print(iteration + 1, train_loss, valid_loss, valid_accuracy_case, valid_accuracy_punc, valid_fscore)
+                total_loss = num = 0
+
+            iteration += 1
+            if iteration > iterations:
+                return
+
+            sys.stderr.flush()
+            sys.stdout.flush()
+
+
+def batchify(max_length, x, y):
+    print(x.shape)
+    print(y.shape)
+    x = x[:(len(x) // max_length) * max_length].reshape(-1, max_length)
+    y = y[:(len(y) // max_length) * max_length, :].reshape(-1, max_length, 2)
+    return x, y
+
+
+def train(config, train_x_fn, train_y_fn, valid_x_fn, valid_y_fn, checkpoint_path):
+    X_train, Y_train = batchify(config.max_length, torch.load(train_x_fn), torch.load(train_y_fn))
+    X_valid, Y_valid = batchify(config.max_length, torch.load(valid_x_fn), torch.load(valid_y_fn))
+
+    train_set = TensorDataset(X_train, Y_train)
+    valid_set = TensorDataset(X_valid, Y_valid)
+
+    train_loader = DataLoader(train_set, batch_size=config.batch_size, shuffle=True)
+    valid_loader = DataLoader(valid_set, batch_size=config.batch_size)
+
+    model = Model(config.flavor, config.device)
+
+    fit(config, model, checkpoint_path, train_loader, valid_loader, config.updates, config.period, config.lr)
+
+
+def run_eval(config, test_x_fn, test_y_fn, checkpoint_path):
+    X_test, Y_test = batchify(config.max_length, torch.load(test_x_fn), torch.load(test_y_fn))
+    test_set = TensorDataset(X_test, Y_test)
+    test_loader = DataLoader(test_set, batch_size=config.batch_size)
+
+    loaded = torch.load(checkpoint_path, map_location=config.device)
+    if 'config' in loaded:
+        config = Config(**loaded['config'])
+        init(config)
+
+    model = Model(config.flavor, config.device)
+    model.load_state_dict(loaded['model_state_dict'], strict=False)
+
+    print(*compute_performance(config, model, test_loader))
+
+
+def recase(token, label):
+    if label == case['LOWER']:
+        return token.lower()
+    elif label == case['CAPITALIZE']:
+        return token.lower().capitalize()
+    elif label == case['UPPER']:
+        return token.upper()
+    else:
+        return token
+
+
+class CasePuncPredictor:
+    def __init__(self, checkpoint_path, lang=default_config.lang, flavor=default_config.flavor, device=default_config.device):
+        loaded = torch.load(checkpoint_path, map_location=device if torch.cuda.is_available() else 'cpu')
+        if 'config' in loaded:
+            self.config = Config(**loaded['config'])
+        else:
+            self.config = Config(lang=lang, flavor=flavor, device=device)
+        init(self.config)
+
+        self.model = Model(self.config.flavor, self.config.device)
+        self.model.load_state_dict(loaded['model_state_dict'])
+        self.model.eval()
+        self.model.to(self.config.device)
+
+        self.rev_case = {b: a for a, b in case.items()}
+        self.rev_punc = {b: a for a, b in punctuation.items()}
+
+    def tokenize(self, text):
+        return [self.config.cls_token] + self.config.tokenizer.tokenize(text) + [self.config.sep_token]
+
+    def predict(self, tokens, getter=lambda x: x):
+        max_length = self.config.max_length
+        device = self.config.device
+        if type(tokens) == str:
+            tokens = self.tokenize(tokens)
+        previous_label = punctuation['PERIOD']
+        for start in range(0, len(tokens), max_length):
+            instance = tokens[start: start + max_length]
+            if type(getter(instance[0])) == str:
+                ids = self.config.tokenizer.convert_tokens_to_ids(getter(token) for token in instance)
+            else:
+                ids = [getter(token) for token in instance]
+            if len(ids) < max_length:
+                ids += [0] * (max_length - len(ids))
+            x = torch.tensor([ids]).long().to(device)
+            y_scores1, y_scores2 = self.model(x)
+            y_pred1 = torch.max(y_scores1, 2)[1]
+            y_pred2 = torch.max(y_scores2, 2)[1]
+            for i, id, token, punc_label, case_label in zip(range(len(instance)), ids, instance, y_pred1[0].tolist()[:len(instance)], y_pred2[0].tolist()[:len(instance)]):
+                if id == self.config.cls_token_id or id == self.config.sep_token_id:
+                    continue
+                if previous_label != None and previous_label > 1:
+                    if case_label in [case['LOWER'], case['OTHER']]: # LOWER, OTHER
+                        case_label = case['CAPITALIZE']
+                if i + start == len(tokens) - 2 and punc_label == punctuation['O']:
+                    punc_label = punctuation['PERIOD']
+                yield (token, self.rev_case[case_label], self.rev_punc[punc_label])
+                previous_label = punc_label
+
+    def map_case_label(self, token, case_label):
+        if token.endswith('</w>'):
+            token = token[:-4]
+        if token.startswith('##'):
+            token = token[2:]
+        return recase(token, case[case_label])
+
+    def map_punc_label(self, token, punc_label):
+        if token.endswith('</w>'):
+            token = token[:-4]
+        if token.startswith('##'):
+            token = token[2:]
+        return token + punctuation_syms[punctuation[punc_label]]
+
+
+def generate_predictions(config, checkpoint_path):
+    loaded = torch.load(checkpoint_path, map_location=config.device if torch.cuda.is_available() else 'cpu')
+    if 'config' in loaded:
+        config = Config(**loaded['config'])
+        init(config)
+
+    model = Model(config.flavor, config.device)
+    model.load_state_dict(loaded['model_state_dict'], strict=False)
+
+    rev_case = {b: a for a, b in case.items()}
+    rev_punc = {b: a for a, b in punctuation.items()}
+
+    for line in sys.stdin:
+        # also drop punctuation that we may generate
+        line = ''.join([c for c in line if c not in mapped_punctuation])
+        if config.debug:
+            print(line)
+        tokens = [config.cls_token] + config.tokenizer.tokenize(line) + [config.sep_token]
+        if config.debug:
+            print(tokens)
+        previous_label = punctuation['PERIOD']
+        first_time = True
+        was_word = False
+        for start in range(0, len(tokens), config.max_length):
+            instance = tokens[start: start + config.max_length]
+            ids = config.tokenizer.convert_tokens_to_ids(instance)
+            #print(len(ids), file=sys.stderr)
+            if len(ids) < config.max_length:
+                ids += [config.pad_token_id] * (config.max_length - len(ids))
+            x = torch.tensor([ids]).long().to(config.device)
+            y_scores1, y_scores2 = model(x)
+            y_pred1 = torch.max(y_scores1, 2)[1]
+            y_pred2 = torch.max(y_scores2, 2)[1]
+            for id, token, punc_label, case_label in zip(ids, instance, y_pred1[0].tolist()[:len(instance)], y_pred2[0].tolist()[:len(instance)]):
+                if config.debug:
+                    print(id, token, punc_label, case_label, file=sys.stderr)
+                if id == config.cls_token_id or id == config.sep_token_id:
+                    continue
+                if previous_label != None and previous_label > 1:
+                    if case_label in [case['LOWER'], case['OTHER']]:
+                        case_label = case['CAPITALIZE']
+                previous_label = punc_label
+                # different strategy due to sub-lexical token encoding in Flaubert
+                if config.lang == 'fr':
+                    if token.endswith('</w>'):
+                        cased_token = recase(token[:-4], case_label)
+                        if was_word:
+                            print(' ', end='')
+                        print(cased_token + punctuation_syms[punc_label], end='')
+                        was_word = True
+                    else:
+                        cased_token = recase(token, case_label)
+                        if was_word:
+                            print(' ', end='')
+                        print(cased_token, end='')
+                        was_word = False
+                else:
+                    if token.startswith('##'):
+                        cased_token = recase(token[2:], case_label)
+                        print(cased_token, end='')
+                    else:
+                        cased_token = recase(token, case_label)
+                        if not first_time:
+                            print(' ', end='')
+                        first_time = False
+                        print(cased_token + punctuation_syms[punc_label], end='')
+        if previous_label == 0:
+            print('.', end='')
+        print()
+
+
+def label_for_case(token):
+    token = re.sub(r'[^\p{Han}\p{Ll}\p{Lu}]', '', token)
+    if token == token.lower():
+        return 'LOWER'
+    elif token == token.lower().capitalize():
+        return 'CAPITALIZE'
+    elif token == token.upper():
+        return 'UPPER'
+    else:
+        return 'OTHER'
+
+
+def make_tensors(config, input_fn, output_x_fn, output_y_fn):
+    # count file lines without loading them
+    size = 0
+    with open(input_fn) as fp:
+        for line in fp:
+            size += 1
+
+    with open(input_fn) as fp:
+        X = torch.IntTensor(size)
+        Y = torch.ByteTensor(size, 2)
+
+        offset = 0
+        for n, line in enumerate(fp):
+            word, case_label, punc_label = line.strip().split('\t')
+            id = config.tokenizer.convert_tokens_to_ids(word)
+            if config.debug:
+                assert word.lower() == config.tokenizer.convert_ids_to_tokens(id)
+            X[offset] = id
+            Y[offset, 0] = punctuation[punc_label]
+            Y[offset, 1] = case[case_label]
+            offset += 1
+
+    torch.save(X, output_x_fn)
+    torch.save(Y, output_y_fn)
+
+
+mapped_punctuation = {
+    '.': 'PERIOD',
+    '...': 'PERIOD',
+    ',': 'COMMA',
+    ';': 'COMMA',
+    ':': 'COMMA',
+    '(': 'COMMA',
+    ')': 'COMMA',
+    '?': 'QUESTION',
+    '!': 'EXCLAMATION',
+    '，': 'COMMA',
+    '！': 'EXCLAMATION',
+    '？': 'QUESTION',
+    '；': 'COMMA',
+    '：': 'COMMA',
+    '（': 'COMMA',
+    '(': 'COMMA',
+    ')': 'COMMA',
+    '[': 'COMMA',
+    ']': 'COMMA',
+    '【': 'COMMA',
+    '】': 'COMMA',
+    '└': 'COMMA',
+    '└ ': 'COMMA',
+    '_': 'O',
+    '。': 'PERIOD',
+    '、': 'COMMA', # enumeration comma
+    '、': 'COMMA',
+    '…': 'PERIOD',
+    '—': 'COMMA',
+    '「': 'COMMA',
+    '」': 'COMMA',
+    '．': 'PERIOD',
+    '《': 'O',
+    '》': 'O',
+    '，': 'COMMA',
+    '“': 'O',
+    '”': 'O',
+    '"': 'O',
+    '-': 'O',
+    '－': 'O',
+    '〉': 'COMMA',
+    '〈': 'COMMA',
+    '↑': 'O',
+    '〔': 'COMMA',
+    '〕': 'COMMA',
+}
+
+def preprocess_text(config, max_token_count=-1):
+    global num_tokens_output
+    max_token_count = int(max_token_count)
+    num_tokens_output = 0
+    def process_segment(text, punctuation):
+        global num_tokens_output
+        text = text.replace('\t', ' ')
+        tokens = config.tokenizer.tokenize(text)
+        for i, token in enumerate(tokens):
+            case_label = label_for_case(token)
+            if i == len(tokens) - 1:
+                print(token.lower(), case_label, punctuation, sep='\t')
+            else:
+                print(token.lower(), case_label, 'O', sep='\t')
+            num_tokens_output += 1
+            # a bit too ugly, but alternative is to throw an exception
+            if max_token_count > 0 and num_tokens_output >= max_token_count:
+                sys.exit(0)
+
+    for line in sys.stdin:
+        line = line.strip()
+        if line != '':
+            line = unicodedata.normalize("NFC", line)
+            if config.debug:
+                print(line)
+            start = 0
+            for i, char in enumerate(line):
+                if char in mapped_punctuation:
+                    if i > start and line[start: i].strip() != '':
+                        process_segment(line[start: i], mapped_punctuation[char])
+                    start = i + 1
+            if start < len(line):
+                process_segment(line[start:], 'PERIOD')
+
+
+def preprocess_text_old_fr(config):
+    assert config.lang == 'fr'
+    splitsents = MosesSentenceSplitter(config.lang)
+    tokenize = MosesTokenizer(config.lang, extra=['-no-escape'])
+    normalize = MosesPunctuationNormalizer(config.lang)
+
+    for line in sys.stdin:
+        if line.strip() != '':
+            for sentence in splitsents([normalize(line)]):
+                tokens = tokenize(sentence)
+                previous_token = None
+                for token in tokens:
+                    if token in mapped_punctuation:
+                        if previous_token != None:
+                            print(previous_token, mapped_punctuation[token], sep='\t')
+                        previous_token = None
+                    elif not re.search(r'[\p{Han}\p{Ll}\p{Lu}\d]', token): # remove non-alphanumeric tokens
+                        continue
+                    else:
+                        if previous_token != None:
+                            print(previous_token, 'O', sep='\t')
+                        previous_token = token
+                if previous_token != None:
+                    print(previous_token, 'PERIOD', sep='\t')
+
+
+# modification of the wordpiece tokenizer to keep case information even if vocab is lower cased
+# forked from https://github.com/huggingface/transformers/blob/master/src/transformers/models/bert/tokenization_bert.py
+
+class WordpieceTokenizer(object):
+    """Runs WordPiece tokenization."""
+
+    def __init__(self, vocab, unk_token, max_input_chars_per_word=100, keep_case=True):
+        self.vocab = vocab
+        self.unk_token = unk_token
+        self.max_input_chars_per_word = max_input_chars_per_word
+        self.keep_case = keep_case
+
+    def tokenize(self, text):
+        """
+        Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform
+        tokenization using the given vocabulary.
+        For example, :obj:`input = "unaffable"` will return as output :obj:`["un", "##aff", "##able"]`.
+        Args:
+            text: A single token or whitespace separated tokens. This should have
+                already been passed through `BasicTokenizer`.
+        Returns:
+            A list of wordpiece tokens.
+        """
+
+        output_tokens = []
+        for token in text.strip().split():
+            chars = list(token)
+            if len(chars) > self.max_input_chars_per_word:
+                output_tokens.append(self.unk_token)
+                continue
+
+            is_bad = False
+            start = 0
+            sub_tokens = []
+            while start < len(chars):
+                end = len(chars)
+                cur_substr = None
+                while start < end:
+                    substr = "".join(chars[start:end])
+                    if start > 0:
+                        substr = "##" + substr
+                    # optionally lowercase substring before checking for inclusion in vocab
+                    if (self.keep_case and substr.lower() in self.vocab) or (substr in self.vocab):
+                        cur_substr = substr
+                        break
+                    end -= 1
+                if cur_substr is None:
+                    is_bad = True
+                    break
+                sub_tokens.append(cur_substr)
+                start = end
+
+            if is_bad:
+                output_tokens.append(self.unk_token)
+            else:
+                output_tokens.extend(sub_tokens)
+        return output_tokens
+
+
+# modification of XLM bpe tokenizer for keeping case information when vocab is lowercase
+# forked from https://github.com/huggingface/transformers/blob/cd56f3fe7eae4a53a9880e3f5e8f91877a78271c/src/transformers/models/xlm/tokenization_xlm.py
+def bpe(self, token):
+    def to_lower(pair):
+        #print('  ', pair)
+        return (pair[0].lower(), pair[1].lower())
+
+    from transformers.models.xlm.tokenization_xlm import get_pairs
+
+    word = tuple(token[:-1]) + (token[-1] + "</w>",)
+    if token in self.cache:
+        return self.cache[token]
+    pairs = get_pairs(word)
+
+    if not pairs:
+        return token + "</w>"
+
+    while True:
+        bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(to_lower(pair), float("inf")))
+        #print(bigram)
+        if to_lower(bigram) not in self.bpe_ranks:
+            break
+        first, second = bigram
+        new_word = []
+        i = 0
+        while i < len(word):
+            try:
+                j = word.index(first, i)
+            except ValueError:
+                new_word.extend(word[i:])
+                break
+            else:
+                new_word.extend(word[i:j])
+                i = j
+
+            if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
+                new_word.append(first + second)
+                i += 2
+            else:
+                new_word.append(word[i])
+                i += 1
+        new_word = tuple(new_word)
+        word = new_word
+        if len(word) == 1:
+            break
+        else:
+            pairs = get_pairs(word)
+    word = " ".join(word)
+    if word == "\n  </w>":
+        word = "\n</w>"
+    self.cache[token] = word
+    return word
+
+
+
+def init(config):
+    init_random(config.seed)
+
+    if config.lang == 'fr':
+        config.tokenizer = tokenizer = AutoTokenizer.from_pretrained(config.flavor, do_lower_case=False)
+
+        from transformers.models.xlm.tokenization_xlm import XLMTokenizer
+        assert isinstance(tokenizer, XLMTokenizer)
+
+        # monkey patch XLM tokenizer
+        import types
+        tokenizer.bpe = types.MethodType(bpe, tokenizer)
+    else:
+        # warning: needs to be BertTokenizer for monkey patching to work
+        config.tokenizer = tokenizer = BertTokenizer.from_pretrained(config.flavor, do_lower_case=False)
+
+        # warning: monkey patch tokenizer to keep case information
+        #from recasing_tokenizer import WordpieceTokenizer
+        config.tokenizer.wordpiece_tokenizer = WordpieceTokenizer(vocab=tokenizer.vocab, unk_token=tokenizer.unk_token)
+
+    if config.lang == 'fr':
+        config.pad_token_id = tokenizer.pad_token_id
+        config.cls_token_id = tokenizer.bos_token_id
+        config.cls_token = tokenizer.bos_token
+        config.sep_token_id = tokenizer.sep_token_id
+        config.sep_token = tokenizer.sep_token
+    else:
+        config.pad_token_id = tokenizer.pad_token_id
+        config.cls_token_id = tokenizer.cls_token_id
+        config.cls_token = tokenizer.cls_token
+        config.sep_token_id = tokenizer.sep_token_id
+        config.sep_token = tokenizer.sep_token
+
+    if not torch.cuda.is_available() and config.device == 'cuda':
+        print('WARNING: reverting to cpu as cuda is not available', file=sys.stderr)
+    config.device = torch.device(config.device if torch.cuda.is_available() else 'cpu')
+
+
+def main(config, action, args):
+    init(config)
+
+    if action == 'train':
+        train(config, *args)
+    elif action == 'eval':
+        run_eval(config, *args)
+    elif action == 'predict':
+        generate_predictions(config, *args)
+    elif action == 'tensorize':
+        make_tensors(config, *args)
+    elif action == 'preprocess':
+        preprocess_text(config, *args)
+    else:
+        print('invalid action "%s"' % action)
+        sys.exit(1)
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument("action", help="train|eval|predict|tensorize|preprocess", type=str)
+    parser.add_argument("action_args", help="arguments for selected action", type=str, nargs='*')
+    parser.add_argument("--seed", help="random seed", default=default_config.seed, type=int)
+    parser.add_argument("--lang", help="language (fr, en, zh)", default=default_config.lang, type=str)
+    parser.add_argument("--flavor", help="bert flavor in transformers model zoo", default=default_config.flavor, type=str)
+    parser.add_argument("--max-length", help="maximum input length", default=default_config.max_length, type=int)
+    parser.add_argument("--batch-size", help="size of batches", default=default_config.batch_size, type=int)
+    parser.add_argument("--device", help="computation device (cuda, cpu)", default=default_config.device, type=str)
+    parser.add_argument("--debug", help="whether to output more debug info", default=default_config.debug, type=bool)
+    parser.add_argument("--updates", help="number of training updates to perform", default=default_config.updates, type=int)
+    parser.add_argument("--period", help="validation period in updates", default=default_config.period, type=int)
+    parser.add_argument("--lr", help="learning rate", default=default_config.lr, type=float)
+    parser.add_argument("--dab-rate", help="drop at boundaries rate", default=default_config.dab_rate, type=float)
+    config = Config(**parser.parse_args().__dict__)
+
+    main(config, config.action, config.action_args)
+
+
recasepunc/vosk-adapted.txt ADDED
@@ -0,0 +1,17 @@
+the
+the
+the beijing and shanghai welcome to the market strata open i'm yvonne good morning and i'm david ingles counting down of course the diablo trade on the chinese
+mainland here in hong kong let's get your top stories today taper and a timetable dominating the latest fed minutes as official debates the exit path meanwhile i got beijing heading the other way hinting at the first triple r cut in more than a year and after the didi debacle here china may move to close a loophole long used
+by companies to take their listings abroad all to enhance that was a horrible mistake council yesterday from china as a maybe it's time to cut the triple r to help them with small businesses they are struggling from the rise of raw material costs the key question is how likely is this yeah what they say it chances are likely it's probably going to be up yet
+the fact that they're saying it might actually already mean we're getting some sentiment coming through in terms of an improved material tracker ten year yield we'll get to that in just a moment in china we're now flirting with the three percent level equity markets futures are pointing up as you can see here in china though broadly speaking though we're down for a seven day across asia seventh day in the last excuse me
+in the last eight sessions here have little commodity markets we're stabilising across your oil or oil prices we're still down five six per cent from highs though as far as that is concerned fx markets your story is guys can we change the police are we're looking at generally speaking the dollar that's very much in focus here so you look at that against the euro you look at that
+against the chinese currency twenty four hours ago who would have thought we were talking about this sort of more divergence and starker labour discord between where you are in a pboc to easily in the fed and very quickly we alluded to this of course if one three percent on your chinese ten year yield and we're not one point three percent lower and lower
+yields there is a charge for you china's top us ten year yield is at the bottom yeah the chinatown area lowest since we saw last year of september yup
+yeah it is a really big major shift in china's central bank policy that's the key question could it be coming of course let's flash out that into what we heard from the cabinet there raising the possibility of a cut to the reserve requirement ratio to both the economy at the same time we also from a former pboc official sheng songcheng said the central bank should actually
+cut rates he's not just talking about a triple r and either the second half is an important window when china's monetary policy can tilt towards loosening while remaining stable and the interest rates can be lowered in a reasonable and moderate manner let's get the take from also be as well whether daisy i'm david chiu here the short of it is
+so i guess one point if we still haven't gotten that if in the event that we do their take is they it might be a little bit too aggressive to address some of the softness in the economy in other words what they're saying is it needs some help the economy maybe not this much yeah there preferring perhaps perhaps liquidity injections here and there but this might signal a bit too much
+for when it comes to reflating the economy joining us out of the dice all this let's bring in wang tao ubi as head of asia economics and the chief china economists as well wang tao thanks much for joining us first off do you think this is actually a real possibility now
+or well will shrink or fade contro as a frequently called using triple r cut as a tool so i think yes indeed it is a real possibility that they could do this however in the past whenever the state council called for this a few days to a couple of weeks later we were
+would have we would see a triple r cut if they called for it and but it's worth noting that last year in june shoot at the chicago auto quote for it and by the pbc did not hold onto with any market so i i would say at this moment it's probably a relatively high likelihood but anything
+the wording is really you know about mitigating the higher cost of commodity prices they impact on at an ease and make their effective conquered funding a bit lower so it's possible that it's going to be a targeted not a overall triple cut and i i don't think this really reflects a
+wholesale shift in monetary policy i think very very much in the same state concrete statement also talked about
recasepunc/vosk-adapted.txt.punc ADDED
@@ -0,0 +1 @@
+The. The. The Beijing and Shanghai. Welcome to the market strata open. I'm Yvonne, good morning, and I'm David Ingles, counting down, of course, the Diablo trade on the Chinese mainland here in Hong Kong. Let's get your top stories today, taper and a timetable dominating the latest Fed minutes as official debates. The exit path. Meanwhile, I got Beijing heading the other way, hinting at the first triple R cut in more than a year. And after the Didi debacle here, China may move to close a loophole. Long used by companies to take their listings abroad, all to enhance. That was a horrible mistake. Council yesterday from China as a. Maybe it's time to cut the triple R to help them with small businesses they are struggling from the rise of raw material costs. The key question is, how likely is this ? Yeah, what they say it. Chances are likely it's probably going to be up yet. The fact that they're saying it might actually already mean we're getting some sentiment coming through in terms of an improved material tracker. Ten year yield. We'll get to that in just a moment. In China. We're now flirting with the three percent level equity markets futures are pointing up. As you can see here in China, though. Broadly speaking, though, we're down for a seven day across Asia. Seventh day in the last. Excuse me, in the last eight sessions here have little commodity markets. We're stabilising across your oil or oil prices. We're still down five, six per cent from highs, though as far as that is concerned FX markets. Your story is, guys, can we change the police are we're looking at, generally speaking, the dollar. That's very much in focus here. So you look at that against the euro. You look at that against the Chinese currency Twenty four hours ago. Who would have thought we were talking about this sort of more divergence and starker labour discord between where you are in a PBOC to easily in the Fed and very quickly. We alluded to this, Of course, if one three percent on your Chinese ten year yield and we're not one point three percent lower and lower yields, there is a charge for you. China's top US ten year yield is at the bottom. Yeah, the Chinatown area lowest since we saw last year of September. Yup. Yeah, it is a really big major shift in China's central bank policy. That's the key question. Could it be coming ? Of course. Let's flash out that into what we heard from the cabinet there, raising the possibility of a cut to the reserve requirement ratio to both the economy at the same time. We also from a former PBOC official, Sheng Songcheng said the central bank should actually cut rates. He's not just talking about a triple R. And either the second half is an important window when China's monetary policy can tilt towards loosening while remaining stable and the interest rates can be lowered in a reasonable and moderate manner. Let's get the take from also be as well, whether Daisy, I'm David Chiu here, the short of it is so I guess one point, if we still haven't gotten that if in the event that we do their take is they, it might be a little bit too aggressive to address some of the softness in the economy. In other words, what they're saying is it needs some help. The economy, maybe not this much. Yeah, there, preferring perhaps perhaps liquidity injections here and there. But this might signal a bit too much for when it comes to reflating the economy. Joining us out of the dice. All this, Let's bring in Wang Tao Ubi as head of Asia Economics, and the chief China economists as well. Wang Tao, thanks much for joining us. First off, do you think this is actually a real possibility now or well will shrink or fade ? Contro as a frequently called using triple R cut as a tool. So I think yes, indeed, it is a real possibility. That they could do this. However, in the past, whenever the State Council called for this a few days to a couple of weeks later, we were. Would have we would see a triple R cut if they called for it. And. But it's worth noting that last year in June, shoot at the Chicago auto quote for it and by the PBC did not hold onto with any market so I. I would say at this moment it's probably a relatively high likelihood, but anything. The wording is really, you know about mitigating the higher cost of commodity prices they impact on at an ease and make their effective conquered funding a bit lower. So it's possible that it's going to be a targeted, not a overall triple cut and I. I don't think this really reflects a wholesale shift in monetary policy. I think very, very much in the same state. Concrete statement also talked about.
requirements.txt ADDED
@@ -0,0 +1,5 @@
+Flask==2.0.1
+gunicorn==20.1.0
+Werkzeug==2.0.1
+
+
temp_audio.wav ADDED
Binary file (639 kB).
 
voice_to_text_systemdev -checkpoint-checkpoint.ipynb ADDED
@@ -0,0 +1,424 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "id": "5c7d8fe6-69ca-4f29-9046-0b0bc9f31911",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "99ee6b03c5154644998c23c837444e83",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "HBox(children=(Button(button_style='success', description='Record', icon='microphone', style=ButtonStyle()), B…"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "2b3e4f24da8d4c198b5d15f0f3f7399d",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Output()"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "import ipywidgets as widgets\n",
+    "from IPython.display import display, clear_output\n",
+    "from threading import Thread\n",
+    "from queue import Queue\n",
+    "import time\n",
+    "\n",
+    "messages = Queue()\n",
+    "recordings = Queue()\n",
+    "\n",
+    "record_button = widgets.Button(\n",
+    "    description=\"Record\",\n",
+    "    disabled=False,\n",
+    "    button_style=\"success\",\n",
+    "    icon=\"microphone\"\n",
+    ")\n",
+    "\n",
+    "stop_button = widgets.Button(\n",
+    "    description=\"Stop\",\n",
+    "    disabled=False,\n",
+    "    button_style=\"warning\",\n",
+    "    icon=\"stop\"\n",
+    ")\n",
+    "\n",
+    "output = widgets.Output()\n",
+    "\n",
+    "def record_microphone():\n",
+    "    while not messages.empty():\n",
+    "        time.sleep(1) # Simulate recording\n",
+    "        recordings.put(\"Audio recorded.\") # Simulated recorded audio data\n",
+    "\n",
+    "def speech_recognition(output_widget):\n",
+    "    while not messages.empty():\n",
+    "        time.sleep(2) # Simulate transcription\n",
+    "        with output_widget:\n",
+    "            clear_output(wait=True)\n",
+    "            display(\"Transcription: Hello, how are you?\") # Simulated transcription result\n",
+    "\n",
+    "def start_recording(data):\n",
+    "    if not messages.empty():\n",
+    "        return # Recording already in progress\n",
+    "\n",
+    "    messages.put(True)\n",
+    "    with output:\n",
+    "        clear_output(wait=True)\n",
+    "        display(\"Starting...\")\n",
+    "\n",
+    "    record = Thread(target=record_microphone)\n",
+    "    record.start()\n",
+    "\n",
+    "    transcribe = Thread(target=speech_recognition, args=(output,))\n",
+    "    transcribe.start()\n",
+    "\n",
+    "def stop_recording(data):\n",
+    "    if messages.empty():\n",
+    "        return # No recording in progress\n",
+    "\n",
+    "    messages.get()\n",
+    "    with output:\n",
+    "        clear_output(wait=True)\n",
+    "        display(\"Stopped.\")\n",
+    "\n",
+    "record_button.on_click(start_recording)\n",
+    "stop_button.on_click(stop_recording)\n",
+    "\n",
+    "display(widgets.HBox([record_button, stop_button]), output)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 30,
+   "id": "bdcb9097-ab31-4dcc-9e2a-4e0818fceb3f",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Requirement already satisfied: pyaudio in c:\\users\\samarth srivastava\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (0.2.14)\n"
+     ]
+    }
+   ],
+   "source": [
+    "!python -m pip install pyaudio"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 31,
+   "id": "34112777-1845-4aff-80de-099ceed52f01",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{'index': 0, 'structVersion': 2, 'name': 'Microsoft Sound Mapper - Input', 'hostApi': 0, 'maxInputChannels': 2, 'maxOutputChannels': 0, 'defaultLowInputLatency': 0.09, 'defaultLowOutputLatency': 0.09, 'defaultHighInputLatency': 0.18, 'defaultHighOutputLatency': 0.18, 'defaultSampleRate': 44100.0}\n",
+      "{'index': 1, 'structVersion': 2, 'name': 'Microphone Array (Intel® Smart ', 'hostApi': 0, 'maxInputChannels': 4, 'maxOutputChannels': 0, 'defaultLowInputLatency': 0.09, 'defaultLowOutputLatency': 0.09, 'defaultHighInputLatency': 0.18, 'defaultHighOutputLatency': 0.18, 'defaultSampleRate': 44100.0}\n",
+      "{'index': 2, 'structVersion': 2, 'name': 'Microsoft Sound Mapper - Output', 'hostApi': 0, 'maxInputChannels': 0, 'maxOutputChannels': 2, 'defaultLowInputLatency': 0.09, 'defaultLowOutputLatency': 0.09, 'defaultHighInputLatency': 0.18, 'defaultHighOutputLatency': 0.18, 'defaultSampleRate': 44100.0}\n",
137
+ "{'index': 3, 'structVersion': 2, 'name': 'Speakers (Realtek(R) Audio)', 'hostApi': 0, 'maxInputChannels': 0, 'maxOutputChannels': 2, 'defaultLowInputLatency': 0.09, 'defaultLowOutputLatency': 0.09, 'defaultHighInputLatency': 0.18, 'defaultHighOutputLatency': 0.18, 'defaultSampleRate': 44100.0}\n",
138
+ "{'index': 4, 'structVersion': 2, 'name': 'Primary Sound Capture Driver', 'hostApi': 1, 'maxInputChannels': 2, 'maxOutputChannels': 0, 'defaultLowInputLatency': 0.12, 'defaultLowOutputLatency': 0.0, 'defaultHighInputLatency': 0.24, 'defaultHighOutputLatency': 0.0, 'defaultSampleRate': 44100.0}\n",
139
+ "{'index': 5, 'structVersion': 2, 'name': 'Microphone Array (Intel® Smart Sound Technology (Intel® SST))', 'hostApi': 1, 'maxInputChannels': 4, 'maxOutputChannels': 0, 'defaultLowInputLatency': 0.12, 'defaultLowOutputLatency': 0.0, 'defaultHighInputLatency': 0.24, 'defaultHighOutputLatency': 0.0, 'defaultSampleRate': 44100.0}\n",
140
+ "{'index': 6, 'structVersion': 2, 'name': 'Primary Sound Driver', 'hostApi': 1, 'maxInputChannels': 0, 'maxOutputChannels': 8, 'defaultLowInputLatency': 0.0, 'defaultLowOutputLatency': 0.12, 'defaultHighInputLatency': 0.0, 'defaultHighOutputLatency': 0.24, 'defaultSampleRate': 44100.0}\n",
141
+ "{'index': 7, 'structVersion': 2, 'name': 'Speakers (Realtek(R) Audio)', 'hostApi': 1, 'maxInputChannels': 0, 'maxOutputChannels': 2, 'defaultLowInputLatency': 0.0, 'defaultLowOutputLatency': 0.12, 'defaultHighInputLatency': 0.0, 'defaultHighOutputLatency': 0.24, 'defaultSampleRate': 44100.0}\n",
142
+ "{'index': 8, 'structVersion': 2, 'name': 'Speakers (Realtek(R) Audio)', 'hostApi': 2, 'maxInputChannels': 0, 'maxOutputChannels': 8, 'defaultLowInputLatency': 0.0, 'defaultLowOutputLatency': 0.003, 'defaultHighInputLatency': 0.0, 'defaultHighOutputLatency': 0.01, 'defaultSampleRate': 48000.0}\n",
143
+ "{'index': 9, 'structVersion': 2, 'name': 'Microphone Array (Intel® Smart Sound Technology (Intel® SST))', 'hostApi': 2, 'maxInputChannels': 2, 'maxOutputChannels': 0, 'defaultLowInputLatency': 0.002, 'defaultLowOutputLatency': 0.0, 'defaultHighInputLatency': 0.01, 'defaultHighOutputLatency': 0.0, 'defaultSampleRate': 48000.0}\n",
144
+ "{'index': 10, 'structVersion': 2, 'name': 'Microphone Array 1 (Intel® Smart Sound Technology (Intel® SST) Microphone)', 'hostApi': 3, 'maxInputChannels': 2, 'maxOutputChannels': 0, 'defaultLowInputLatency': 0.01, 'defaultLowOutputLatency': 0.01, 'defaultHighInputLatency': 0.04, 'defaultHighOutputLatency': 0.04, 'defaultSampleRate': 48000.0}\n",
145
+ "{'index': 11, 'structVersion': 2, 'name': 'Microphone Array 2 (Intel® Smart Sound Technology (Intel® SST) Microphone)', 'hostApi': 3, 'maxInputChannels': 4, 'maxOutputChannels': 0, 'defaultLowInputLatency': 0.01, 'defaultLowOutputLatency': 0.01, 'defaultHighInputLatency': 0.04, 'defaultHighOutputLatency': 0.04, 'defaultSampleRate': 16000.0}\n",
146
+ "{'index': 12, 'structVersion': 2, 'name': 'Microphone Array 3 (Intel® Smart Sound Technology (Intel® SST) Microphone)', 'hostApi': 3, 'maxInputChannels': 4, 'maxOutputChannels': 0, 'defaultLowInputLatency': 0.01, 'defaultLowOutputLatency': 0.01, 'defaultHighInputLatency': 0.04, 'defaultHighOutputLatency': 0.04, 'defaultSampleRate': 16000.0}\n",
147
+ "{'index': 13, 'structVersion': 2, 'name': 'Stereo Mix (Realtek HD Audio Stereo input)', 'hostApi': 3, 'maxInputChannels': 2, 'maxOutputChannels': 0, 'defaultLowInputLatency': 0.01, 'defaultLowOutputLatency': 0.01, 'defaultHighInputLatency': 0.04, 'defaultHighOutputLatency': 0.04, 'defaultSampleRate': 48000.0}\n",
148
+ "{'index': 14, 'structVersion': 2, 'name': 'Headphones (Realtek HD Audio 2nd output with SST)', 'hostApi': 3, 'maxInputChannels': 0, 'maxOutputChannels': 2, 'defaultLowInputLatency': 0.01, 'defaultLowOutputLatency': 0.01, 'defaultHighInputLatency': 0.04, 'defaultHighOutputLatency': 0.04, 'defaultSampleRate': 48000.0}\n",
149
+ "{'index': 15, 'structVersion': 2, 'name': 'Speakers (Realtek HD Audio output with SST)', 'hostApi': 3, 'maxInputChannels': 0, 'maxOutputChannels': 2, 'defaultLowInputLatency': 0.01, 'defaultLowOutputLatency': 0.01, 'defaultHighInputLatency': 0.04, 'defaultHighOutputLatency': 0.04, 'defaultSampleRate': 48000.0}\n",
150
+ "{'index': 16, 'structVersion': 2, 'name': 'Microphone (Realtek HD Audio Mic input)', 'hostApi': 3, 'maxInputChannels': 2, 'maxOutputChannels': 0, 'defaultLowInputLatency': 0.01, 'defaultLowOutputLatency': 0.01, 'defaultHighInputLatency': 0.04, 'defaultHighOutputLatency': 0.04, 'defaultSampleRate': 44100.0}\n"
151
+ ]
152
+ }
153
+ ],
154
+ "source": [
155
+ "import pyaudio\n",
156
+ "\n",
157
+ "p = pyaudio.PyAudio()\n",
158
+ "for i in range(p.get_device_count()):\n",
159
+ " print(p.get_device_info_by_index(i))\n",
160
+ "\n",
161
+ "p.terminate()"
162
+ ]
163
+ },
164
+ {
165
+ "cell_type": "code",
166
+ "execution_count": 32,
167
+ "id": "2e74dacf-1a91-4dfa-bf91-c64c72755d75",
168
+ "metadata": {},
169
+ "outputs": [],
170
+ "source": [
171
+ "import pyaudio\n",
172
+ "from queue import Queue\n",
173
+ "\n",
174
+ "CHANNELS = 1\n",
175
+ "FRAME_RATE = 16000\n",
176
+ "RECORD_SECONDS = 20\n",
177
+ "AUDIO_FORMAT = pyaudio.paInt16\n",
178
+ "SAMPLE_SIZE = 2\n",
179
+ "\n",
180
+ "messages = Queue()\n",
181
+ "recordings = Queue()\n",
182
+ "\n",
183
+ "def record_microphone(chunk=1024):\n",
184
+ " p = pyaudio.PyAudio()\n",
185
+ "\n",
186
+ " stream = p.open(format=AUDIO_FORMAT,\n",
187
+ " channels=CHANNELS,\n",
188
+ " rate=FRAME_RATE,\n",
189
+ " input=True,\n",
190
+ " input_device_index=1,\n",
191
+ " frames_per_buffer=chunk)\n",
192
+ "\n",
193
+ " frames = []\n",
194
+ "\n",
195
+ " while not messages.empty():\n",
196
+ " data = stream.read(chunk)\n",
197
+ " frames.append(data)\n",
198
+ "\n",
199
+ " if len(frames) >= int(FRAME_RATE * RECORD_SECONDS / chunk):\n",
200
+ " recordings.put(frames.copy())\n",
201
+ " frames = []\n",
202
+ "\n",
203
+ " stream.stop_stream()\n",
204
+ " stream.close()\n",
205
+ " p.terminate()\n"
206
+ ]
207
+ },
208
+ {
209
+ "cell_type": "code",
210
+ "execution_count": 33,
211
+ "id": "931dc754-e034-45e7-981b-a9210c1fe6e9",
212
+ "metadata": {},
213
+ "outputs": [],
214
+ "source": [
215
+ "import subprocess\n",
216
+ "import json\n",
217
+ "from vosk import Model, KaldiRecognizer\n",
218
+ "\n",
219
+ "model = Model(model_name=\"vosk-model-en-us-0.42-gigaspeech\")\n",
220
+ "rec = KaldiRecognizer(model, FRAME_RATE)\n",
221
+ "rec.SetWords(True)\n",
222
+ "\n",
223
+ "def speech_recognition(output):\n",
224
+ " while not messages.empty():\n",
225
+ " frames = recordings.get()\n",
226
+ "\n",
227
+ " rec.AcceptWaveform(b''.join(frames))\n",
228
+ " result = rec.Result()\n",
229
+ " text = json.loads(result)[\"text\"]\n",
230
+ "\n",
231
+ " cased = subprocess.check_output(\"python recasepunc/recasepunc.py predict recasepunc/checkpoint\", shell=True, text=True, input=text)\n",
232
+ " output.append_stdout(cased)"
233
+ ]
234
+ },
235
+ {
236
+ "cell_type": "code",
237
+ "execution_count": 1,
238
+ "id": "a27fb138-d3a9-4e04-83fe-23aca2921d92",
239
+ "metadata": {},
240
+ "outputs": [
241
+ {
242
+ "name": "stdout",
243
+ "output_type": "stream",
244
+ "text": [
245
+ "Requirement already satisfied: gradio in c:\\users\\samarth srivastava\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (4.36.1)\n",
246
+ "Requirement already satisfied: aiofiles<24.0,>=22.0 in c:\\users\\samarth srivastava\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from gradio) (23.2.1)\n",
247
+ "Requirement already satisfied: altair<6.0,>=4.2.0 in c:\\users\\samarth srivastava\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from gradio) (5.3.0)\n",
248
+ "Requirement already satisfied: fastapi in c:\\users\\samarth srivastava\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from gradio) (0.111.0)\n",
249
+ "Requirement already satisfied: ffmpy in c:\\users\\samarth srivastava\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from gradio) (0.3.2)\n",
250
+ "Requirement already satisfied: gradio-client==1.0.1 in c:\\users\\samarth srivastava\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from gradio) (1.0.1)\n",
251
+ "Requirement already satisfied: httpx>=0.24.1 in c:\\users\\samarth srivastava\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from gradio) (0.27.0)\n",
252
+ "Requirement already satisfied: huggingface-hub>=0.19.3 in c:\\users\\samarth srivastava\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from gradio) (0.23.4)\n",
253
+ "Requirement already satisfied: importlib-resources<7.0,>=1.3 in c:\\users\\samarth srivastava\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from gradio) (6.4.0)\n",
254
+ "Requirement already satisfied: jinja2<4.0 in c:\\users\\samarth srivastava\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from gradio) (3.1.4)\n",
255
+ "Requirement already satisfied: markupsafe~=2.0 in c:\\users\\samarth srivastava\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from gradio) (2.1.5)\n",
256
+ "Requirement already satisfied: matplotlib~=3.0 in c:\\users\\samarth srivastava\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from gradio) (3.9.0)\n",
257
+ "Requirement already satisfied: numpy<3.0,>=1.0 in c:\\users\\samarth srivastava\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from gradio) (1.26.4)\n",
258
+ "Requirement already satisfied: orjson~=3.0 in c:\\users\\samarth srivastava\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from gradio) (3.10.5)\n",
259
+ "Requirement already satisfied: packaging in c:\\users\\samarth srivastava\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from gradio) (24.1)\n",
260
+ "Requirement already satisfied: pandas<3.0,>=1.0 in c:\\users\\samarth srivastava\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from gradio) (2.2.2)\n",
261
+ "Requirement already satisfied: pillow<11.0,>=8.0 in c:\\users\\samarth srivastava\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from gradio) (10.3.0)\n",
262
+ "Requirement already satisfied: pydantic>=2.0 in c:\\users\\samarth srivastava\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from gradio) (2.7.4)\n",
263
+ "Requirement already satisfied: pydub in c:\\users\\samarth srivastava\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from gradio) (0.25.1)\n",
264
+ "Requirement already satisfied: python-multipart>=0.0.9 in c:\\users\\samarth srivastava\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from gradio) (0.0.9)\n",
265
+ "Requirement already satisfied: pyyaml<7.0,>=5.0 in c:\\users\\samarth srivastava\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from gradio) (6.0.1)\n",
266
+ "Requirement already satisfied: ruff>=0.2.2 in c:\\users\\samarth srivastava\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from gradio) (0.4.10)\n",
267
+ "Requirement already satisfied: semantic-version~=2.0 in c:\\users\\samarth srivastava\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from gradio) (2.10.0)\n",
268
+ "Requirement already satisfied: tomlkit==0.12.0 in c:\\users\\samarth srivastava\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from gradio) (0.12.0)\n",
269
+ "Requirement already satisfied: typer<1.0,>=0.12 in c:\\users\\samarth srivastava\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from gradio) (0.12.3)\n",
270
+ "Requirement already satisfied: typing-extensions~=4.0 in c:\\users\\samarth srivastava\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from gradio) (4.12.2)\n",
271
+ "Requirement already satisfied: urllib3~=2.0 in c:\\users\\samarth srivastava\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from gradio) (2.2.1)\n",
272
+ "Requirement already satisfied: uvicorn>=0.14.0 in c:\\users\\samarth srivastava\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from gradio) (0.30.1)\n",
273
+ "Requirement already satisfied: fsspec in c:\\users\\samarth srivastava\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from gradio-client==1.0.1->gradio) (2024.6.0)\n",
274
+ "Requirement already satisfied: websockets<12.0,>=10.0 in c:\\users\\samarth srivastava\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from gradio-client==1.0.1->gradio) (11.0.3)\n",
275
+ "Requirement already satisfied: jsonschema>=3.0 in c:\\users\\samarth srivastava\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from altair<6.0,>=4.2.0->gradio) (4.22.0)\n",
276
+ "Requirement already satisfied: toolz in c:\\users\\samarth srivastava\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from altair<6.0,>=4.2.0->gradio) (0.12.1)\n",
277
+ "Requirement already satisfied: anyio in c:\\users\\samarth srivastava\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from httpx>=0.24.1->gradio) (4.4.0)\n",
278
+ "Requirement already satisfied: certifi in c:\\users\\samarth srivastava\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from httpx>=0.24.1->gradio) (2024.6.2)\n",
279
+ "Requirement already satisfied: httpcore==1.* in c:\\users\\samarth srivastava\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from httpx>=0.24.1->gradio) (1.0.5)\n",
280
+ "Requirement already satisfied: idna in c:\\users\\samarth srivastava\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from httpx>=0.24.1->gradio) (3.7)\n",
281
+ "Requirement already satisfied: sniffio in c:\\users\\samarth srivastava\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from httpx>=0.24.1->gradio) (1.3.1)\n",
282
+ "Requirement already satisfied: h11<0.15,>=0.13 in c:\\users\\samarth srivastava\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from httpcore==1.*->httpx>=0.24.1->gradio) (0.14.0)\n",
283
+ "Requirement already satisfied: filelock in c:\\users\\samarth srivastava\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from huggingface-hub>=0.19.3->gradio) (3.15.1)\n",
284
+ "Requirement already satisfied: requests in c:\\users\\samarth srivastava\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from huggingface-hub>=0.19.3->gradio) (2.32.3)\n",
285
+ "Requirement already satisfied: tqdm>=4.42.1 in c:\\users\\samarth srivastava\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from huggingface-hub>=0.19.3->gradio) (4.66.4)\n",
286
+ "Requirement already satisfied: contourpy>=1.0.1 in c:\\users\\samarth srivastava\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from matplotlib~=3.0->gradio) (1.2.1)\n",
287
+ "Requirement already satisfied: cycler>=0.10 in c:\\users\\samarth srivastava\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from matplotlib~=3.0->gradio) (0.12.1)\n",
288
+ "Requirement already satisfied: fonttools>=4.22.0 in c:\\users\\samarth srivastava\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from matplotlib~=3.0->gradio) (4.53.0)\n",
289
+ "Requirement already satisfied: kiwisolver>=1.3.1 in c:\\users\\samarth srivastava\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from matplotlib~=3.0->gradio) (1.4.5)\n",
290
+ "Requirement already satisfied: pyparsing>=2.3.1 in c:\\users\\samarth srivastava\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from matplotlib~=3.0->gradio) (3.1.2)\n",
291
+ "Requirement already satisfied: python-dateutil>=2.7 in c:\\users\\samarth srivastava\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from matplotlib~=3.0->gradio) (2.9.0.post0)\n",
292
+ "Requirement already satisfied: pytz>=2020.1 in c:\\users\\samarth srivastava\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from pandas<3.0,>=1.0->gradio) (2024.1)\n",
293
+ "Requirement already satisfied: tzdata>=2022.7 in c:\\users\\samarth srivastava\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from pandas<3.0,>=1.0->gradio) (2024.1)\n",
294
+ "Requirement already satisfied: annotated-types>=0.4.0 in c:\\users\\samarth srivastava\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from pydantic>=2.0->gradio) (0.7.0)\n",
295
+ "Requirement already satisfied: pydantic-core==2.18.4 in c:\\users\\samarth srivastava\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from pydantic>=2.0->gradio) (2.18.4)\n",
296
+ "Requirement already satisfied: click>=8.0.0 in c:\\users\\samarth srivastava\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from typer<1.0,>=0.12->gradio) (8.1.7)\n",
297
+ "Requirement already satisfied: shellingham>=1.3.0 in c:\\users\\samarth srivastava\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from typer<1.0,>=0.12->gradio) (1.5.4)\n",
298
+ "Requirement already satisfied: rich>=10.11.0 in c:\\users\\samarth srivastava\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from typer<1.0,>=0.12->gradio) (13.7.1)\n",
299
+ "Requirement already satisfied: starlette<0.38.0,>=0.37.2 in c:\\users\\samarth srivastava\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from fastapi->gradio) (0.37.2)\n",
300
+ "Requirement already satisfied: fastapi-cli>=0.0.2 in c:\\users\\samarth srivastava\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from fastapi->gradio) (0.0.4)\n",
301
+ "Requirement already satisfied: ujson!=4.0.2,!=4.1.0,!=4.2.0,!=4.3.0,!=5.0.0,!=5.1.0,>=4.0.1 in c:\\users\\samarth srivastava\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from fastapi->gradio) (5.10.0)\n",
302
+ "Requirement already satisfied: email_validator>=2.0.0 in c:\\users\\samarth srivastava\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from fastapi->gradio) (2.2.0)\n",
303
+ "Requirement already satisfied: colorama in c:\\users\\samarth srivastava\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from click>=8.0.0->typer<1.0,>=0.12->gradio) (0.4.6)\n",
304
+ "Requirement already satisfied: dnspython>=2.0.0 in c:\\users\\samarth srivastava\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from email_validator>=2.0.0->fastapi->gradio) (2.6.1)\n",
305
+ "Requirement already satisfied: attrs>=22.2.0 in c:\\users\\samarth srivastava\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from jsonschema>=3.0->altair<6.0,>=4.2.0->gradio) (23.2.0)\n",
306
+ "Requirement already satisfied: jsonschema-specifications>=2023.03.6 in c:\\users\\samarth srivastava\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from jsonschema>=3.0->altair<6.0,>=4.2.0->gradio) (2023.12.1)\n",
307
+ "Requirement already satisfied: referencing>=0.28.4 in c:\\users\\samarth srivastava\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from jsonschema>=3.0->altair<6.0,>=4.2.0->gradio) (0.35.1)\n",
308
+ "Requirement already satisfied: rpds-py>=0.7.1 in c:\\users\\samarth srivastava\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from jsonschema>=3.0->altair<6.0,>=4.2.0->gradio) (0.18.1)\n",
309
+ "Requirement already satisfied: six>=1.5 in c:\\users\\samarth srivastava\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from python-dateutil>=2.7->matplotlib~=3.0->gradio) (1.16.0)\n",
310
+ "Requirement already satisfied: markdown-it-py>=2.2.0 in c:\\users\\samarth srivastava\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from rich>=10.11.0->typer<1.0,>=0.12->gradio) (3.0.0)\n",
311
+ "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in c:\\users\\samarth srivastava\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from rich>=10.11.0->typer<1.0,>=0.12->gradio) (2.18.0)\n",
312
+ "Requirement already satisfied: httptools>=0.5.0 in c:\\users\\samarth srivastava\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from uvicorn[standard]>=0.12.0->fastapi->gradio) (0.6.1)\n",
313
+ "Requirement already satisfied: python-dotenv>=0.13 in c:\\users\\samarth srivastava\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from uvicorn[standard]>=0.12.0->fastapi->gradio) (1.0.1)\n",
314
+ "Requirement already satisfied: watchfiles>=0.13 in c:\\users\\samarth srivastava\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from uvicorn[standard]>=0.12.0->fastapi->gradio) (0.22.0)\n",
315
+ "Requirement already satisfied: charset-normalizer<4,>=2 in c:\\users\\samarth srivastava\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from requests->huggingface-hub>=0.19.3->gradio) (3.3.2)\n",
316
+ "Requirement already satisfied: mdurl~=0.1 in c:\\users\\samarth srivastava\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from markdown-it-py>=2.2.0->rich>=10.11.0->typer<1.0,>=0.12->gradio) (0.1.2)\n",
317
+ "Note: you may need to restart the kernel to use updated packages.\n"
318
+ ]
319
+ }
320
+ ],
321
+ "source": [
322
+ "pip install gradio\n"
323
+ ]
324
+ },
325
+ {
326
+ "cell_type": "code",
327
+ "execution_count": 2,
328
+ "id": "6d7852a7-88e5-4e39-afae-da0bad2f72e5",
329
+ "metadata": {},
330
+ "outputs": [],
331
+ "source": [
332
+ "def my_function(input1, input2):\n",
333
+ " # Process the inputs and generate the output\n",
334
+ " output = f\"Processed {input1} and {input2}\"\n",
335
+ " return output\n"
336
+ ]
337
+ },
338
+ {
339
+ "cell_type": "code",
340
+ "execution_count": 5,
341
+ "id": "c57fb014-6562-4909-b3d2-52a048c9af18",
342
+ "metadata": {},
343
+ "outputs": [
344
+ {
345
+ "name": "stdout",
346
+ "output_type": "stream",
347
+ "text": [
348
+ "Running on local URL: http://127.0.0.1:7861\n",
349
+ "Running on public URL: https://4e26f42d95143ec249.gradio.live\n",
350
+ "\n",
351
+ "This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)\n"
352
+ ]
353
+ },
354
+ {
355
+ "data": {
356
+ "text/html": [
357
+ "<div><iframe src=\"https://4e26f42d95143ec249.gradio.live\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
358
+ ],
359
+ "text/plain": [
360
+ "<IPython.core.display.HTML object>"
361
+ ]
362
+ },
363
+ "metadata": {},
364
+ "output_type": "display_data"
365
+ },
366
+ {
367
+ "data": {
368
+ "text/plain": []
369
+ },
370
+ "execution_count": 5,
371
+ "metadata": {},
372
+ "output_type": "execute_result"
373
+ }
374
+ ],
375
+ "source": [
376
+ "import gradio as gr\n",
377
+ "\n",
378
+ "# Define the function you want to expose through Gradio\n",
379
+ "def my_function(input1, input2):\n",
380
+ " output = f\"Processed {input1} and {input2}\"\n",
381
+ " return output\n",
382
+ "\n",
383
+ "# Create the Gradio interface\n",
384
+ "iface = gr.Interface(\n",
385
+ " fn=my_function,\n",
386
+ " inputs=[gr.Textbox(label=\"Input 1\"), gr.Textbox(label=\"Input 2\")],\n",
387
+ " outputs=gr.Textbox(label=\"Output\")\n",
388
+ ")\n",
389
+ "\n",
390
+ "# Launch the interface with a public link\n",
391
+ "iface.launch(share=True)\n"
392
+ ]
393
+ },
394
+ {
395
+ "cell_type": "code",
396
+ "execution_count": null,
397
+ "id": "bc4e1d90-6688-4205-a0d2-7933fcdc5874",
398
+ "metadata": {},
399
+ "outputs": [],
400
+ "source": []
401
+ }
402
+ ],
403
+ "metadata": {
404
+ "kernelspec": {
405
+ "display_name": "Python 3 (ipykernel)",
406
+ "language": "python",
407
+ "name": "python3"
408
+ },
409
+ "language_info": {
410
+ "codemirror_mode": {
411
+ "name": "ipython",
412
+ "version": 3
413
+ },
414
+ "file_extension": ".py",
415
+ "mimetype": "text/x-python",
416
+ "name": "python",
417
+ "nbconvert_exporter": "python",
418
+ "pygments_lexer": "ipython3",
419
+ "version": "3.12.4"
420
+ }
421
+ },
422
+ "nbformat": 4,
423
+ "nbformat_minor": 5
424
+ }
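A note on the control flow used throughout the notebook above: recording is started and stopped by placing a sentinel on the messages queue, and the worker threads keep looping while the queue is non-empty. A condensed, self-contained sketch of that pattern:

import time
from queue import Queue
from threading import Thread

messages = Queue()    # non-empty means "keep recording"
recordings = Queue()  # buffered chunks handed to the recognizer

def record_microphone():
    while not messages.empty():       # runs until the sentinel is removed
        time.sleep(0.25)              # stands in for stream.read(chunk)
        recordings.put(b"...pcm...")  # stands in for raw PCM frames

def start_recording():
    if messages.empty():
        messages.put(True)            # sentinel: recording is active
        Thread(target=record_microphone, daemon=True).start()

def stop_recording():
    if not messages.empty():
        messages.get()                # remove the sentinel; the loop exits

start_recording()
time.sleep(1)
stop_recording()

Polling Queue.empty() works here because only one sentinel is ever in flight, but it is inherently racy; a threading.Event, set on start and cleared on stop, would be the more idiomatic signal.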
voice_to_text_systemdev -checkpoint-checkpoint.py ADDED
@@ -0,0 +1,195 @@
1
+ #!/usr/bin/env python
2
+ # coding: utf-8
3
+
4
+ # In[29]:
5
+
6
+
7
+ import ipywidgets as widgets
8
+ from IPython.display import display, clear_output
9
+ from threading import Thread
10
+ from queue import Queue
11
+ import time
12
+
13
+ messages = Queue()
14
+ recordings = Queue()
15
+
16
+ record_button = widgets.Button(
17
+ description="Record",
18
+ disabled=False,
19
+ button_style="success",
20
+ icon="microphone"
21
+ )
22
+
23
+ stop_button = widgets.Button(
24
+ description="Stop",
25
+ disabled=False,
26
+ button_style="warning",
27
+ icon="stop"
28
+ )
29
+
30
+ output = widgets.Output()
31
+
32
+ def record_microphone():
33
+ while not messages.empty():
34
+ time.sleep(1) # Simulate recording
35
+ recordings.put("Audio recorded.") # Simulated recorded audio data
36
+
37
+ def speech_recognition(output_widget):
38
+ while not messages.empty():
39
+ time.sleep(2) # Simulate transcription
40
+ with output_widget:
41
+ clear_output(wait=True)
42
+ display("Transcription: Hello, how are you?") # Simulated transcription result
43
+
44
+ def start_recording(data):
45
+ if not messages.empty():
46
+ return # Recording already in progress
47
+
48
+ messages.put(True)
49
+ with output:
50
+ clear_output(wait=True)
51
+ display("Starting...")
52
+
53
+ record = Thread(target=record_microphone)
54
+ record.start()
55
+
56
+ transcribe = Thread(target=speech_recognition, args=(output,))
57
+ transcribe.start()
58
+
59
+ def stop_recording(data):
60
+ if messages.empty():
61
+ return # No recording in progress
62
+
63
+ messages.get()
64
+ with output:
65
+ clear_output(wait=True)
66
+ display("Stopped.")
67
+
68
+ record_button.on_click(start_recording)
69
+ stop_button.on_click(stop_recording)
70
+
71
+ display(widgets.HBox([record_button, stop_button]), output)
72
+
73
+
74
+ # In[30]:
75
+
76
+
77
+ get_ipython().system('python -m pip install pyaudio')
78
+
79
+
80
+ # In[31]:
81
+
82
+
83
+ import pyaudio
84
+
85
+ p = pyaudio.PyAudio()
86
+ for i in range(p.get_device_count()):
87
+ print(p.get_device_info_by_index(i))
88
+
89
+ p.terminate()
90
+
91
+
92
+ # In[32]:
93
+
94
+
95
+ import pyaudio
96
+ from queue import Queue
97
+
98
+ CHANNELS = 1
99
+ FRAME_RATE = 16000
100
+ RECORD_SECONDS = 20
101
+ AUDIO_FORMAT = pyaudio.paInt16
102
+ SAMPLE_SIZE = 2
103
+
104
+ messages = Queue()
105
+ recordings = Queue()
106
+
107
+ def record_microphone(chunk=1024):
108
+ p = pyaudio.PyAudio()
109
+
110
+ stream = p.open(format=AUDIO_FORMAT,
111
+ channels=CHANNELS,
112
+ rate=FRAME_RATE,
113
+ input=True,
114
+ input_device_index=1,
115
+ frames_per_buffer=chunk)
116
+
117
+ frames = []
118
+
119
+ while not messages.empty():
120
+ data = stream.read(chunk)
121
+ frames.append(data)
122
+
123
+ if len(frames) >= int(FRAME_RATE * RECORD_SECONDS / chunk):
124
+ recordings.put(frames.copy())
125
+ frames = []
126
+
127
+ stream.stop_stream()
128
+ stream.close()
129
+ p.terminate()
130
+
131
+
132
+ # In[33]:
133
+
134
+
135
+ import subprocess
136
+ import json
137
+ from vosk import Model, KaldiRecognizer
138
+
139
+ model = Model(model_name="vosk-model-en-us-0.42-gigaspeech")
140
+ rec = KaldiRecognizer(model, FRAME_RATE)
141
+ rec.SetWords(True)
142
+
143
+ def speech_recognition(output):
144
+ while not messages.empty():
145
+ frames = recordings.get()
146
+
147
+ rec.AcceptWaveform(b''.join(frames))
148
+ result = rec.Result()
149
+ text = json.loads(result)["text"]
150
+
151
+ cased = subprocess.check_output("python recasepunc/recasepunc.py predict recasepunc/checkpoint", shell=True, text=True, input=text)
152
+ output.append_stdout(cased)
153
+
154
+
155
+ # In[1]:
156
+
157
+
158
+ get_ipython().run_line_magic('pip', 'install gradio')
159
+
160
+
161
+ # In[2]:
162
+
163
+
164
+ def my_function(input1, input2):
165
+ # Process the inputs and generate the output
166
+ output = f"Processed {input1} and {input2}"
167
+ return output
168
+
169
+
170
+ # In[5]:
171
+
172
+
173
+ import gradio as gr
174
+
175
+ # Define the function you want to expose through Gradio
176
+ def my_function(input1, input2):
177
+ output = f"Processed {input1} and {input2}"
178
+ return output
179
+
180
+ # Create the Gradio interface
181
+ iface = gr.Interface(
182
+ fn=my_function,
183
+ inputs=[gr.Textbox(label="Input 1"), gr.Textbox(label="Input 2")],
184
+ outputs=gr.Textbox(label="Output")
185
+ )
186
+
187
+ # Launch the interface with a public link
188
+ iface.launch(share=True)
189
+
190
+
191
+ # In[ ]:
192
+
193
+
194
+
195
+
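The Gradio cells above wire up a placeholder my_function; the natural next step is to feed microphone audio into the recognizer. A hedged sketch under Gradio 4.x (the version installed above), where transcribe is hypothetical glue rather than code from this upload:

import json

import gradio as gr
from vosk import Model, KaldiRecognizer

model = Model(model_name="vosk-model-en-us-0.42-gigaspeech")

def transcribe(audio):
    # With type="numpy", Gradio passes (sample_rate, int16 ndarray).
    # Assumed mono here; Vosk expects single-channel 16-bit PCM and works
    # best near the model's native 16 kHz.
    sample_rate, data = audio
    rec = KaldiRecognizer(model, sample_rate)
    rec.AcceptWaveform(data.tobytes())
    return json.loads(rec.FinalResult())["text"]

iface = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(sources=["microphone"], type="numpy"),
    outputs=gr.Textbox(label="Transcription"),
)

iface.launch(share=True)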