hedhoud12 committed
Commit 4588bab · 1 Parent(s): fbd9872

upload the model, WebRTC app and the inference script
Demo-WebRTC/README.md ADDED
@@ -0,0 +1,43 @@
+ # WebRTC Vosk server
+
+ ## Set up the environment and run it
+
+ ### Set the model path
+
+ Pass the path to the model directory (e.g. ./model) as the first command-line argument, or set the VOSK_MODEL_PATH environment variable.
+ Models can be downloaded from https://alphacephei.com/vosk/models
+
+ ### Python environment
+
+ The sample works with Python 3.8.
+
+ ```sh
+ $ python3 -m pip install aiortc aiohttp aiorpc vosk
+ ```
+
+ If installing aiortc fails, install gcc in your environment and run the pip install again.
+
+ ### Local execution
+
+ Run the server:
+
+ ```sh
+ $ python3 app.py ./model
+ ```
+
+ Now, open a web browser at http://localhost:8010/ (8010 is the default port in app.py; override it with the VOSK_SERVER_PORT environment variable).
+
+ ### Execution on a LAN
+
+ To test the demo from another computer on the LAN, the web page must be served over HTTPS. Modern web browsers (such as Chrome and Firefox) only allow microphone access when the host is `localhost` or the page is served securely.
+
+ Thus, an SSL certificate is required to test the demo from other computers or smartphones. An untrusted self-signed certificate works fine in most browsers (iOS Safari is the exception). You can use [mkcert](https://github.com/FiloSottile/mkcert) to make your own self-signed *cert* and *key* files.
+
+ ```sh
+ $ export VOSK_CERT_FILE="/path/to/cert.pem"
+ $ export VOSK_KEY_FILE="/path/to/key.pem"
+ $ python3 app.py ./model
+ ```
+
+ Now, on the other computer, open a web browser at https://SERVER_IP:8010/, replacing `SERVER_IP` with the IP address of your Vosk server.
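For reference, a minimal sketch of how the server (app.py) turns these two variables into an SSL context; the `ssl.PROTOCOL_TLS_SERVER` constant is a reasonable modern choice, not something the README mandates:

```python
import os
import ssl

cert_file = os.environ.get('VOSK_CERT_FILE')
key_file = os.environ.get('VOSK_KEY_FILE')

if cert_file:
    # Serve over HTTPS so browsers on other machines allow microphone access
    ssl_context = ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER)
    ssl_context.load_cert_chain(cert_file, key_file)
else:
    ssl_context = None  # plain HTTP is fine on localhost
```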
Demo-WebRTC/app.py ADDED
@@ -0,0 +1,156 @@
+ #!/usr/bin/env python3
+
+ import json
+ import ssl
+ import os
+ import concurrent.futures
+ import asyncio
+ import sys
+
+ from pathlib import Path
+ from vosk import KaldiRecognizer, Model
+ from aiohttp import web
+ from aiortc import RTCSessionDescription, RTCPeerConnection
+ from av.audio.resampler import AudioResampler
+
+ ROOT = Path(__file__).parent
+
+ if len(sys.argv) < 2:
+     sys.exit(f'Usage: {sys.argv[0]} <model-path>')
+ K_model_path = sys.argv[1]
+
+ # Bind address; 0.0.0.0 exposes the demo on the LAN
+ vosk_interface = os.environ.get('VOSK_SERVER_INTERFACE', '0.0.0.0')
+ vosk_port = int(os.environ.get('VOSK_SERVER_PORT', 8010))
+ vosk_model_path = os.environ.get('VOSK_MODEL_PATH', K_model_path)
+ vosk_cert_file = os.environ.get('VOSK_CERT_FILE', None)
+ vosk_key_file = os.environ.get('VOSK_KEY_FILE', None)
+ vosk_dump_file = os.environ.get('VOSK_DUMP_FILE', None)
+
+ model = Model(vosk_model_path)
+ pool = concurrent.futures.ThreadPoolExecutor(os.cpu_count() or 1)
+ dump_fd = None if vosk_dump_file is None else open(vosk_dump_file, "wb")
+
+
+ def process_chunk(rec, message):
+     # Feed one chunk of 16 kHz mono s16 PCM to the recognizer; return a
+     # final result (end of utterance) or a partial result as a JSON string.
+     try:
+         res = rec.AcceptWaveform(message)
+     except Exception:
+         result = None
+     else:
+         if res > 0:
+             result = rec.Result()
+         else:
+             result = rec.PartialResult()
+     return result
+
+
+ class KaldiTask:
+     def __init__(self, user_connection):
+         self.__resampler = AudioResampler(format='s16', layout='mono', rate=16000)
+         self.__pc = user_connection
+         self.__audio_task = None
+         self.__track = None
+         self.__channel = None
+         self.__recognizer = KaldiRecognizer(model, 16000)
+
+     async def set_audio_track(self, track):
+         self.__track = track
+
+     async def set_text_channel(self, channel):
+         self.__channel = channel
+
+     async def start(self):
+         self.__audio_task = asyncio.create_task(self.__run_audio_xfer())
+
+     async def stop(self):
+         if self.__audio_task is not None:
+             self.__audio_task.cancel()
+             self.__audio_task = None
+
+     async def __run_audio_xfer(self):
+         loop = asyncio.get_running_loop()
+
+         max_frames = 20
+         frames = []
+         while True:
+             fr = await self.__track.recv()
+             frames.append(fr)
+
+             # Collect frames so we don't send partial results too often
+             if len(frames) < max_frames:
+                 continue
+
+             # Resample to 16 kHz mono s16 and concatenate the raw PCM bytes
+             dataframes = bytearray(b'')
+             for fr in frames:
+                 for rfr in self.__resampler.resample(fr):
+                     dataframes += bytes(rfr.planes[0])[:rfr.samples * 2]
+             frames.clear()
+
+             if dump_fd is not None:
+                 dump_fd.write(bytes(dataframes))
+
+             # Run the blocking recognizer in the thread pool so the event
+             # loop stays responsive
+             result = await loop.run_in_executor(pool, process_chunk, self.__recognizer, bytes(dataframes))
+             print(result)
+             if result is not None:
+                 self.__channel.send(result)
+
+
+ async def index(request):
+     content = open(str(ROOT / 'static' / 'index.html')).read()
+     return web.Response(content_type='text/html', text=content)
+
+
+ async def offer(request):
+     params = await request.json()
+     offer = RTCSessionDescription(
+         sdp=params['sdp'],
+         type=params['type'])
+
+     pc = RTCPeerConnection()
+
+     kaldi = KaldiTask(pc)
+
+     @pc.on('datachannel')
+     async def on_datachannel(channel):
+         channel.send('{}')  # Dummy message to make the UI change to "Listening"
+         await kaldi.set_text_channel(channel)
+         await kaldi.start()
+
+     @pc.on('iceconnectionstatechange')
+     async def on_iceconnectionstatechange():
+         if pc.iceConnectionState == 'failed':
+             await pc.close()
+
+     @pc.on('track')
+     async def on_track(track):
+         if track.kind == 'audio':
+             await kaldi.set_audio_track(track)
+
+         @track.on('ended')
+         async def on_ended():
+             await kaldi.stop()
+
+     await pc.setRemoteDescription(offer)
+     answer = await pc.createAnswer()
+     await pc.setLocalDescription(answer)
+
+     return web.Response(
+         content_type='application/json',
+         text=json.dumps({
+             'sdp': pc.localDescription.sdp,
+             'type': pc.localDescription.type
+         }))
+
+
+ if __name__ == '__main__':
+     if vosk_cert_file:
+         ssl_context = ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER)
+         ssl_context.load_cert_chain(vosk_cert_file, vosk_key_file)
+     else:
+         ssl_context = None
+
+     app = web.Application()
+     app.router.add_post('/offer', offer)
+     app.router.add_get('/', index)
+     app.router.add_static('/static/', path=ROOT / 'static', name='static')
+
+     web.run_app(app, host=vosk_interface, port=vosk_port, ssl_context=ssl_context)
Demo-WebRTC/static/client.js ADDED
@@ -0,0 +1,200 @@
+ var pc = null;
+ var dc = null, dcInterval = null;
+
+ var start_btn = document.getElementById('start');
+ var stop_btn = document.getElementById('stop');
+ var statusField = document.getElementById('status');
+
+ function btn_show_stop() {
+     start_btn.classList.add('d-none');
+     stop_btn.classList.remove('d-none');
+ }
+
+ function btn_show_start() {
+     stop_btn.classList.add('d-none');
+     start_btn.classList.remove('d-none');
+     statusField.innerText = 'Press start';
+ }
+
+ function negotiate() {
+     return pc.createOffer().then(function (offer) {
+         return pc.setLocalDescription(offer);
+     }).then(function () {
+         // Wait until ICE gathering is complete before sending the offer
+         return new Promise(function (resolve) {
+             if (pc.iceGatheringState === 'complete') {
+                 resolve();
+             } else {
+                 function checkState() {
+                     if (pc.iceGatheringState === 'complete') {
+                         pc.removeEventListener('icegatheringstatechange', checkState);
+                         resolve();
+                     }
+                 }
+                 pc.addEventListener('icegatheringstatechange', checkState);
+             }
+         });
+     }).then(function () {
+         var offer = pc.localDescription;
+         console.log(offer.sdp);
+         return fetch('offer', {
+             body: JSON.stringify({
+                 sdp: offer.sdp,
+                 type: offer.type,
+             }),
+             headers: {
+                 'Content-Type': 'application/json'
+             },
+             method: 'POST'
+         });
+     }).then(function (response) {
+         return response.json();
+     }).then(function (answer) {
+         console.log(answer.sdp);
+         return pc.setRemoteDescription(answer);
+     }).catch(function (e) {
+         console.log(e);
+         btn_show_start();
+     });
+ }
+
+ function updateTranscriptionBox(text, isPartial) {
+     const transcriptionBox = document.getElementById('transcription-box');
+     let content = transcriptionBox.innerHTML;
+
+     // Remove the previous partial text if it exists and remove the old cursor
+     content = content.replace(/<span class="partial">.*?<\/span>/, '');
+     content = content.replace(/<div id="transcription-cursor"><\/div>/, '');
+
+     // Update content with the new text
+     if (isPartial) {
+         content += `<span class="partial">${text}</span>`;
+     } else {
+         content += `${text} `;
+     }
+
+     // Add the blinking cursor at the end
+     content += '<div id="transcription-cursor"></div>';
+     transcriptionBox.innerHTML = content;
+
+     // Ensure the transcription box scrolls to the bottom as new text is added
+     transcriptionBox.scrollTop = transcriptionBox.scrollHeight;
+ }
+
+ function performRecvText(str) {
+     updateTranscriptionBox(str, false);
+ }
+
+ function performRecvPartial(str) {
+     updateTranscriptionBox(str, true);
+ }
+
+ function start() {
+     // Clear the transcription box
+     const transcriptionBox = document.getElementById('transcription-box');
+     transcriptionBox.innerHTML = '';
+
+     btn_show_stop();
+     statusField.innerText = 'Connecting...';
+     var config = {
+         sdpSemantics: 'unified-plan',
+         iceServers: [{urls: 'stun:stun.l.google.com:19302'}]
+     };
+
+     pc = new RTCPeerConnection(config);
+
+     dc = pc.createDataChannel('result');
+     dc.onclose = function () {
+         clearInterval(dcInterval);
+         console.log('Closed data channel');
+         btn_show_start();
+     };
+     dc.onopen = function () {
+         console.log('Opened data channel');
+     };
+     dc.onmessage = function (messageEvent) {
+         statusField.innerText = 'Listening... say something';
+
+         if (!messageEvent.data) {
+             return;
+         }
+
+         let voskResult;
+         try {
+             voskResult = JSON.parse(messageEvent.data);
+         } catch (error) {
+             console.error(`ERROR: ${error.message}`);
+             return;
+         }
+         if ((voskResult.text?.length || 0) > 0) {
+             performRecvText(voskResult.text);
+         } else if ((voskResult.partial?.length || 0) > 0) {
+             performRecvPartial(voskResult.partial);
+         }
+     };
+
+     pc.oniceconnectionstatechange = function () {
+         if (pc.iceConnectionState == 'disconnected') {
+             console.log('Disconnected');
+             btn_show_start();
+         }
+     };
+
+     var audioConstraints = {
+         audio: {
+             sampleRate: 16000,   // Optimize sample rate for speech
+             channelCount: 1,     // Mono audio is sufficient for voice
+             echoCancellation: true,
+             noiseSuppression: true,
+             autoGainControl: true,
+             latency: 1
+         },
+         video: false
+     };
+
+     navigator.mediaDevices.getUserMedia(audioConstraints).then(function (stream) {
+         stream.getTracks().forEach(function (track) {
+             pc.addTrack(track, stream);
+         });
+         return negotiate();
+     }, function (err) {
+         console.log('Could not acquire media: ' + err);
+         btn_show_start();
+     });
+ }
+
+ function stop() {
+     // Close the data channel
+     if (dc) {
+         dc.close();
+     }
+
+     // Stop transceivers
+     if (pc.getTransceivers) {
+         pc.getTransceivers().forEach(function (transceiver) {
+             if (transceiver.stop) {
+                 transceiver.stop();
+             }
+         });
+     }
+
+     // Stop local audio tracks
+     pc.getSenders().forEach(function (sender) {
+         if (sender.track) {
+             sender.track.stop();
+         }
+     });
+
+     // Close the peer connection
+     setTimeout(function () {
+         pc.close();
+     }, 500);
+ }
+
+ start_btn.addEventListener('click', start);
+ stop_btn.addEventListener('click', stop);
Demo-WebRTC/static/index.html ADDED
@@ -0,0 +1,23 @@
+ <!DOCTYPE html>
+ <html lang="en">
+ <head>
+     <meta charset="UTF-8">
+     <meta name="viewport" content="width=device-width, initial-scale=1.0">
+     <title class="line-1 anim-typewriter">Tunisian STT</title>
+     <link rel="stylesheet" href="static/styles.css">
+ </head>
+ <body>
+     <div class="container">
+         <div class="header">
+             <img src="static/linagora.png" alt="Linagora" class="logo">
+             <h1>Tunisian STT <img src="static/tunisian_flag.png" alt="Tunisian Flag" class="flag-icon"></h1>
+         </div>
+         <div id="transcription-box">قل ما عندك ...</div>
+         <button id="start" class="play-button">Start</button>
+         <button id="stop" class="pause-button d-none">Stop</button>
+         <span id="status" class="text-uppercase text-muted">Press start</span>
+     </div>
+
+     <script src="static/client.js"></script>
+ </body>
+ </html>
Demo-WebRTC/static/linagora.png ADDED
Demo-WebRTC/static/styles.css ADDED
@@ -0,0 +1,175 @@
+ @import url(https://fonts.googleapis.com/css?family=Anonymous+Pro);
+ @import url(https://fonts.googleapis.com/earlyaccess/droidarabickufi.css);
+
+ body {
+     margin: 0;
+     padding: 20px;
+     background-color: #000; /* Black background */
+     color: #fff; /* White text color */
+ }
+
+ .container {
+     max-width: 800px;
+     margin: 0 auto;
+ }
+
+ .header {
+     display: flex;
+     align-items: center;
+     justify-content: space-between;
+     margin-bottom: 20px;
+ }
+
+ .logo {
+     height: 40px;
+ }
+
+ h1 {
+     font-family: 'Anonymous Pro', monospace;
+     text-align: center;
+     margin: 0;
+     color: #cda5a5; /* Muted heading color */
+ }
+
+ .flag-icon {
+     width: 40px; /* Adjust the width as needed */
+     height: 27px;
+     vertical-align: text-top; /* Aligns the flag vertically with the text */
+     padding-top: 2px;
+ }
+
+ #transcription-box {
+     font-family: 'Droid Arabic Kufi';
+     position: relative;
+     height: 600px;
+     width: 100%;
+     color: #fff; /* White text color */
+     margin: 10% auto 0;
+     background: linear-gradient(0deg, #000, #272727);
+     margin-bottom: 15px;
+     border-radius: 23px;
+     direction: rtl;
+     text-align: right;
+     padding: 15px;
+     line-height: 30px;
+     font-size: 20px;
+ }
+
+ #transcription-box::before, #transcription-box::after {
+     content: '';
+     position: absolute;
+     left: -5px;
+     top: -5px;
+     border-radius: 25px;
+     background: linear-gradient(45deg,
+         #ff4545,
+         #e64949,
+         #9c4545,
+         #d85f5f,
+         #ff9393,
+         #f5a3a3,
+         #f3bbbb,
+         #f5bdbd,
+         #f52121,
+         #f5a1a1,
+         #ff4545,
+         #f7b2b2,
+         #f7e0e0);
+     background-size: 400%;
+     width: calc(100% + 10px);
+     height: calc(100% + 10px);
+     z-index: -1;
+     animation: anim-moving-glow 20s linear infinite;
+ }
+
+ @keyframes anim-moving-glow {
+     0% {
+         background-position: 0 0;
+     }
+     50% {
+         background-position: 400% 0;
+     }
+     100% {
+         background-position: 0 0;
+     }
+ }
+
+ #transcription-box::after {
+     filter: blur(10px);
+ }
+
+ @keyframes spin {
+     0% {
+         transform: rotate(0deg);
+     }
+     100% {
+         transform: rotate(360deg);
+     }
+ }
+
+ button {
+     padding: 10px 20px;
+     font-size: 16px;
+     cursor: pointer;
+     color: #000; /* Black text on buttons */
+     background-color: #beadad; /* Light background on buttons */
+     border: none;
+     border-radius: 10px;
+     transition: transform 0.3s ease, box-shadow 0.3s ease;
+     animation: pulse 2s infinite; /* Pulse animation */
+ }
+
+ button:hover {
+     transform: scale(1.1); /* Slightly enlarges button on hover */
+     box-shadow: 0 0 10px rgba(255, 0, 0, 0.7); /* Adds a glowing effect */
+ }
+
+ @keyframes pulse {
+     0% {
+         transform: scale(1);
+         box-shadow: 0 0 10px rgba(255, 0, 0, 0.5);
+     }
+     50% {
+         transform: scale(1.05);
+         box-shadow: 0 0 20px rgba(255, 255, 255, 0.8);
+     }
+     100% {
+         transform: scale(1);
+         box-shadow: 0 0 10px rgba(255, 0, 0, 0.925);
+     }
+ }
+
+ #transcription-cursor {
+     display: inline-block;
+     width: 1px;
+     height: 24px;
+     background-color: #fff; /* White color for the cursor */
+     animation: blink 1s step-end infinite;
+     vertical-align: bottom;
+ }
+
+ @keyframes blink {
+     from, to {
+         visibility: hidden;
+     }
+     50% {
+         visibility: visible;
+     }
+ }
+
+ .partial {
+     color: #888;
+     font-style: italic;
+ }
+
+ #status {
+     font-size: 9pt;
+     margin-left: 10px;
+ }
+
+ .d-none {
+     display: none;
+ }
Demo-WebRTC/static/tunisian_flag.png ADDED
README.md CHANGED
@@ -1,3 +1,134 @@
  ---
  license: apache-2.0
+ language:
+ - ar
  ---
+
+ # LinTO-ASR-AR-TN-0.1
+
+ LinTO-ASR-AR-TN-0.1 is an Automatic Speech Recognition (ASR) model tailored for the Tunisian dialect, with additional support for code-switching, particularly for words in French and English. This repository includes two versions of the model:
+
+ - Vosk model: the original, comprehensive model.
+ - Android model: a lighter version with a simplified graph, optimized for deployment on Android devices or Raspberry Pi applications.
+
+ ## Model Overview
+
+ - **Model type**: Kaldi LinTO ASR
+ - **Language(s)**: Tunisian dialect
+ - **Use cases**: Automatic Speech Recognition (ASR)
+
+ ## Downloading the Model
+
+ You can download the model and its components directly from this repository using one of the following methods:
+
+ ### Method 1: Direct Download via Browser
+
+ 1. **Visit the repository**: Navigate to the [Hugging Face model page](https://huggingface.co/linagora/linto-asr-ar-tn-0.1/tree/main).
+ 2. **Download the archive**: Click the download icon next to the archive you want (`vosk-model.zip` or `android-model.zip`).
+
+ ### Method 2: Using the `wget` command
+
+ You can use the commands below (note the `/resolve/main/` path, which serves the raw file):
+
+ ```bash
+ sudo apt-get install wget
+
+ wget https://huggingface.co/linagora/linto-asr-ar-tn-0.1/resolve/main/{model-name.zip}
+ ```
+
+ ### Method 3: Using Python code
+
+ You can use the following Python code:
+
+ ```python
+ import requests
+
+ url = "https://huggingface.co/linagora/linto-asr-ar-tn-0.1/resolve/main/{model-name.zip}"
+ output_path = "model.zip"
+
+ response = requests.get(url)
+ response.raise_for_status()
+ with open(output_path, "wb") as file:
+     file.write(response.content)
+
+ print(f"Downloaded to {output_path}")
+ ```
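Alternatively, a minimal sketch using the `huggingface_hub` library (assumes `pip install huggingface_hub`; pick `vosk-model.zip` or `android-model.zip` as the filename):

```python
from huggingface_hub import hf_hub_download

# Downloads the file into the local Hugging Face cache and returns its path
path = hf_hub_download(
    repo_id="linagora/linto-asr-ar-tn-0.1",
    filename="vosk-model.zip",
)
print(f"Downloaded to {path}")
```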
+
+ ### Method 4: Cloning the Repository
+
+ You can clone the repository and unzip the model you need:
+
+ ```bash
+ git clone https://huggingface.co/linagora/linto-asr-ar-tn-0.1.git
+
+ cd linto-asr-ar-tn-0.1
+
+ unzip <model-name.zip>
+ ```
+
+ ### Installation
+
+ First, make sure to install the required dependencies:
+
+ ```bash
+ pip install vosk
+ ```
+
+ ## How to Use
+
+ ```bash
+ python inference.py <path/to/your/model> <path/to/your/audio/file.wav>
+ ```
+
+ Or, from Python:
+
+ ```python
+ from vosk import Model, KaldiRecognizer
+ import wave
+ import json
+
+ model_dir = "path/to/your/model"
+ audio_file = "path/to/your/audio/file.wav"
+
+ model = Model(model_dir)
+
+ with wave.open(audio_file, "rb") as wf:
+     if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype() != "NONE":
+         raise ValueError("Audio file must be WAV format mono PCM.")
+
+     rec = KaldiRecognizer(model, wf.getframerate())
+     rec.AcceptWaveform(wf.readframes(wf.getnframes()))
+     res = rec.FinalResult()
+     transcript = json.loads(res)["text"]
+     print(f"Transcript: {transcript}")
+ ```
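For long recordings, you can feed the audio to the recognizer in chunks rather than in a single call, printing partial hypotheses as they arrive. A minimal sketch (paths are placeholders):

```python
import json
import wave

from vosk import KaldiRecognizer, Model

model = Model("path/to/your/model")

with wave.open("path/to/your/audio/file.wav", "rb") as wf:
    rec = KaldiRecognizer(model, wf.getframerate())
    while True:
        data = wf.readframes(4000)  # roughly a quarter second of 16 kHz audio
        if len(data) == 0:
            break
        if rec.AcceptWaveform(data):
            # End of an utterance: a final result is available
            print(json.loads(rec.Result())["text"])
        else:
            # Work in progress: a partial hypothesis
            print(json.loads(rec.PartialResult())["partial"])
    print(json.loads(rec.FinalResult())["text"])
```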
+
+ ## Example Audio
+
+ Here is an example of the transcription capabilities of the model:
+
+ <audio controls>
+     <source src="https://huggingface.co/linagora/linto-asr-ar-tn-0.1/resolve/main/sample.wav" type="audio/wav">
+ </audio>
+
+ ### Result
+ <p dir="rtl">
+ بالدعم هاذايا لي بثتهولو ال berd يعني أحنا حتى ال projet متاعو تقلب حتى sur le plan حتى فال management يا سيد نحنا في تسيير الشريكة يعني تبدل مية و ثمانين درجة ماللي يعني قبل ما تجيه ال berd و بعد ما جاتو ال berd برنامج نخصص لل les startup إسمو
+ </p>
+
+ ### WebRTC Demonstration
+
+ #### Installation
+ ```bash
+ pip install vosk aiortc aiohttp
+ ```
+
+ Then clone the repository and run the demo:
+
+ ```bash
+ git clone https://huggingface.co/linagora/linto-asr-ar-tn-0.1.git
+
+ cd linto-asr-ar-tn-0.1/Demo-WebRTC
+
+ python3 app.py <model-path>
+ ```
+
android-model.zip ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:91811df61515172420c583bb901cd6c4c200a6e4fdf3437f539f5dcf851cbd29
+ size 165666389
inference.py ADDED
@@ -0,0 +1,28 @@
+ import sys
+ import wave
+ import json
+ from vosk import Model, KaldiRecognizer
+
+ def load_model(model_dir):
+     model = Model(model_dir)
+     return model
+
+ def transcribe_audio(model, audio_file):
+     with wave.open(audio_file, "rb") as wf:
+         if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype() != "NONE":
+             raise ValueError("Audio file must be WAV format mono PCM.")
+
+         rec = KaldiRecognizer(model, wf.getframerate())
+         rec.AcceptWaveform(wf.readframes(wf.getnframes()))
+         res = rec.FinalResult()
+         result = json.loads(res)["text"]
+         return result
+
+ if __name__ == "__main__":
+     if len(sys.argv) != 3:
+         sys.exit(f"Usage: {sys.argv[0]} <model-dir> <audio-file.wav>")
+     model_dir = sys.argv[1]   # path to the unzipped model directory
+     audio_file = sys.argv[2]  # path to a mono 16-bit PCM WAV file
+
+     model = load_model(model_dir)
+     transcript = transcribe_audio(model, audio_file)
+     print(f"Transcript: {transcript}")
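If you also need word-level timing information, the vosk recognizer can attach it to final results via `SetWords`. A minimal sketch (paths are placeholders):

```python
import json
import wave

from vosk import KaldiRecognizer, Model

model = Model("path/to/your/model")

with wave.open("path/to/your/audio/file.wav", "rb") as wf:
    rec = KaldiRecognizer(model, wf.getframerate())
    rec.SetWords(True)  # include per-word start/end times in final results
    rec.AcceptWaveform(wf.readframes(wf.getnframes()))
    res = json.loads(rec.FinalResult())
    for word in res.get("result", []):
        print(f"{word['word']}: {word['start']:.2f}s - {word['end']:.2f}s")
```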
sample.wav ADDED
Binary file (480 kB).
vosk-model.zip ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:64e999b1bc27237e714ff6650fc7bd86d5ce10eb8d410c3495f46496da5d577d
+ size 541801652