upload the model, WebRTC app and the inference script
- Demo-WebRTC/README.md +43 -0
- Demo-WebRTC/app.py +156 -0
- Demo-WebRTC/static/client.js +200 -0
- Demo-WebRTC/static/index.html +23 -0
- Demo-WebRTC/static/linagora.png +0 -0
- Demo-WebRTC/static/styles.css +175 -0
- Demo-WebRTC/static/tunisian_flag.png +0 -0
- README.md +131 -0
- android-model.zip +3 -0
- inference.py +28 -0
- sample.wav +0 -0
- vosk-model.zip +3 -0
Demo-WebRTC/README.md
ADDED
# webrtc vosk-server

## Setup environment and run it

### Set model path

Set the path to ./model.
The models can be downloaded from https://alphacephei.com/vosk/models
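For this Tunisian model, you can instead unzip one of the archives shipped at the root of this repository (run from the Demo-WebRTC directory; the target directory name, and the layout inside the zip, are assumptions to adapt):

```sh
$ unzip ../vosk-model.zip -d model
```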
### Python environment

The sample works with Python 3.8:

```sh
$ python3 -m pip install aiortc aiohttp aiorpc vosk
```

If installing aiortc fails on your system, install gcc in your environment and run the pip install again.
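For example, on a Debian-based system (the package manager is an assumption; adapt to your distribution):

```sh
$ sudo apt-get install gcc
$ python3 -m pip install aiortc
```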
### Local execution

Run the server:

```sh
$ python3 app.py <model-path>
```

Now open a web browser at http://localhost:8010/.

### Execution on a LAN

To test the demo from another computer on the LAN, the web page must be served over HTTPS, because modern web browsers (such as Chrome and Firefox) don't allow access to the microphone unless the host is `localhost` or the page is served securely.

Thus, an SSL certificate is required to test the demo from other computers or smartphones. An untrusted self-signed certificate will work fine in most browsers (iOS Safari is the exception). You can use [mkcert](https://github.com/FiloSottile/mkcert) to make your own self-signed *cert* and *key* files.
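For instance, with mkcert (the LAN IP below is a placeholder; use your server's address):

```sh
$ mkcert -install
$ mkcert -cert-file cert.pem -key-file key.pem 192.168.1.42 localhost
```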
Then point the server at the generated files:

```sh
$ export VOSK_CERT_FILE="/path/to/cert.pem"
$ export VOSK_KEY_FILE="/path/to/key.pem"
$ python3 app.py <model-path>
```

Now, on the other computer, open a web browser at https://SERVER_IP:8010/, replacing `SERVER_IP` with the IP address of your Vosk server.
Demo-WebRTC/app.py
ADDED
#!/usr/bin/env python3

import json
import ssl
import os
import concurrent.futures
import asyncio
import sys

from pathlib import Path
from vosk import KaldiRecognizer, Model
from aiohttp import web
from aiortc import RTCSessionDescription, RTCPeerConnection
from av.audio.resampler import AudioResampler

ROOT = Path(__file__).parent

# Path to the Vosk model directory, passed on the command line
K_model_path = sys.argv[1]

vosk_interface = os.environ.get('VOSK_SERVER_INTERFACE', 'localhost')
vosk_port = int(os.environ.get('VOSK_SERVER_PORT', 8010))
vosk_model_path = os.environ.get('VOSK_MODEL_PATH', K_model_path)
vosk_cert_file = os.environ.get('VOSK_CERT_FILE', None)
vosk_key_file = os.environ.get('VOSK_KEY_FILE', None)
vosk_dump_file = os.environ.get('VOSK_DUMP_FILE', None)

model = Model(vosk_model_path)
pool = concurrent.futures.ThreadPoolExecutor(os.cpu_count() or 1)
dump_fd = None if vosk_dump_file is None else open(vosk_dump_file, "wb")


def process_chunk(rec, message):
    # Run recognition on one chunk of PCM audio; returns a final result
    # once a segment is complete, otherwise a partial result.
    try:
        res = rec.AcceptWaveform(message)
    except Exception:
        result = None
    else:
        if res > 0:
            result = rec.Result()
        else:
            result = rec.PartialResult()
    return result


class KaldiTask:
    def __init__(self, user_connection):
        self.__resampler = AudioResampler(format='s16', layout='mono', rate=16000)
        self.__pc = user_connection
        self.__audio_task = None
        self.__track = None
        self.__channel = None
        self.__recognizer = KaldiRecognizer(model, 16000)

    async def set_audio_track(self, track):
        self.__track = track

    async def set_text_channel(self, channel):
        self.__channel = channel

    async def start(self):
        self.__audio_task = asyncio.create_task(self.__run_audio_xfer())

    async def stop(self):
        if self.__audio_task is not None:
            self.__audio_task.cancel()
            self.__audio_task = None

    async def __run_audio_xfer(self):
        loop = asyncio.get_running_loop()

        max_frames = 20
        frames = []
        while True:
            fr = await self.__track.recv()
            frames.append(fr)

            # We need to collect frames so we don't send partial results too often
            if len(frames) < max_frames:
                continue

            # Resample to 16 kHz mono s16 and concatenate the raw PCM bytes
            dataframes = bytearray(b'')
            for fr in frames:
                for rfr in self.__resampler.resample(fr):
                    dataframes += bytes(rfr.planes[0])[:rfr.samples * 2]
            frames.clear()

            if dump_fd is not None:
                dump_fd.write(bytes(dataframes))

            # Run the blocking recognizer call in the thread pool
            result = await loop.run_in_executor(pool, process_chunk, self.__recognizer, bytes(dataframes))
            if result is not None:
                print(result)
                self.__channel.send(result)


async def index(request):
    content = open(str(ROOT / 'static' / 'index.html')).read()
    return web.Response(content_type='text/html', text=content)


async def offer(request):
    params = await request.json()
    offer = RTCSessionDescription(
        sdp=params['sdp'],
        type=params['type'])

    pc = RTCPeerConnection()

    kaldi = KaldiTask(pc)

    @pc.on('datachannel')
    async def on_datachannel(channel):
        channel.send('{}')  # Dummy message to make the UI change to "Listening"
        await kaldi.set_text_channel(channel)
        await kaldi.start()

    @pc.on('iceconnectionstatechange')
    async def on_iceconnectionstatechange():
        if pc.iceConnectionState == 'failed':
            await pc.close()

    @pc.on('track')
    async def on_track(track):
        if track.kind == 'audio':
            await kaldi.set_audio_track(track)

        @track.on('ended')
        async def on_ended():
            await kaldi.stop()

    await pc.setRemoteDescription(offer)
    answer = await pc.createAnswer()
    await pc.setLocalDescription(answer)

    return web.Response(
        content_type='application/json',
        text=json.dumps({
            'sdp': pc.localDescription.sdp,
            'type': pc.localDescription.type
        }))


if __name__ == '__main__':
    if vosk_cert_file:
        ssl_context = ssl.SSLContext()
        ssl_context.load_cert_chain(vosk_cert_file, vosk_key_file)
    else:
        ssl_context = None

    app = web.Application()
    app.router.add_post('/offer', offer)
    app.router.add_get('/', index)
    app.router.add_static('/static/', path=ROOT / 'static', name='static')

    web.run_app(app, port=vosk_port, ssl_context=ssl_context)
Demo-WebRTC/static/client.js
ADDED
var pc = null;
var dc = null, dcInterval = null;

var start_btn = document.getElementById('start');
var stop_btn = document.getElementById('stop');
var statusField = document.getElementById('status');

function btn_show_stop() {
    start_btn.classList.add('d-none');
    stop_btn.classList.remove('d-none');
}

function btn_show_start() {
    stop_btn.classList.add('d-none');
    start_btn.classList.remove('d-none');
    statusField.innerText = 'Press start';
}

function negotiate() {
    return pc.createOffer().then(function (offer) {
        return pc.setLocalDescription(offer);
    }).then(function () {
        return new Promise(function (resolve) {
            if (pc.iceGatheringState === 'complete') {
                resolve();
            } else {
                function checkState() {
                    if (pc.iceGatheringState === 'complete') {
                        pc.removeEventListener('icegatheringstatechange', checkState);
                        resolve();
                    }
                }
                pc.addEventListener('icegatheringstatechange', checkState);
            }
        });
    }).then(function () {
        var offer = pc.localDescription;
        console.log(offer.sdp);
        return fetch('offer', {
            body: JSON.stringify({
                sdp: offer.sdp,
                type: offer.type,
            }),
            headers: {
                'Content-Type': 'application/json'
            },
            method: 'POST'
        });
    }).then(function (response) {
        return response.json();
    }).then(function (answer) {
        console.log(answer.sdp);
        return pc.setRemoteDescription(answer);
    }).catch(function (e) {
        console.log(e);
        btn_show_start();
    });
}

function updateTranscriptionBox(text, isPartial) {
    const transcriptionBox = document.getElementById('transcription-box');
    let content = transcriptionBox.innerHTML;

    // Remove the previous partial text if it exists and remove the old cursor
    content = content.replace(/<span class="partial">.*?<\/span>/, '');
    content = content.replace(/<div id="transcription-cursor"><\/div>/, '');

    // Update content with the new text and add the blinking cursor
    if (isPartial) {
        content += `<span class="partial">${text}</span>`;
    } else {
        content += `${text} `;
    }

    // Add the blinking cursor at the end
    content += '<div id="transcription-cursor"></div>';
    transcriptionBox.innerHTML = content;

    // Ensure the transcription box scrolls to the bottom as new text is added
    transcriptionBox.scrollTop = transcriptionBox.scrollHeight;
}

function performRecvText(str) {
    updateTranscriptionBox(str, false);
}

function performRecvPartial(str) {
    updateTranscriptionBox(str, true);
}

function start() {
    // Clear the transcription box
    const transcriptionBox = document.getElementById('transcription-box');
    transcriptionBox.innerHTML = '';

    btn_show_stop();
    statusField.innerText = 'Connecting...';
    var config = {
        sdpSemantics: 'unified-plan',
        iceServers: [{urls: 'stun:stun.l.google.com:19302'}]
    };

    pc = new RTCPeerConnection(config);

    dc = pc.createDataChannel('result');
    dc.onclose = function () {
        clearInterval(dcInterval);
        console.log('Closed data channel');
        btn_show_start();
    };
    dc.onopen = function () {
        console.log('Opened data channel');
    };
    dc.onmessage = function (messageEvent) {
        statusField.innerText = "Listening... say something";

        if (!messageEvent.data) {
            return;
        }

        let voskResult;
        try {
            voskResult = JSON.parse(messageEvent.data);
        } catch (error) {
            console.error(`ERROR: ${error.message}`);
            return;
        }
        if ((voskResult.text?.length || 0) > 0) {
            performRecvText(voskResult.text);
        } else if ((voskResult.partial?.length || 0) > 0) {
            performRecvPartial(voskResult.partial);
        }
    };

    pc.oniceconnectionstatechange = function () {
        if (pc.iceConnectionState == 'disconnected') {
            console.log('Disconnected');
            btn_show_start();
        }
    };

    var audioConstraints = {
        audio: {
            sampleRate: 16000, // Optimize sample rate for speech
            channelCount: 1,   // Mono audio is sufficient for voice
            echoCancellation: true,
            noiseSuppression: true,
            autoGainControl: true,
            latency: 1
        },
        video: false
    };

    navigator.mediaDevices.getUserMedia(audioConstraints).then(function (stream) {
        stream.getTracks().forEach(function (track) {
            pc.addTrack(track, stream);
        });
        return negotiate();
    }, function (err) {
        console.log('Could not acquire media: ' + err);
        btn_show_start();
    });
}

function stop() {
    // close data channel
    if (dc) {
        dc.close();
    }

    // close transceivers
    if (pc.getTransceivers) {
        pc.getTransceivers().forEach(function (transceiver) {
            if (transceiver.stop) {
                transceiver.stop();
            }
        });
    }

    // close local audio / video
    pc.getSenders().forEach(function (sender) {
        sender.track.stop();
    });

    // close peer connection
    setTimeout(function () {
        pc.close();
    }, 500);
}

start_btn.addEventListener('click', start);
stop_btn.addEventListener('click', stop);
Demo-WebRTC/static/index.html
ADDED
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title class="line-1 anim-typewriter">Tunisian STT</title>
    <link rel="stylesheet" href="static/styles.css">
</head>
<body>
    <div class="container">
        <div class="header">
            <img src="static/linagora.png" alt="Linagora" class="logo">
            <h1>Tunisian STT <img src="static/tunisian_flag.png" alt="Tunisian Flag" class="flag-icon"></h1>
        </div>
        <div id="transcription-box">قل ما عندك ...</div>
        <button id="start" class="play-button">Start</button>
        <button id="stop" class="pause-button d-none">Stop</button>
        <span id="status" class="text-uppercase text-muted">Press start</span>
    </div>

    <script src="static/client.js"></script>
</body>
</html>
Demo-WebRTC/static/linagora.png
ADDED
Demo-WebRTC/static/styles.css
ADDED
@import url(https://fonts.googleapis.com/css?family=Anonymous+Pro);
@import url(https://fonts.googleapis.com/earlyaccess/droidarabickufi.css);

body {
    margin: 0;
    padding: 20px;
    background-color: #000; /* Black background */
    color: #fff; /* White text color */
}

.container {
    max-width: 800px;
    margin: 0 auto;
}

.header {
    display: flex;
    align-items: center;
    justify-content: space-between;
    margin-bottom: 20px;
}

.logo {
    height: 40px;
}

h1 {
    font-family: 'Anonymous Pro', monospace;
    text-align: center;
    margin: 0;
    color: #cda5a5; /* Muted rose heading color */
}

.flag-icon {
    width: 40px; /* Adjust the width as needed */
    height: 27px;
    vertical-align: text-top; /* Aligns the flag vertically with the text */
    padding-top: 2px;
}

#transcription-box {
    font-family: 'Droid Arabic Kufi';
    position: relative;
    height: 600px;
    width: 100%;
    color: #fff; /* White text color */
    margin: 10% auto 0;
    background: linear-gradient(0deg, #000, #272727);
    margin-bottom: 15px;
    border-radius: 23px;
    direction: rtl;
    text-align: right;
    padding: 15px;
    line-height: 30px;
    font-size: 20px;
}

#transcription-box::before, #transcription-box::after {
    content: '';
    position: absolute;
    left: -5px;
    top: -5px;
    border-radius: 25px;
    background: linear-gradient(45deg,
        #ff4545,
        #e64949,
        #9c4545,
        #d85f5f,
        #ff9393,
        #f5a3a3,
        #f3bbbb,
        #f5bdbd,
        #f52121,
        #f5a1a1,
        #ff4545,
        #f7b2b2,
        #f7e0e0);
    background-size: 400%;
    width: calc(100% + 10px);
    height: calc(100% + 10px);
    z-index: -1;
    animation: anim-moving-glow 20s linear infinite;
}

@keyframes anim-moving-glow {
    0% {
        background-position: 0 0;
    }
    50% {
        background-position: 400% 0;
    }
    100% {
        background-position: 0 0;
    }
}

#transcription-box::after {
    filter: blur(10px);
}

@keyframes spin {
    0% {
        transform: rotate(0deg);
    }
    100% {
        transform: rotate(360deg);
    }
}

button {
    padding: 10px 20px;
    font-size: 16px;
    cursor: pointer;
    color: #000; /* Black text on buttons */
    background-color: #beadad; /* Light grayish-pink button background */
    border: none;
    border-radius: 10px;
    transition: transform 0.3s ease, box-shadow 0.3s ease;
    animation: pulse 2s infinite; /* Adding pulse animation */
}

button:hover {
    transform: scale(1.1); /* Slightly enlarges button on hover */
    box-shadow: 0 0 10px rgba(255, 0, 0, 0.7); /* Adds glowing effect */
}

@keyframes pulse {
    0% {
        transform: scale(1);
        box-shadow: 0 0 10px rgba(255, 0, 0, 0.5);
    }
    50% {
        transform: scale(1.05);
        box-shadow: 0 0 20px rgba(255, 255, 255, 0.8);
    }
    100% {
        transform: scale(1);
        box-shadow: 0 0 10px rgba(255, 0, 0, 0.925);
    }
}

#transcription-cursor {
    display: inline-block;
    width: 1px;
    height: 24px;
    background-color: #fff; /* White color for the cursor */
    animation: blink 1s step-end infinite;
    vertical-align: bottom;
}

@keyframes blink {
    from, to {
        visibility: hidden;
    }
    50% {
        visibility: visible;
    }
}

.partial {
    color: #888;
    font-style: italic;
}

#status {
    font-size: 9pt;
    margin-left: 10px;
}

.d-none {
    display: none;
}
Demo-WebRTC/static/tunisian_flag.png
ADDED
README.md
CHANGED
---
license: apache-2.0
language:
- ar
---

# LinTO-ASR-AR-TN-0.1

LinTO-ASR-AR-TN-0.1 is an Automatic Speech Recognition (ASR) model tailored for the Tunisian dialect, with additional support for code-switching, particularly for words in French and English. This repository includes two versions of the model:

- Vosk Model: the original, comprehensive model.
- Android Model: a lighter version with a simplified graph, optimized for deployment on Android devices or Raspberry Pi applications.

## Model Overview

- **Model type**: Kaldi LinTO ASR
- **Language(s)**: Tunisian Dialect
- **Use cases**: Automatic Speech Recognition (ASR)

## Downloading the Model

You can download the model and its components directly from this repository using one of the following methods:

### Method 1: Direct Download via Browser

1. **Visit the Repository**: Navigate to the [Hugging Face model page](https://huggingface.co/linagora/linto-asr-ar-tn-0.1/tree/main).
2. **Download a Zip**: Click the download link next to the archive you want (`vosk-model.zip` or `android-model.zip`).

### Method 2: Using the `wget` command

You can use the commands below, replacing `{model-name.zip}` with the archive you want:

```bash
sudo apt-get install wget

wget https://huggingface.co/linagora/linto-asr-ar-tn-0.1/resolve/main/{model-name.zip}
```
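For example, to fetch the lighter Android model included in this repository:

```bash
wget https://huggingface.co/linagora/linto-asr-ar-tn-0.1/resolve/main/android-model.zip
```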
### Method 3: Using Python code

You can use the following Python code:

```python
import requests

url = "https://huggingface.co/linagora/linto-asr-ar-tn-0.1/resolve/main/{model-name.zip}"
output_path = "{model-name.zip}"

response = requests.get(url)
with open(output_path, "wb") as file:
    file.write(response.content)

print(f"Downloaded to {output_path}")
```
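Alternatively, the `huggingface_hub` library handles large files and caching for you; a minimal sketch, assuming you want the full Vosk model:

```python
from huggingface_hub import hf_hub_download

# Downloads into the local Hugging Face cache and returns the file path
path = hf_hub_download(repo_id="linagora/linto-asr-ar-tn-0.1", filename="vosk-model.zip")
print(f"Downloaded to {path}")
```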
### Method 4: Cloning the Repository

You can clone the repository and unzip the model you need:

```bash
git clone https://huggingface.co/linagora/linto-asr-ar-tn-0.1.git

cd linto-asr-ar-tn-0.1

unzip <model-name.zip>
```
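Note that the model zips are stored with Git LFS (see the pointer files at the end of this commit), so set up LFS before cloning, otherwise you will get small pointer files instead of the archives:

```bash
# One-time setup; requires the git-lfs package to be installed
git lfs install
```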
### Installation

First, make sure to install the required dependencies:

```bash
pip install vosk
```

## How to Use

```bash
python inference.py <path/to/your/model> <path/to/your/audio/file.wav>
```

#### OR

```python
from vosk import Model, KaldiRecognizer
import wave
import json

model_dir = "path/to/your/model"
audio_file = "path/to/your/audio/file.wav"

model = Model(model_dir)

with wave.open(audio_file, "rb") as wf:
    if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype() != "NONE":
        raise ValueError("Audio file must be WAV format mono PCM.")

    rec = KaldiRecognizer(model, wf.getframerate())
    rec.AcceptWaveform(wf.readframes(wf.getnframes()))
    res = rec.FinalResult()
    transcript = json.loads(res)["text"]
    print(f"Transcript: {transcript}")
```
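Reading the whole file with a single `AcceptWaveform` call works for short clips; for longer recordings you can feed the recognizer in chunks and collect segments as they become final (a sketch using the same Vosk API as above):

```python
from vosk import Model, KaldiRecognizer
import wave
import json

model = Model("path/to/your/model")

with wave.open("path/to/your/audio/file.wav", "rb") as wf:
    rec = KaldiRecognizer(model, wf.getframerate())
    pieces = []
    while True:
        data = wf.readframes(4000)  # roughly 0.25 s of 16 kHz mono PCM per call
        if len(data) == 0:
            break
        if rec.AcceptWaveform(data):
            # A segment was finalized; keep its text
            pieces.append(json.loads(rec.Result())["text"])
    pieces.append(json.loads(rec.FinalResult())["text"])
    print("Transcript:", " ".join(p for p in pieces if p))
```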
## Example Audio

Here is an example of the transcription capabilities of the model:

<audio controls>
  <source src="https://huggingface.co/linagora/linto-asr-ar-tn-0.1/resolve/main/sample.wav" type="audio/wav">
</audio>

## Result:
<p dir="rtl">
بالدعم هاذايا لي بثتهولو ال berd يعني أحنا حتى ال projet متاعو تقلب حتى sur le plan حتى فال management يا سيد نحنا في تسيير الشريكة يعني تبدل مية و ثمانين درجة ماللي يعني قبل ما تجيه ال berd و بعد ما جاتو ال berd برنامج نخصص لل les startup إسمو
</p>

### WebRTC Demonstration

#### Installation

```bash
pip install vosk aiortc aiohttp
```

Then clone the repository and start the demo server:

```bash
git clone https://huggingface.co/linagora/linto-asr-ar-tn-0.1.git

cd linto-asr-ar-tn-0.1/Demo-WebRTC

python3 app.py <model-path>
```
android-model.zip
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:91811df61515172420c583bb901cd6c4c200a6e4fdf3437f539f5dcf851cbd29
size 165666389
inference.py
ADDED
import sys
import wave
import json
from vosk import Model, KaldiRecognizer

def load_model(model_dir):
    model = Model(model_dir)
    return model

def transcribe_audio(model, audio_file):
    with wave.open(audio_file, "rb") as wf:
        if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype() != "NONE":
            raise ValueError("Audio file must be WAV format mono PCM.")

        rec = KaldiRecognizer(model, wf.getframerate())
        rec.AcceptWaveform(wf.readframes(wf.getnframes()))
        res = rec.FinalResult()
        result = json.loads(res)["text"]
        return result

if __name__ == "__main__":
    model_dir = sys.argv[1]   # path to the unzipped model directory
    audio_file = sys.argv[2]  # path to the WAV file to transcribe

    model = load_model(model_dir)
    transcript = transcribe_audio(model, audio_file)
    print(f"Transcript: {transcript}")
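For example, with the sample.wav file shipped in this repository (assuming you unzipped one of the model archives into a local directory, here hypothetically named ./model):

```bash
python inference.py ./model sample.wav
```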
sample.wav
ADDED
Binary file (480 kB).
vosk-model.zip
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:64e999b1bc27237e714ff6650fc7bd86d5ce10eb8d410c3495f46496da5d577d
size 541801652