Yurii Paniv committed
Commit e0a3506
Parent: 13aac28

Implement recording functionality

Files changed (5)
  1. .gitignore +2 -0
  2. client.py +119 -0
  3. main.py +11 -3
  4. requirements.txt +2 -1
  5. templates/hello.html +65 -31
.gitignore CHANGED
@@ -127,3 +127,5 @@ dmypy.json
 
 # Pyre type checker
 .pyre/
+
+*.tflite
client.py ADDED
@@ -0,0 +1,119 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+from __future__ import absolute_import, division, print_function
+
+import argparse
+import numpy as np
+import shlex
+import subprocess
+import sys
+import wave
+import json
+
+from deepspeech import Model, version
+from timeit import default_timer as timer
+
+try:
+    from shlex import quote
+except ImportError:
+    from pipes import quote
+
+
+def convert_samplerate(audio_path, desired_sample_rate):
+    sox_cmd = 'sox {} --type raw --bits 16 --channels 1 --rate {} --encoding signed-integer --endian little --compression 0.0 --no-dither - '.format(
+        quote(audio_path), desired_sample_rate)
+    try:
+        output = subprocess.check_output(
+            shlex.split(sox_cmd), stderr=subprocess.PIPE)
+    except subprocess.CalledProcessError as e:
+        raise RuntimeError('SoX returned non-zero status: {}'.format(e.stderr))
+    except OSError as e:
+        raise OSError(e.errno, 'SoX not found, use {}hz files or install it: {}'.format(
+            desired_sample_rate, e.strerror))
+
+    return desired_sample_rate, np.frombuffer(output, np.int16)
+
+
+def metadata_to_string(metadata):
+    return ''.join(token.text for token in metadata.tokens)
+
+
+def words_from_candidate_transcript(metadata):
+    word = ""
+    word_list = []
+    word_start_time = 0
+    # Loop through each character
+    for i, token in enumerate(metadata.tokens):
+        # Append character to word if it's not a space
+        if token.text != " ":
+            if len(word) == 0:
+                # Log the start time of the new word
+                word_start_time = token.start_time
+
+            word = word + token.text
+        # Word boundary is either a space or the last character in the array
+        if token.text == " " or i == len(metadata.tokens) - 1:
+            word_duration = token.start_time - word_start_time
+
+            if word_duration < 0:
+                word_duration = 0
+
+            each_word = dict()
+            each_word["word"] = word
+            each_word["start_time"] = round(word_start_time, 4)
+            each_word["duration"] = round(word_duration, 4)
+
+            word_list.append(each_word)
+            # Reset
+            word = ""
+            word_start_time = 0
+
+    return word_list
+
+
+def metadata_json_output(metadata):
+    json_result = dict()
+    json_result["transcripts"] = [{
+        "confidence": transcript.confidence,
+        "words": words_from_candidate_transcript(transcript),
+    } for transcript in metadata.transcripts]
+    return json.dumps(json_result, indent=2)
+
+
+class VersionAction(argparse.Action):
+    def __init__(self, *args, **kwargs):
+        super(VersionAction, self).__init__(nargs=0, *args, **kwargs)
+
+    def __call__(self, *args, **kwargs):
+        print('DeepSpeech ', version())
+        exit(0)
+
+
+def client(audio_file):
+    model_load_start = timer()
+    # sphinx-doc: python_ref_model_start
+    ds = Model("./uk.tflite")
+    # sphinx-doc: python_ref_model_stop
+    model_load_end = timer() - model_load_start
+    print('Loaded model in {:.3}s.'.format(model_load_end), file=sys.stderr)
+
+    desired_sample_rate = ds.sampleRate()
+
+    fin = wave.open(audio_file, 'rb')
+    fs_orig = fin.getframerate()  # note: the audio is fed to the model at this rate, without resampling
+    audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
+
+    audio_length = fin.getnframes() * (1 / fs_orig)
+    fin.close()
+
+    print('Running inference.', file=sys.stderr)
+    inference_start = timer()
+    # sphinx-doc: python_ref_inference_start
+
+    result = ds.stt(audio)
+    print(result)
+    # sphinx-doc: python_ref_inference_stop
+    inference_end = timer() - inference_start
+    print('Inference took %0.3fs for %0.3fs audio file.' %
+          (inference_end, audio_length), file=sys.stderr)
+    return result
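For a quick local check of the new helper outside Flask, a minimal sketch (the file name test.wav is illustrative; it assumes uk.tflite sits in the working directory and that the WAV is already 16 kHz mono, since client() does not resample):

    # Hypothetical smoke test for client.py; test.wav is an assumed 16 kHz mono recording.
    from client import client

    if __name__ == "__main__":
        transcript = client("test.wav")  # wave.open() accepts a path as well as a file object
        print("Transcript:", transcript)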
main.py CHANGED
@@ -1,5 +1,8 @@
-from flask import Flask, render_template
-app = Flask(__name__)
+from flask import Flask, render_template, request
+from io import BytesIO
+from client import client
+
+app = Flask(__name__)
 
 
 @app.route('/')
@@ -9,4 +12,9 @@ def index():
 
 @app.route('/recognize', methods=["POST"])
 def recognize():
-    return 'Hello, World!'
+    file = request.files['file']
+    audio = BytesIO()
+    file.save(audio)
+    audio.seek(0)
+    result = client(audio)
+    return result
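To exercise the new /recognize endpoint without the browser UI, a sketch using the requests library (assumed installed separately; it is not in requirements.txt). It assumes the Flask dev server is running on localhost:5000 and that test.wav is a hypothetical recording:

    # Sketch: POST a recording to /recognize; the field name must be 'file' to match request.files['file'].
    import requests

    with open("test.wav", "rb") as f:  # hypothetical sample file
        resp = requests.post("http://localhost:5000/recognize", files={"file": f})
    print(resp.text)  # plain-text transcript returned by client()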
requirements.txt CHANGED
@@ -1,2 +1,3 @@
 Flask==1.1.2
-deepspeech-tflite==0.7.3
+deepspeech-tflite==0.7.3
+numpy==1.17.0
templates/hello.html CHANGED
@@ -13,49 +13,83 @@
 <h1>Audio Recording Test</h1>
 <p>Talk for 3 seconds, then you will hear your recording played back</p>
 <button class="btn btn-primary" id="action" onclick="handleAction()">Start recording...</button>
+<div id="result"></div>
+<script src="https://cdn.rawgit.com/mattdiamond/Recorderjs/08e7abd9/dist/recorder.js"></script>
 <script>
-    const recordAudio = () =>
-        new Promise(async resolve => {
-            const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
-            const mediaRecorder = new MediaRecorder(stream, { audioBitsPerSecond: 16000 });
-            const audioChunks = [];
+    var gumStream; // stream from getUserMedia()
+    var rec; // Recorder.js object
+    var input; // MediaStreamAudioSourceNode we'll be recording
 
-            mediaRecorder.addEventListener("dataavailable", event => {
-                audioChunks.push(event.data);
-            });
+    // shim for AudioContext when it's not available
+    var AudioContext = window.AudioContext || window.webkitAudioContext;
+    var audioContext; // audio context to help us record
+    var resultNode = document.getElementById('result');
 
-            const start = () => mediaRecorder.start();
+    function resultProcess(data) { // shows "Довжина тексту" (text length) and "Текст" (text)
+        resultNode.textContent = `Довжина тексту: ${data.length} \n
+        Текст: ${data}
+        `
+    }
+
+    function exportWAV(blob) {
+        var data = new FormData()
+        data.append('file', blob);
+        fetch(`./recognize`, { method: "POST", body: data })
+            .then(response => response.text())
+            .then(resultProcess);
+    }
+    function record() {
+        var constraints = { audio: true, video: false }
+        navigator.mediaDevices.getUserMedia(constraints).then(function (stream) {
+            console.log("getUserMedia() success, stream created, initializing Recorder.js ...");
+
+            /*
+            create an audio context after getUserMedia is called;
+            sampleRate might change after getUserMedia is called, like it does on macOS when recording through AirPods;
+            the sampleRate defaults to the one set in your OS for your playback device
+            */
+            audioContext = new AudioContext();
 
-            const stop = () =>
-                new Promise(resolve => {
-                    mediaRecorder.addEventListener("stop", () => {
-                        const audioBlob = new Blob(audioChunks);
+            /* assign to gumStream for later use */
+            gumStream = stream;
 
-                        const audioUrl = URL.createObjectURL(audioBlob);
-                        fetch(`./recognize`, { method: "POST", body: audioBlob })
-                            .then(response => console.log(response.text()))
-                        const audio = new Audio(audioUrl);
-                        const play = () => audio.play();
-                        resolve({ audioBlob, audioUrl, play });
-                    });
+            /* use the stream */
+            input = audioContext.createMediaStreamSource(stream);
 
-                    mediaRecorder.stop();
-                });
+            /*
+            Create the Recorder object and configure it to record mono sound (1 channel);
+            recording 2 channels would double the file size
+            */
+            rec = new Recorder(input, { numChannels: 1 })
+
+            // start the recording process
+            rec.record()
+
+            console.log("Recording started");
+            sleep(3000).then(stop);
+        })
+    }
+
+
+    function stop() {
+        rec.stop();
+
+        // stop microphone access
+        gumStream.getAudioTracks()[0].stop();
+
+        // create the WAV blob and pass it on to exportWAV
+        rec.exportWAV(exportWAV);
+        console.log("Recording stopped")
+
+    }
 
-            resolve({ start, stop });
-        });
 
     const sleep = time => new Promise(resolve => setTimeout(resolve, time));
 
-    const handleAction = async () => {
-        const recorder = await recordAudio();
+    async function handleAction() {
         const actionButton = document.getElementById('action');
         actionButton.disabled = true;
-        recorder.start();
-        await sleep(3000);
-        const audio = await recorder.stop();
-        audio.play();
-        await sleep(3000);
+        record();
         actionButton.disabled = false;
     }
 </script>
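Recorder.js exports the WAV at the browser's AudioContext sample rate (typically 44.1 or 48 kHz), while client() feeds the samples to the model unchanged. A hedged sketch of resampling on the server with the existing convert_samplerate() helper (assumes SoX is installed; the temporary-file handling and the name load_resampled are illustrative, not part of this commit):

    # Sketch: resample an uploaded recording to the model's rate before inference.
    # Assumes SoX is installed; reuses convert_samplerate() from client.py.
    import tempfile
    from client import convert_samplerate

    def load_resampled(file_storage, desired_sample_rate=16000):
        # SoX reads from disk, so write the upload to a temporary WAV first.
        with tempfile.NamedTemporaryFile(suffix=".wav") as tmp:
            file_storage.save(tmp.name)
            _, audio = convert_samplerate(tmp.name, desired_sample_rate)
        return audio  # int16 NumPy array, ready for ds.stt()

In main.py's recognize() this could replace the BytesIO round-trip, with the returned array passed to the model directly.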