Mohabedalgani asafaya committed on
Commit
fea5bdc
0 Parent(s):

Duplicate from asafaya/arabic-audio-transcription


Co-authored-by: Ali Safaya <asafaya@users.noreply.huggingface.co>

Files changed (5)
  1. .gitattributes +35 -0
  2. README.md +47 -0
  3. app.py +170 -0
  4. output.wav +3 -0
  5. requirements.txt +4 -0
.gitattributes ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ output.wav filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,47 @@
+ ---
+ title: Arabic Audio Transcription
+ emoji: 🎙️
+ colorFrom: blue
+ colorTo: blue
+ sdk: gradio
+ app_file: app.py
+ pinned: false
+ license: cc-by-nc-4.0
+ duplicated_from: asafaya/arabic-audio-transcription
+ ---
+
+ # Configuration
+
+ `title`: _string_
+ Display title for the Space
+
+ `emoji`: _string_
+ Space emoji (emoji-only character allowed)
+
+ `colorFrom`: _string_
+ Color for Thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray)
+
+ `colorTo`: _string_
+ Color for Thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray)
+
+ `sdk`: _string_
+ Can be either `gradio`, `streamlit`, or `static`
+
+ `sdk_version`: _string_
+ Only applicable for `streamlit` SDK.
+ See [doc](https://hf.co/docs/hub/spaces) for more info on supported versions.
+
+ `app_file`: _string_
+ Path to your main application file (which contains either `gradio` or `streamlit` Python code, or `static` html code).
+ Path is relative to the root of the repository.
+
+ `models`: _List[string]_
+ HF model IDs (like "gpt2" or "deepset/roberta-base-squad2") used in the Space.
+ Will be parsed automatically from your code if not specified here.
+
+ `datasets`: _List[string]_
+ HF dataset IDs (like "common_voice" or "oscar-corpus/OSCAR-2109") used in the Space.
+ Will be parsed automatically from your code if not specified here.
+
+ `pinned`: _boolean_
+ Whether the Space stays on top of your list.
app.py ADDED
@@ -0,0 +1,170 @@
+
+ import shutil
+ import os
+ import tempfile
+
+ from collections import OrderedDict
+ from glob import glob
+
+ import numpy
+ import torch
+ import torchaudio
+ import torchaudio.functional as F
+
+ from pydub import AudioSegment
+ from tqdm import tqdm
+
+ from speechbrain.pretrained import VAD
+ from speechbrain.pretrained import EncoderASR
+
+ import gradio as gr
+
+ tempdir = tempfile.mkdtemp()
+
+ def read_and_resample(filename, outdir):
+     # convert the input to a 16 kHz mono wav via ffmpeg (pydub)
+
+     AudioSegment.from_file(filename).export(f"{filename}.wav", format='wav', parameters=["-ar", "16000", "-ac", '1'])
+     filename = f"{filename}.wav"
+
+     signal, sr = torchaudio.load(filename)
+     if sr != 16_000:
+         # downsample to 16khz and mono
+         resampled = F.resample(signal, sr, 16_000, lowpass_filter_width=128).mean(dim=0).view(1, -1).cpu()
+     else:
+         resampled = signal.mean(dim=0).view(1, -1).cpu()
+
+     # get tmp dir:
+     filename = os.path.basename(filename).split(".")[0]
+
+     # yield chunks of 60 minutes (60 * 60 seconds at 16 kHz).
+     c_size = 60 * 60 * 16_000
+     for i, c in enumerate(range(0, resampled.shape[1], c_size)):
+         tempaudio = os.path.join(outdir, f"{filename}-{i}.wav")
+
+         # save to tmp dir:
+         torchaudio.save(tempaudio, resampled[:, c:c+c_size], 16_000)
+         yield (tempaudio, resampled[:, c:c+c_size])
+
+
+ def segment_file(VAD, id, prefix, filename, resampled, output_dir):
+
+     min_chunk_size = 4  # seconds
+     max_allowed_length = 12  # seconds
+     margin = 0.15
+
+     with torch.no_grad():
+         audio_info = VAD.get_speech_segments(filename, apply_energy_VAD=True, len_th=0.5,
+                                              deactivation_th=0.4, double_check=False, close_th=0.25)
+
+     # save segments:
+     s = -1
+     for _s, _e in audio_info:
+         _s, _e = _s.item(), _e.item()
+
+         _s = max(0, _s - margin)
+         e = min(resampled.size(1) / 16_000, _e + margin)
+
+         if s == -1:
+             s = _s
+
+         chunk_length = e - s
+         if chunk_length > min_chunk_size:
+
+             no_chunks = int(numpy.ceil(chunk_length / max_allowed_length))
+             starts = numpy.linspace(s, e, no_chunks + 1).tolist()
+
+             if chunk_length > max_allowed_length:
+                 print("WARNING: segment too long:", chunk_length)
+                 print(no_chunks, starts)
+
+             for x in range(no_chunks):
+
+                 start = starts[x]
+                 end = starts[x + 1]
+
+                 local_chunk_length = end - start
+
+                 print(f"Saving segment: {start:08.2f}-{end:08.2f}, with length: {local_chunk_length:05.2f} secs")
+                 fname = f"{id}-{prefix}-{start:08.2f}-{end:08.2f}.wav"
+
+                 # convert from seconds to samples:
+                 start = int(start * 16_000)
+                 end = int(end * 16_000)
+
+                 # save segment:
+                 torchaudio.save(os.path.join(output_dir, fname), resampled[:, start:end], 16_000)
+             s = -1
+
+
+ def format_time(secs: float):
+     m, s = divmod(secs, 60)
+     h, m = divmod(m, 60)
+     return "%d:%02d:%02d,%03d" % (h, m, s, int(secs * 1000 % 1000))
+
+ asr_model = EncoderASR.from_hparams(source="asafaya/hubert-large-arabic-transcribe")
+ vad_model = VAD.from_hparams(source="speechbrain/vad-crdnn-libriparty")
+
+ def main(filename, generate_srt=False):
+     try:
+         AudioSegment.from_file(filename)
+     except Exception:
+         return "Please upload a valid audio file"
+
+     outdir = os.path.join(tempdir, filename.split("/")[-1].split(".")[0])
+     if not os.path.exists(outdir):
+         os.mkdir(outdir)
+
+     print("Applying VAD to", filename)
+
+     # directory to save segments in
+     segments_dir = os.path.join(outdir, "segments")
+     if os.path.exists(segments_dir):
+         raise Exception(f"Segments directory already exists: {segments_dir}")
+     os.mkdir(segments_dir)
+     print("Saving segments to", segments_dir)
+
+     for c, (tempaudio, resampled) in enumerate(read_and_resample(filename, outdir)):
+         print(f"Segmenting file: {filename}, with length: {resampled.shape[1] / 16_000:05.2f} secs: {tempaudio}")
+         segment_file(vad_model, os.path.basename(tempaudio), c, tempaudio, resampled, segments_dir)
+         # os.remove(tempaudio)
+
+     transcriptions = OrderedDict()
+     files = glob(os.path.join(segments_dir, "*.wav"))
+     print("Start transcribing")
+     for f in tqdm(sorted(files)):
+         try:
+             transcriptions[os.path.basename(f).replace(".wav", "")] = asr_model.transcribe_file(f)
+             # os.remove(os.path.basename(f))
+         except Exception as e:
+             print(e)
+             print("Error transcribing file {}".format(f))
+             print("Skipping...")
+
+     # shutil.rmtree(outdir)
+
+     fo = ""
+     for i, key in enumerate(transcriptions):
+         line = key
+
+         # segment names look like: segment-0-00148.72-00156.97
+         start_sec = float(line.split("-")[-2])
+         end_sec = float(line.split("-")[-1])
+         if len(line) < 2: continue
+
+         if generate_srt:
+             fo += ("{}\n".format(i+1))
+             fo += ("{} --> ".format(format_time(start_sec)))
+             fo += ("{}\n".format(format_time(end_sec)))
+
+         fo += ("{}\n".format(transcriptions[key]))
+         fo += ("\n") if generate_srt else ""
+
+     return fo
+
+ outputs = gr.outputs.Textbox(label="Transcription")
+
+ title = "Arabic Speech Transcription"
+ description = "Simply upload your audio."
+
+ gr.Interface(main, [gr.inputs.Audio(label="Arabic Audio File", type="filepath"), "checkbox"], outputs, title=title, description=description, enable_queue=True).launch()
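
For reference (not part of this commit), a minimal sketch of how the ASR checkpoint loaded in app.py could be exercised on a single short clip without the VAD segmentation pipeline. It assumes the pinned speechbrain==0.5.13 API and that output.wav, the sample file committed below, is a 16 kHz mono recording in the working directory:

```python
# Minimal sketch (illustrative only): run the same ASR checkpoint that app.py
# loads, on one short clip, skipping the resampling and VAD segmentation steps.
from speechbrain.pretrained import EncoderASR

asr_model = EncoderASR.from_hparams(source="asafaya/hubert-large-arabic-transcribe")
print(asr_model.transcribe_file("output.wav"))

# For SRT output, app.py's format_time() renders seconds as "H:MM:SS,mmm",
# e.g. 90.5 seconds becomes "0:01:30,500".
```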
output.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0f7174c9be1bd14e7bda67f0578c1c0f75a2270017065ea0e79381c6f406e005
+ size 320078
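
The three lines above are a Git LFS pointer stub rather than audio data; the actual wav is stored in LFS per the output.wav rule in .gitattributes. Purely as an illustration, a small sketch of splitting such a pointer into its fields (the helper name is hypothetical, not part of this repository):

```python
# Minimal sketch (illustrative only): read a Git LFS pointer stub such as the
# output.wav entry above and split each "key value" line into a dict, e.g.
# {"version": "https://git-lfs.github.com/spec/v1", "oid": "sha256:...", "size": "320078"}.
def parse_lfs_pointer(path: str) -> dict:
    fields = {}
    with open(path, encoding="utf-8") as fh:
        for line in fh:
            key, _, value = line.strip().partition(" ")
            if key:
                fields[key] = value
    return fields
```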
requirements.txt ADDED
@@ -0,0 +1,4 @@
+ speechbrain==0.5.13
+ transformers==4.22.2
+ pydub==0.25.1
+ gradio==3.8.2