Varun Aditya Balaji committed on
Commit
a216922
•
1 Parent(s): 836ffb0

first commit

Browse files
2.wav ADDED
Binary file (740 kB). View file
 
3.wav ADDED
Binary file (436 kB). View file
 
Pipeline.ipynb CHANGED
@@ -2,7 +2,7 @@
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
- "execution_count": 41,
6
  "id": "edc2e2ff",
7
  "metadata": {},
8
  "outputs": [],
@@ -19,7 +19,7 @@
19
  },
20
  {
21
  "cell_type": "code",
22
- "execution_count": 19,
23
  "id": "76f25cc3",
24
  "metadata": {},
25
  "outputs": [
@@ -27,7 +27,7 @@
27
  "name": "stdout",
28
  "output_type": "stream",
29
  "text": [
30
- "11/08/2022 14:17:47 - INFO - huggingsound.speech_recognition.model - Loading model...\n"
31
  ]
32
  },
33
  {
@@ -41,8 +41,8 @@
41
  "name": "stdout",
42
  "output_type": "stream",
43
  "text": [
44
- "11/08/2022 14:17:49 - WARNING - root - bos_token <s> not in provided tokens. It will be added to the list of tokens\n",
45
- "11/08/2022 14:17:49 - WARNING - root - eos_token </s> not in provided tokens. It will be added to the list of tokens\n"
46
  ]
47
  }
48
  ],
@@ -55,7 +55,7 @@
55
  },
56
  {
57
  "cell_type": "code",
58
- "execution_count": 38,
59
  "id": "3b142546",
60
  "metadata": {},
61
  "outputs": [],
@@ -82,8 +82,8 @@
82
  },
83
  {
84
  "cell_type": "code",
85
- "execution_count": 39,
86
- "id": "48bed0f8",
87
  "metadata": {},
88
  "outputs": [
89
  {
@@ -98,27 +98,34 @@
98
  "output_type": "stream",
99
  "text": [
100
  "Detected language is English\n",
101
- "WITHOUT THE DATA SET THE ARTICLE IS USELESS\n"
102
  ]
103
  }
104
  ],
105
  "source": [
106
- "class Speech_to_Text(Pipeline):\n",
107
- " def postprocess(self,model_outputs):\n",
108
- " if prediction[3][0] == 'zh: Chinese':\n",
109
- " "
110
  ]
111
  },
112
  {
113
  "cell_type": "code",
114
- "execution_count": 51,
115
- "id": "b0fae1dd",
116
  "metadata": {},
117
  "outputs": [
 
 
 
 
 
 
 
118
  {
119
  "name": "stderr",
120
  "output_type": "stream",
121
  "text": [
 
122
  "It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.\n"
123
  ]
124
  },
@@ -126,36 +133,57 @@
126
  "name": "stdout",
127
  "output_type": "stream",
128
  "text": [
129
- "Detected language is English\n",
130
- "WITHOUT THE DATA SET THE ARTICLE IS USELESS\n"
131
  ]
132
- }
133
- ],
134
- "source": [
135
- "start = time.time()\n",
136
- "pipeline('english.wav')\n",
137
- "end = time.time()"
138
- ]
139
- },
140
- {
141
- "cell_type": "code",
142
- "execution_count": 49,
143
- "id": "1e0321b5",
144
- "metadata": {},
145
- "outputs": [
146
  {
147
- "data": {
148
- "text/plain": [
149
- "0.5424931049346924"
150
- ]
151
- },
152
- "execution_count": 49,
153
- "metadata": {},
154
- "output_type": "execute_result"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
155
  }
156
  ],
157
  "source": [
158
- "end - start"
 
 
 
 
 
 
 
 
 
 
 
 
 
159
  ]
160
  },
161
  {
 
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
+ "execution_count": 92,
6
  "id": "edc2e2ff",
7
  "metadata": {},
8
  "outputs": [],
 
19
  },
20
  {
21
  "cell_type": "code",
22
+ "execution_count": 93,
23
  "id": "76f25cc3",
24
  "metadata": {},
25
  "outputs": [
 
27
  "name": "stdout",
28
  "output_type": "stream",
29
  "text": [
30
+ "12/06/2022 13:42:19 - INFO - huggingsound.speech_recognition.model - Loading model...\n"
31
  ]
32
  },
33
  {
 
41
  "name": "stdout",
42
  "output_type": "stream",
43
  "text": [
44
+ "12/06/2022 13:42:23 - WARNING - root - bos_token <s> not in provided tokens. It will be added to the list of tokens\n",
45
+ "12/06/2022 13:42:23 - WARNING - root - eos_token </s> not in provided tokens. It will be added to the list of tokens\n"
46
  ]
47
  }
48
  ],
 
55
  },
56
  {
57
  "cell_type": "code",
58
+ "execution_count": 94,
59
  "id": "3b142546",
60
  "metadata": {},
61
  "outputs": [],
 
82
  },
83
  {
84
  "cell_type": "code",
85
+ "execution_count": 95,
86
+ "id": "b0fae1dd",
87
  "metadata": {},
88
  "outputs": [
89
  {
 
98
  "output_type": "stream",
99
  "text": [
100
  "Detected language is English\n",
101
+ "NISHE JUAN FANMA HE MOVED ABOUT INVISIBLE BUT EVERYONE COULD HEAR HIM\n"
102
  ]
103
  }
104
  ],
105
  "source": [
106
+ "start = time.time()\n",
107
+ "pipeline('combine.wav')\n",
108
+ "end = time.time()"
 
109
  ]
110
  },
111
  {
112
  "cell_type": "code",
113
+ "execution_count": 96,
114
+ "id": "1e0321b5",
115
  "metadata": {},
116
  "outputs": [
117
+ {
118
+ "name": "stdout",
119
+ "output_type": "stream",
120
+ "text": [
121
+ "Detected Language is Chinese\n"
122
+ ]
123
+ },
124
  {
125
  "name": "stderr",
126
  "output_type": "stream",
127
  "text": [
128
+ "100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 1.28it/s]\n",
129
  "It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.\n"
130
  ]
131
  },
 
133
  "name": "stdout",
134
  "output_type": "stream",
135
  "text": [
136
+ "你喜欢饭吗\n",
137
+ "Detected language is English\n"
138
  ]
139
+ },
 
 
 
 
 
 
 
 
 
 
 
 
 
140
  {
141
+ "name": "stderr",
142
+ "output_type": "stream",
143
+ "text": [
144
+ "It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.\n"
145
+ ]
146
+ },
147
+ {
148
+ "name": "stdout",
149
+ "output_type": "stream",
150
+ "text": [
151
+ "\n",
152
+ "Detected language is English\n"
153
+ ]
154
+ },
155
+ {
156
+ "name": "stderr",
157
+ "output_type": "stream",
158
+ "text": [
159
+ "It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.\n"
160
+ ]
161
+ },
162
+ {
163
+ "name": "stdout",
164
+ "output_type": "stream",
165
+ "text": [
166
+ "HE MOVED ABOUT\n",
167
+ "Detected language is English\n",
168
+ "INVISIBLE BUT EVERYONE COULD HEAR HIM\n"
169
+ ]
170
  }
171
  ],
172
  "source": [
173
+ "from pydub import AudioSegment\n",
174
+ "from pydub.silence import split_on_silence\n",
175
+ "\n",
176
+ "sound_file = AudioSegment.from_wav(\"combine.wav\")\n",
177
+ "audio_chunks = split_on_silence(sound_file, \n",
178
+ " min_silence_len=100,\n",
179
+ " silence_thresh=-50\n",
180
+ ")\n",
181
+ "\n",
182
+ "for i, chunk in enumerate(audio_chunks):\n",
183
+ "\n",
184
+ " out_file = \"./chunk{0}.wav\".format(i)\n",
185
+ " chunk.export(out_file, format=\"wav\")\n",
186
+ " pipeline(out_file)"
187
  ]
188
  },
189
  {
chunk0.wav ADDED
Binary file (196 kB). View file
 
chunk1.wav ADDED
Binary file (21.3 kB). View file
 
chunk2.wav ADDED
Binary file (96.6 kB). View file
 
chunk3.wav ADDED
Binary file (188 kB). View file
 
chunk4.wav ADDED
Binary file (79.3 kB). View file
 
chunk5.wav ADDED
Binary file (18.5 kB). View file
 
chunk6.wav ADDED
Binary file (189 kB). View file
 
combine.wav ADDED
Binary file (603 kB). View file
 
tmp/classifier.ckpt ADDED
@@ -0,0 +1 @@
 
 
1
+ /Users/varun/.cache/huggingface/hub/models--speechbrain--lang-id-voxlingua107-ecapa/snapshots/d771b530cec097adc0088b4dbd173e242f895464/classifier.ckpt
tmp/embedding_model.ckpt ADDED
@@ -0,0 +1 @@
 
 
1
+ /Users/varun/.cache/huggingface/hub/models--speechbrain--lang-id-voxlingua107-ecapa/snapshots/d771b530cec097adc0088b4dbd173e242f895464/embedding_model.ckpt
tmp/hyperparams.yaml ADDED
@@ -0,0 +1 @@
 
 
1
+ /Users/varun/.cache/huggingface/hub/models--speechbrain--lang-id-voxlingua107-ecapa/snapshots/d771b530cec097adc0088b4dbd173e242f895464/hyperparams.yaml
tmp/label_encoder.ckpt ADDED
@@ -0,0 +1 @@
 
 
1
+ /Users/varun/.cache/huggingface/hub/models--speechbrain--lang-id-voxlingua107-ecapa/snapshots/d771b530cec097adc0088b4dbd173e242f895464/label_encoder.txt