Paul Bird committed on
Commit
91fd4e7
1 Parent(s): 5d20485

Upload 5 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ AudioDecoder_Tiny.sentis filter=lfs diff=lfs merge=lfs -text
37
+ AudioEncoder_Tiny.sentis filter=lfs diff=lfs merge=lfs -text
38
+ LogMelSepctro.sentis filter=lfs diff=lfs merge=lfs -text
AudioDecoder_Tiny.sentis ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f6d24553eda46f335ead8ba30e3970fc8056086a538047248821aa31a135f938
3
+ size 198832845
AudioEncoder_Tiny.sentis ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d3fb532b04b438079db8de9551a0d813da22be5fd05cdeeff3d09794492ca5b1
3
+ size 32888514
LogMelSepctro.sentis ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e021007141fdf2d39113ea1aa12bc258226ea1c2976171544f3a05979e2b69ef
3
+ size 1360848
RunWhisper.cs ADDED
@@ -0,0 +1,213 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ using System.Collections;
2
+ using System.Collections.Generic;
3
+ using UnityEngine;
4
+ using Unity.Sentis;
5
+ using System.IO;
6
+ using Newtonsoft.Json;
7
+ using System.Text;
8
+
9
+ /*
10
+ * Whisper Inference Code
11
+ * ======================
12
+ *
13
+ * Put this script on the Main Camera
14
+ *
15
+ * In Assets/StreamingAssets put:
16
+ *
17
+ * AudioDecoder_Tiny.sentis
18
+ * AudioEncoder_Tiny.sentis
19
+ * LogMelSepctro.sentis
20
+ * vocab.json
21
+ *
22
+ * Drag a 16 kHz mono uncompressed AudioClip (at most 30 seconds long) into the audioClip field.
23
+ *
24
+ * Install package com.unity.nuget.newtonsoft-json from the Package Manager
25
+ * Install package com.unity.sentis
26
+ *
27
+ */
28
+
29
+
30
/// <summary>
/// Runs Whisper (tiny) speech-to-text inference on a single <see cref="AudioClip"/> using Unity Sentis.
/// The clip is converted to a log-mel spectrogram and encoded once in <see cref="Start"/>;
/// after that <see cref="Update"/> decodes one token per frame and logs the text produced so far.
/// </summary>
public class RunWhisper : MonoBehaviour
{
    IWorker decoderEngine, encoderEngine, spectroEngine;

    const BackendType backend = BackendType.GPUCompute;

    // Link your audioclip here. Format must be 16kHz mono non-compressed.
    public AudioClip audioClip;

    // Size of the token buffer handed to the decoder (prompt tokens included).
    const int maxTokens = 100;

    // Audio format the models are fed with: 16kHz samples, padded to 30 seconds.
    const int sampleRate = 16000;
    const int maxSamples = 30 * sampleRate;

    //Special tokens
    const int END_OF_TEXT = 50257;
    const int START_OF_TRANSCRIPT = 50258;
    const int ENGLISH = 50259;
    const int TRANSCRIBE = 50359;
    const int START_TIME = 50364;

    Ops ops;
    ITensorAllocator allocator;

    int numSamples;
    float[] data;

    // Token id -> token text, filled from vocab.json
    string[] tokens;

    // Index of the last token written into outputTokens
    int currentToken = 0;
    int[] outputTokens = new int[maxTokens];

    // Used for special character decoding
    int[] shiftDownDict = new int[256];

    // Output of the encoder worker (obtained with PeekOutput, so it is not disposed here)
    TensorFloat encodedAudio;

    bool transcribe = false;
    string outputString = "";

    /// <summary>
    /// Loads the vocabulary and the three models, seeds the decoder prompt,
    /// then loads and encodes the audio clip. Decoding is only enabled when
    /// both the load and the encode step succeeded.
    /// </summary>
    void Start()
    {
        allocator = new TensorCachingAllocator();
        ops = WorkerFactory.CreateOps(backend, allocator);

        SetupCharacterShifts();

        GetTokens();

        Model decoder = ModelLoader.Load(Application.streamingAssetsPath + "/AudioDecoder_Tiny.sentis");
        Model encoder = ModelLoader.Load(Application.streamingAssetsPath + "/AudioEncoder_Tiny.sentis");
        Model spectro = ModelLoader.Load(Application.streamingAssetsPath + "/LogMelSepctro.sentis");

        decoderEngine = WorkerFactory.CreateWorker(backend, decoder);
        encoderEngine = WorkerFactory.CreateWorker(backend, encoder);
        spectroEngine = WorkerFactory.CreateWorker(backend, spectro);

        // Prompt tokens the decoder starts from
        outputTokens[0] = START_OF_TRANSCRIPT;
        outputTokens[1] = ENGLISH;
        outputTokens[2] = TRANSCRIBE;
        outputTokens[3] = START_TIME;
        currentToken = 3;

        // Never start decoding without encoded audio: Update would otherwise
        // feed a null tensor to the decoder.
        transcribe = LoadAudio() && EncodeAudio();
    }

    /// <summary>
    /// Copies the samples of <see cref="audioClip"/> into <see cref="data"/>.
    /// </summary>
    /// <returns>False when no clip is assigned, true otherwise.</returns>
    bool LoadAudio()
    {
        if (audioClip == null)
        {
            Debug.LogError("No AudioClip assigned to RunWhisper.");
            return false;
        }

        if (audioClip.frequency != sampleRate)
        {
            Debug.Log($"The audio clip should have frequency 16kHz. It has frequency {audioClip.frequency / 1000f}kHz");
        }

        if (audioClip.channels != 1)
        {
            // Only 'samples' values are read below, which is one channel's worth.
            Debug.LogWarning($"The audio clip should be mono. It has {audioClip.channels} channels.");
        }

        numSamples = audioClip.samples;
        data = new float[numSamples];
        audioClip.GetData(data, 0);
        return true;
    }

    /// <summary>
    /// Reads vocab.json from StreamingAssets (a map of token text to token id)
    /// and inverts it into the <see cref="tokens"/> lookup array.
    /// </summary>
    void GetTokens()
    {
        var jsonText = File.ReadAllText(Application.streamingAssetsPath + "/vocab.json");
        var vocab = Newtonsoft.Json.JsonConvert.DeserializeObject<Dictionary<string, int>>(jsonText);
        tokens = new string[vocab.Count];
        foreach (var item in vocab)
        {
            tokens[item.Value] = item.Key;
        }
    }

    /// <summary>
    /// Pads the loaded samples to 30 seconds, runs the spectrogram model and the
    /// encoder model, and stores the encoder output in <see cref="encodedAudio"/>.
    /// </summary>
    /// <returns>False when the clip is longer than 30 seconds at 16kHz, true otherwise.</returns>
    bool EncodeAudio()
    {
        // Check the length before allocating anything
        if (numSamples > maxSamples)
        {
            Debug.Log("The AudioClip is too long.");
            return false;
        }

        // The input tensor is created by this method, so it is released here
        using (var input = new TensorFloat(new TensorShape(1, numSamples), data))
        {
            // Pad out to 30 seconds at 16khz if necessary
            var input30seconds = ops.Pad(input, new int[] { 0, 0, 0, maxSamples - numSamples });

            spectroEngine.Execute(input30seconds);
            var spectroOutput = spectroEngine.PeekOutput() as TensorFloat;

            encoderEngine.Execute(spectroOutput);
            encodedAudio = encoderEngine.PeekOutput() as TensorFloat;
        }

        return true;
    }

    /// <summary>
    /// Decodes one token per frame: runs the decoder on all tokens produced so far,
    /// takes the arg-max prediction at the current position, appends it to the
    /// token buffer and to the output text, and logs the text.
    /// Stops at the end-of-text token or when the token buffer is full.
    /// </summary>
    void Update()
    {
        if (!transcribe)
        {
            return;
        }

        if (currentToken >= outputTokens.Length - 1)
        {
            // Token buffer exhausted; nothing more can be decoded
            transcribe = false;
            return;
        }

        int ID;

        // A fresh token tensor is needed every frame; dispose it once the
        // prediction has been read back so it does not leak.
        using (var tokensSoFar = new TensorInt(new TensorShape(1, outputTokens.Length), outputTokens))
        {
            var inputs = new Dictionary<string, Tensor>
            {
                {"encoded_audio", encodedAudio },
                {"tokens", tokensSoFar }
            };

            decoderEngine.Execute(inputs);
            var tokensOut = decoderEngine.PeekOutput() as TensorFloat;

            var tokensPredictions = ops.ArgMax(tokensOut, 2, false);
            tokensPredictions.MakeReadable();

            ID = tokensPredictions[currentToken];
        }

        currentToken++;
        outputTokens[currentToken] = ID;

        if (ID == END_OF_TEXT)
        {
            transcribe = false;
        }
        else if (ID >= tokens.Length)
        {
            // Ids beyond the vocabulary are reported as timestamps relative to START_TIME
            outputString += $"(time={(ID - START_TIME) * 0.02f})";
        }
        else
        {
            outputString += GetUnicodeText(tokens[ID]);
        }

        Debug.Log(outputString);
    }

    /// <summary>
    /// Translates a vocabulary entry with encoded special characters to Unicode text:
    /// shifted characters are mapped back to byte values and the bytes are read as UTF-8.
    /// </summary>
    string GetUnicodeText(string text)
    {
        var bytes = Encoding.GetEncoding("ISO-8859-1").GetBytes(ShiftCharacterDown(text));
        return Encoding.UTF8.GetString(bytes);
    }

    /// <summary>
    /// Replaces every character at or above code point 256 with the character
    /// stored for it in <see cref="shiftDownDict"/>; other characters are kept.
    /// </summary>
    string ShiftCharacterDown(string text)
    {
        var shifted = new StringBuilder(text.Length);
        foreach (char letter in text)
        {
            int index = letter - 256;
            if (index >= 0 && index < shiftDownDict.Length)
            {
                // Code point 256 itself is a shifted character (table index 0)
                shifted.Append((char)shiftDownDict[index]);
            }
            else
            {
                shifted.Append(letter);
            }
        }
        return shifted.ToString();
    }

    /// <summary>
    /// Fills <see cref="shiftDownDict"/>: entry n holds the n-th character code
    /// in 0..255 that <see cref="IsWhiteSpace"/> classifies as non-printable.
    /// </summary>
    void SetupCharacterShifts()
    {
        for (int i = 0, n = 0; i < 256; i++)
        {
            if (IsWhiteSpace((char)i)) shiftDownDict[n++] = i;
        }
    }

    /// <summary>
    /// Returns true for characters outside the printable ranges '!'..'~',
    /// U+00A1..U+00AC and U+00AE..U+00FF. These are the ranges the GPT-2 style
    /// byte-level vocabulary keeps unshifted; the literals are written as escapes
    /// so they survive any source-file encoding.
    /// </summary>
    bool IsWhiteSpace(char c)
    {
        return !(('!' <= c && c <= '~') || ('\u00A1' <= c && c <= '\u00AC') || ('\u00AE' <= c && c <= '\u00FF'));
    }

    /// <summary>
    /// Releases the workers, the ops object and the tensor allocator.
    /// </summary>
    private void OnDestroy()
    {
        decoderEngine?.Dispose();
        encoderEngine?.Dispose();
        spectroEngine?.Dispose();
        ops?.Dispose();
        allocator?.Dispose();
    }
}
vocab.json ADDED
The diff for this file is too large to render. See raw diff