UnityPaul committed on
Commit
7e46d24
·
verified ·
1 Parent(s): e64b554

Upload RunWhisper.cs

Browse files
Files changed (1) hide show
  1. RunWhisper.cs +15 -19
RunWhisper.cs CHANGED
@@ -49,10 +49,6 @@ public class RunWhisper : MonoBehaviour
49
  const int TRANSLATE = 50358; //for speech-to-text then translate to English
50
  const int NO_TIME_STAMPS = 50363;
51
  const int START_TIME = 50364;
52
-
53
-
54
- Ops ops;
55
- ITensorAllocator allocator;
56
 
57
  int numSamples;
58
  float[] data;
@@ -74,18 +70,22 @@ public class RunWhisper : MonoBehaviour
74
 
75
  void Start()
76
  {
77
- allocator = new TensorCachingAllocator();
78
- ops = WorkerFactory.CreateOps(backend, allocator);
79
 
80
  SetupWhiteSpaceShifts();
81
 
82
  GetTokens();
83
 
84
  Model decoder = ModelLoader.Load(Application.streamingAssetsPath + "/AudioDecoder_Tiny.sentis");
 
 
 
 
 
 
85
  Model encoder = ModelLoader.Load(Application.streamingAssetsPath + "/AudioEncoder_Tiny.sentis");
86
  Model spectro = ModelLoader.Load(Application.streamingAssetsPath + "/LogMelSepctro.sentis");
87
 
88
- decoderEngine = WorkerFactory.CreateWorker(backend, decoder);
89
  encoderEngine = WorkerFactory.CreateWorker(backend, encoder);
90
  spectroEngine = WorkerFactory.CreateWorker(backend, spectro);
91
 
@@ -116,7 +116,9 @@ public class RunWhisper : MonoBehaviour
116
  return;
117
  }
118
 
119
- data = new float[numSamples];
 
 
120
  audioClip.GetData(data, 0);
121
  }
122
 
@@ -136,10 +138,7 @@ public class RunWhisper : MonoBehaviour
136
  {
137
  using var input = new TensorFloat(new TensorShape(1, numSamples), data);
138
 
139
- // Pad out to 30 seconds at 16khz if necessary
140
- using var input30seconds = ops.Pad(input, new int[] { 0, 0, 0, maxSamples - numSamples });
141
-
142
- spectroEngine.Execute(input30seconds);
143
  var spectroOutput = spectroEngine.PeekOutput() as TensorFloat;
144
 
145
  encoderEngine.Execute(spectroOutput);
@@ -156,15 +155,14 @@ public class RunWhisper : MonoBehaviour
156
 
157
  var inputs = new Dictionary<string, Tensor>
158
  {
159
- {"encoded_audio",encodedAudio },
160
- {"tokens" , tokensSoFar }
161
  };
162
 
163
  decoderEngine.Execute(inputs);
164
- var tokensOut = decoderEngine.PeekOutput() as TensorFloat;
165
 
166
- using var tokensPredictions = ops.ArgMax(tokensOut, 2, false);
167
- tokensPredictions.MakeReadable();
168
 
169
  int ID = tokensPredictions[currentToken];
170
 
@@ -225,7 +223,5 @@ public class RunWhisper : MonoBehaviour
225
  decoderEngine?.Dispose();
226
  encoderEngine?.Dispose();
227
  spectroEngine?.Dispose();
228
- ops?.Dispose();
229
- allocator?.Dispose();
230
  }
231
  }
 
49
  const int TRANSLATE = 50358; //for speech-to-text then translate to English
50
  const int NO_TIME_STAMPS = 50363;
51
  const int START_TIME = 50364;
 
 
 
 
52
 
53
  int numSamples;
54
  float[] data;
 
70
 
71
  void Start()
72
  {
 
 
73
 
74
  SetupWhiteSpaceShifts();
75
 
76
  GetTokens();
77
 
78
  Model decoder = ModelLoader.Load(Application.streamingAssetsPath + "/AudioDecoder_Tiny.sentis");
79
+
80
+ Model decoderWithArgMax = Functional.Compile(
81
+ (tokens, audio) => Functional.ArgMax(decoder.Forward(tokens, audio)[0], 2),
82
+ (decoder.inputs[0], decoder.inputs[1])
83
+ );
84
+
85
  Model encoder = ModelLoader.Load(Application.streamingAssetsPath + "/AudioEncoder_Tiny.sentis");
86
  Model spectro = ModelLoader.Load(Application.streamingAssetsPath + "/LogMelSepctro.sentis");
87
 
88
+ decoderEngine = WorkerFactory.CreateWorker(backend, decoderWithArgMax);
89
  encoderEngine = WorkerFactory.CreateWorker(backend, encoder);
90
  spectroEngine = WorkerFactory.CreateWorker(backend, spectro);
91
 
 
116
  return;
117
  }
118
 
119
+ data = new float[maxSamples];
120
+ numSamples = maxSamples;
121
+ // We will get a warning here if data.Length is larger than the audio clip length, but that is OK
122
  audioClip.GetData(data, 0);
123
  }
124
 
 
138
  {
139
  using var input = new TensorFloat(new TensorShape(1, numSamples), data);
140
 
141
+ spectroEngine.Execute(input);
 
 
 
142
  var spectroOutput = spectroEngine.PeekOutput() as TensorFloat;
143
 
144
  encoderEngine.Execute(spectroOutput);
 
155
 
156
  var inputs = new Dictionary<string, Tensor>
157
  {
158
+ {"input_0", tokensSoFar },
159
+ {"input_1", encodedAudio }
160
  };
161
 
162
  decoderEngine.Execute(inputs);
163
+ var tokensPredictions = decoderEngine.PeekOutput() as TensorInt;
164
 
165
+ tokensPredictions.CompleteOperationsAndDownload();
 
166
 
167
  int ID = tokensPredictions[currentToken];
168
 
 
223
  decoderEngine?.Dispose();
224
  encoderEngine?.Dispose();
225
  spectroEngine?.Dispose();
 
 
226
  }
227
  }