// Author: Sky-Kim — initial commit (6ac63e1)
using UnityEngine;
using System;
using System.Runtime.InteropServices;
using System.Threading.Tasks;
using Unity.InferenceEngine;
using UnityEngine.UI;
/// <summary>
/// Drives the microphone → VAD → ASR pipeline: captures audio (Unity Microphone
/// API, or a custom .jslib plugin on WebGL), runs TenVAD over fixed 256-sample
/// hops, and feeds detected speech segments into an <c>IASRRunner</c> for
/// transcription. Exposes a small state machine
/// (Initializing / Ready / Listening / Speaking / STTProcessing / Error)
/// through <see cref="OnStateChanged"/>, and final transcripts through
/// <see cref="OnSpeechTextReceived"/>.
/// </summary>
public class ASRManager : MonoBehaviour
{
// Gates Listen()/InitializeMicrophone(); never set false in this file's visible code.
private bool _platformMicrophoneSupported = true;
/// <summary>Lifecycle states of the capture/recognition pipeline.</summary>
public enum State
{
Initializing, Ready, Listening, Speaking, STTProcessing, Error
}
private State _currentState = State.Initializing;
/// <summary>Current pipeline state. Transitions raise <see cref="OnStateChanged"/>.</summary>
public State currentState => _currentState;
/// <summary>Backend choices surfaced to callers; mapped to <c>Unity.InferenceEngine.BackendType</c> by <see cref="ToRunnerBackend"/>.</summary>
public enum InferenceBackend
{
GPUCompute = 0,
CPU = 1
}
// Optional debug UI: slider + fill image visualize live VAD probability; text shows FPS.
[SerializeField] private Slider m_VadProbabilitySlider;
[SerializeField] private Image m_VadProbabilityFill;
[SerializeField] private Text m_TextFps;
// Must be a MonoBehaviour implementing IASRRunner; auto-resolved when left empty (see ResolveASRRunnerComponent).
[SerializeField] private MonoBehaviour m_AsrRunnerComponent;
[SerializeField, Range(0f, 1f)] private float m_VadThreshold = 0.5f;
// Dual-purpose: size of the pre-speech history buffer (in hops) AND the number of
// consecutive silent hops that terminates a speech segment (see ProcessAudioChunks).
[SerializeField, Min(1)] private int m_PrePostBufferFrames = 20;
// Hard cap on a single speech segment's length, in seconds.
[SerializeField, Min(0.1f)] private float m_MaxAudioStreamSeconds = 10f;
private IASRRunner _activeRunner;
private float _currentVadProbability;
private TenVADRunner _vad;
private string _selectedMicrophone;
private AudioClip _microphoneClip;
// Last sample position read from _microphoneClip (non-WebGL path only).
private int _lastPosition = 0;
// Back-to-back hops the VAD flagged as silence while Speaking.
private int _consecutiveSilenceFrames = 0;
// Elapsed seconds within the current speech segment.
private float _currentRecordingTime = 0f;
// True between Listen() and StopListening(); keeps mic + VAD running across segments.
private bool _isListeningSession;
private bool _webglMicrophoneInitialized;
#if UNITY_WEBGL && !UNITY_EDITOR
private bool _webglMicPluginAvailable = true;
private bool _webglMicPluginWarningLogged;
#endif
// VAD frame size in samples: 256 samples at 16 kHz = 16 ms per hop.
private const int HOP_SIZE = 256;
private const int TARGET_SAMPLE_RATE = 16000;
private const float FPS_UPDATE_INTERVAL = 0.25f;
private const float VAD_TEXT_UPDATE_INTERVAL = 0.05f;
// Caps VAD/ASR work per Update() so a backlog cannot blow the frame budget.
private const int MAX_CHUNKS_PER_FRAME = 24;
#if UNITY_WEBGL && !UNITY_EDITOR
// WebGL has no Unity Microphone API; these bind to a custom .jslib microphone plugin.
[DllImport("__Internal")]
private static extern int WebGLMic_Start(int sampleRate);
[DllImport("__Internal")]
private static extern void WebGLMic_Stop();
[DllImport("__Internal")]
private static extern int WebGLMic_GetSamples(float[] buffer, int maxSamples);
[DllImport("__Internal")]
private static extern int WebGLMic_IsRecording();
#endif
// Raw mic samples awaiting VAD processing (2 s capacity at 16 kHz).
private CircularBuffer _microphoneCircularBuffer;
// Short rolling history kept while Listening so speech onsets are not clipped.
private CircularBuffer _preSpeechCircularBuffer;
// Reusable scratch arrays to avoid per-frame allocations.
private float[] _reusableReadBuffer;
private float[] _reusableProcessChunk;
private short[] _reusableShortChunk;
private float _fpsElapsed;
private int _fpsFrameCount;
private float _nextVadTextUpdateTime;
// Event for state changes
public event Action<State> OnStateChanged;
/// <summary>Raised with the sanitized final transcript of each speech segment.</summary>
public event Action<string> OnSpeechTextReceived;
// Unity entry point: builds buffers, initializes the ASR runner and VAD,
// selects a microphone, then transitions to Ready (or Error on any failure).
// NOTE(review): async void is unavoidable for Unity's Start(), but exceptions
// thrown after the first await are only observable via the catch below.
private async void Start()
{
SetState(State.Initializing);
try
{
InitializeBuffers();
await InitializeASRRunner();
// Hop size is passed as UIntPtr — presumably a native size_t in the TenVAD binding; confirm against TenVADRunner.
_vad = new TenVADRunner((UIntPtr)HOP_SIZE, m_VadThreshold);
UpdateVadProbabilityText();
InitializeMicrophone();
SetState(State.Ready);
}
catch (Exception e)
{
Debug.LogError($"[ASRManager] Initialization failed: {e.Message}\n{e.StackTrace}");
SetState(State.Error);
}
}
// Unity editor callback (component added / Reset context menu): auto-wire the runner reference.
private void Reset()
{
if (m_AsrRunnerComponent == null)
m_AsrRunnerComponent = ResolveASRRunnerComponent();
}
#if UNITY_EDITOR
// Editor-only: re-clamp inspector values and auto-wire the runner reference on edit.
private void OnValidate()
{
m_VadThreshold = Mathf.Clamp01(m_VadThreshold);
m_PrePostBufferFrames = Mathf.Max(1, m_PrePostBufferFrames);
m_MaxAudioStreamSeconds = Mathf.Max(0.1f, m_MaxAudioStreamSeconds);
if (m_AsrRunnerComponent == null)
m_AsrRunnerComponent = ResolveASRRunnerComponent();
}
#endif
// Per-frame pump: only drains mic data while actively capturing (Listening/Speaking).
private void Update()
{
UpdateFpsText();
if (_currentState == State.Listening || _currentState == State.Speaking)
{
ReadMicrophoneData();
ProcessAudioChunks();
CheckMicrophoneStatus();
}
}
// Tear-down: unsubscribe from the runner, stop any live recording, dispose natives.
private void OnDestroy()
{
if (_activeRunner != null)
{
_activeRunner.OnFinalResult -= OnFinalResultReceived;
}
if (!string.IsNullOrEmpty(_selectedMicrophone) &&
IsMicrophoneRecording(_selectedMicrophone))
{
EndMicrophone(_selectedMicrophone);
}
_vad?.Dispose();
_activeRunner?.Dispose();
}
/// <summary>
/// Switches the inference backend and reinitializes the runner asynchronously.
/// Any in-progress listening session is stopped WITHOUT transcribing its current segment.
/// </summary>
/// <returns>False if no runner exists, a reinit is already in flight, or STT is mid-processing; true once the switch is queued.</returns>
public bool TrySetInferenceBackend(InferenceBackend backend)
{
if (_activeRunner == null || _isReinitializing)
return false;
if (_currentState == State.STTProcessing)
return false;
if (_currentState == State.Listening || _currentState == State.Speaking)
StopListening(processCurrentSegment: false);
_activeRunner.SetPreferredBackend(ToRunnerBackend(backend));
// Fire-and-forget: completion is surfaced via SetState inside ReinitializeRunnerAsync.
_ = ReinitializeRunnerAsync();
return true;
}
private bool _isReinitializing;
// Rebuilds the runner on the newly selected backend; guards against re-entry
// via _isReinitializing and always clears the flag in finally.
private async Task ReinitializeRunnerAsync()
{
if (_isReinitializing)
return;
_isReinitializing = true;
SetState(State.Initializing);
try
{
await _activeRunner.ReinitializeAsync();
SetState(State.Ready);
}
catch (Exception e)
{
Debug.LogError($"[ASRManager] Reinitialization failed: {e.Message}\n{e.StackTrace}");
SetState(State.Error);
}
finally
{
_isReinitializing = false;
}
}
// Maps the public enum onto the inference engine's backend type (GPUCompute is the default).
private static BackendType ToRunnerBackend(InferenceBackend backend)
{
return backend == InferenceBackend.CPU ? BackendType.CPU : BackendType.GPUCompute;
}
/// <summary>
/// Begins a listening session: starts the microphone and enters Listening.
/// </summary>
/// <returns>True if already capturing or capture was started; false when the
/// state forbids it (busy/error/initializing) or the microphone failed to start.</returns>
public bool Listen()
{
switch (_currentState)
{
case State.Listening:
case State.Speaking:
return true;
case State.STTProcessing:
case State.Error:
case State.Initializing:
return false;
}
if (!_platformMicrophoneSupported)
{
return false;
}
_isListeningSession = true;
StartMicrophone();
if (_microphoneClip == null && !IsMicrophoneRecording(_selectedMicrophone))
return false;
SetState(State.Listening);
return true;
}
/// <summary>
/// Ends the current listening session and stops the microphone.
/// </summary>
/// <param name="processCurrentSegment">When true and speech is in progress, the
/// current segment is submitted for transcription; otherwise it is discarded.</param>
/// <returns>True if a session was actually stopped; false when not capturing.</returns>
public bool StopListening(bool processCurrentSegment = false)
{
switch (_currentState)
{
case State.Ready:
case State.STTProcessing:
case State.Error:
case State.Initializing:
return false;
}
_isListeningSession = false;
if (_currentState == State.Listening)
{
StopMicrophone();
_consecutiveSilenceFrames = 0;
_currentRecordingTime = 0f;
_preSpeechCircularBuffer.Clear();
// NOTE(review): in Listening no StartSpeechSegment() has happened yet —
// the runner must tolerate CancelSpeechSegment() without an open segment; verify.
_activeRunner.CancelSpeechSegment();
SetState(State.Ready);
return true;
}
if (_currentState == State.Speaking)
{
if (processCurrentSegment)
{
_activeRunner.EndSpeechSegment();
}
else
{
_activeRunner.CancelSpeechSegment();
}
_preSpeechCircularBuffer.Clear();
StopMicrophone();
SetState(State.Ready);
_consecutiveSilenceFrames = 0;
_currentRecordingTime = 0f;
return true;
}
return false;
}
// Allocates ring buffers and scratch arrays once, sized from inspector settings.
private void InitializeBuffers()
{
var bufferFrames = Mathf.Max(1, m_PrePostBufferFrames);
// 2 seconds of headroom at the target rate for raw mic samples.
_microphoneCircularBuffer = new CircularBuffer(TARGET_SAMPLE_RATE * 2);
_preSpeechCircularBuffer = new CircularBuffer(HOP_SIZE * bufferFrames);
_reusableReadBuffer = new float[HOP_SIZE * 4];
_reusableProcessChunk = new float[HOP_SIZE];
_reusableShortChunk = new short[HOP_SIZE];
}
// Resolves, validates, subscribes to, and initializes the IASRRunner component.
// Throws (and Start() maps it to State.Error) when no runner can be found.
private async Task InitializeASRRunner()
{
if (m_AsrRunnerComponent == null)
m_AsrRunnerComponent = ResolveASRRunnerComponent();
if (m_AsrRunnerComponent == null)
{
// NOTE(review): ArgumentNullException's single-string ctor takes paramName, not a
// message — this prose string will surface as the parameter name in the exception text.
throw new ArgumentNullException("ASR Runner Component is not assigned in the Inspector.");
}
_activeRunner = m_AsrRunnerComponent as IASRRunner;
if (_activeRunner == null)
{
throw new InvalidCastException($"The component '{m_AsrRunnerComponent.GetType().Name}' must implement IASRRunner.");
}
_activeRunner.OnFinalResult += OnFinalResultReceived;
await _activeRunner.Initialize();
}
// Finds any MonoBehaviour implementing IASRRunner: first on this GameObject,
// then anywhere in loaded scenes (including inactive objects). Returns null if none.
private MonoBehaviour ResolveASRRunnerComponent()
{
// Prefer a runner on the same GameObject to avoid cross-scene mismatches.
var localBehaviours = GetComponents<MonoBehaviour>();
foreach (var behaviour in localBehaviours)
{
if (behaviour is IASRRunner)
return behaviour;
}
#if UNITY_2023_1_OR_NEWER
var allBehaviours = FindObjectsByType<MonoBehaviour>(FindObjectsInactive.Include, FindObjectsSortMode.None);
#else
var allBehaviours = FindObjectsOfType<MonoBehaviour>(true);
#endif
foreach (var behaviour in allBehaviours)
{
if (behaviour is IASRRunner)
return behaviour;
}
return null;
}
// Picks the capture device. WebGL uses a fixed pseudo-device name; elsewhere
// the first enumerated Microphone device is selected. Throws when unavailable.
private void InitializeMicrophone()
{
if (!_platformMicrophoneSupported)
throw new NotSupportedException("Microphone API is not available on this platform.");
#if UNITY_WEBGL && !UNITY_EDITOR
if (_webglMicrophoneInitialized)
return;
_selectedMicrophone = "WebGL Microphone";
_webglMicrophoneInitialized = true;
return;
#else
string[] devices = Microphone.devices;
if (devices.Length == 0)
throw new InvalidOperationException("No microphone found.");
_selectedMicrophone = devices[0];
#endif
}
// Starts (or restarts) capture on the selected device. Idempotent: returns
// immediately if already recording. Lazily (re)selects a device if needed.
private void StartMicrophone()
{
if (!_platformMicrophoneSupported)
return;
if (IsMicrophoneRecording(_selectedMicrophone))
return;
if (string.IsNullOrEmpty(_selectedMicrophone))
InitializeMicrophone();
if (string.IsNullOrEmpty(_selectedMicrophone))
{
Debug.LogError("[ASRManager] No microphone is selected.");
return;
}
#if UNITY_WEBGL && !UNITY_EDITOR
int startResult;
try
{
startResult = WebGLMic_Start(TARGET_SAMPLE_RATE);
}
catch
{
// Plugin function missing from the build; reported below via startResult.
startResult = 0;
}
if (startResult != 1)
{
Debug.LogError($"[ASRManager] Failed to start WebGL microphone. Result={startResult}");
return;
}
// Placeholder clip: WebGL samples arrive via WebGLMic_GetSamples, not this clip,
// but other code paths use a non-null _microphoneClip as "capture active".
if (_microphoneClip == null)
_microphoneClip = AudioClip.Create("webgl-microphone", TARGET_SAMPLE_RATE, 1, TARGET_SAMPLE_RATE, false);
#else
// Looping clip sized one second beyond the segment cap so wrap-around reads stay ahead of the writer.
_microphoneClip = Microphone.Start(_selectedMicrophone, true, Mathf.Max(1, Mathf.CeilToInt(m_MaxAudioStreamSeconds) + 1), TARGET_SAMPLE_RATE);
_lastPosition = 0;
#endif
}
// Stops capture and drops the clip reference. Safe to call when already stopped.
private void StopMicrophone()
{
if (_microphoneClip == null && !IsMicrophoneRecording(_selectedMicrophone))
return;
if (IsMicrophoneRecording(_selectedMicrophone))
EndMicrophone(_selectedMicrophone);
_microphoneClip = null;
}
// Drains newly captured samples into _microphoneCircularBuffer.
// WebGL: pulls directly from the plugin. Otherwise: tracks Microphone.GetPosition
// against _lastPosition on the looping clip, handling writer wrap-around.
private void ReadMicrophoneData()
{
if (_microphoneClip == null || !IsMicrophoneRecording(_selectedMicrophone))
return;
#if UNITY_WEBGL && !UNITY_EDITOR
int sampleCount = 0;
try
{
sampleCount = WebGLMic_GetSamples(_reusableReadBuffer, _reusableReadBuffer.Length);
}
catch (Exception ex)
{
// Plugin call failed (e.g. function absent): disable the path and warn exactly once.
if (_webglMicPluginAvailable)
{
_webglMicPluginAvailable = false;
if (!_webglMicPluginWarningLogged)
{
Debug.LogWarning($"[ASRManager] WebGL microphone plugin is not available: {ex.Message}");
_webglMicPluginWarningLogged = true;
}
}
return;
}
if (sampleCount <= 0)
return;
_microphoneCircularBuffer.Write(_reusableReadBuffer, sampleCount);
#else
int currentPosition = Microphone.GetPosition(_selectedMicrophone);
if (currentPosition == _lastPosition) return;
// Wrap-aware distance from last read to the writer's position on the looping clip.
int sampleCount = (currentPosition > _lastPosition)
? (currentPosition - _lastPosition)
: (_microphoneClip.samples - _lastPosition + currentPosition);
if (sampleCount > 0)
{
int remaining = sampleCount;
int readPosition = _lastPosition;
while (remaining > 0)
{
int readLength = Mathf.Min(remaining, _reusableReadBuffer.Length);
// NOTE(review): GetData always fills _reusableReadBuffer.Length samples from
// readPosition, even when readLength is smaller, and readPosition + buffer
// length can run past the clip end near the wrap point — confirm Unity's
// GetData wrap semantics for looping microphone clips.
_microphoneClip.GetData(_reusableReadBuffer, readPosition);
_microphoneCircularBuffer.Write(_reusableReadBuffer, readLength);
remaining -= readLength;
readPosition = (readPosition + readLength) % _microphoneClip.samples;
}
}
_lastPosition = currentPosition;
#endif
}
// Consumes buffered audio in HOP_SIZE chunks (bounded per frame), runs VAD on
// each, and advances the Listening/Speaking state machine accordingly.
private void ProcessAudioChunks()
{
var availableChunks = _microphoneCircularBuffer.Count / HOP_SIZE;
if (availableChunks <= 0)
return;
var frameChunkBudget = Mathf.Clamp(availableChunks, 1, Mathf.Max(1, MAX_CHUNKS_PER_FRAME));
int chunksProcessed = 0;
while (_microphoneCircularBuffer.Count >= HOP_SIZE && chunksProcessed < frameChunkBudget)
{
_microphoneCircularBuffer.Read(_reusableProcessChunk, HOP_SIZE);
// Convert float samples to 16-bit PCM for the VAD.
// NOTE(review): no clamping — samples outside [-1, 1] overflow the short cast.
for (int i = 0; i < HOP_SIZE; i++)
{
_reusableShortChunk[i] = (short)(_reusableProcessChunk[i] * 32767.0f);
}
_vad.Process(_reusableShortChunk, out float probability, out int flag);
UpdateVadDebug(probability);
bool voiceDetected = flag == 1;
switch (_currentState)
{
case State.Listening:
// Keep a rolling pre-speech history so the segment start isn't clipped.
_preSpeechCircularBuffer.Write(_reusableProcessChunk, HOP_SIZE);
if (voiceDetected)
{
StartSpeech();
}
break;
case State.Speaking:
ProcessSpeechChunk(_reusableProcessChunk);
_currentRecordingTime += (float)HOP_SIZE / TARGET_SAMPLE_RATE;
if (voiceDetected)
{
_consecutiveSilenceFrames = 0;
}
else
{
_consecutiveSilenceFrames++;
// End the segment after m_PrePostBufferFrames consecutive silent hops.
if (_consecutiveSilenceFrames >= Mathf.Max(1, m_PrePostBufferFrames))
{
EndSpeech();
}
}
// Also end when the segment reaches the configured hard length cap.
if (_currentRecordingTime >= Mathf.Max(0.1f, m_MaxAudioStreamSeconds))
{
EndSpeech();
}
break;
}
chunksProcessed++;
}
}
// Transition Listening → Speaking: open a runner segment and replay the
// buffered pre-speech history into it so the utterance onset is included.
private void StartSpeech()
{
SetState(State.Speaking);
_currentRecordingTime = 0f;
_consecutiveSilenceFrames = 0;
_activeRunner.StartSpeechSegment();
// Process pre-speech buffer data
int preSpeechDataLength = _preSpeechCircularBuffer.Count;
while (preSpeechDataLength > 0)
{
int chunkLength = Mathf.Min(HOP_SIZE, preSpeechDataLength);
_preSpeechCircularBuffer.Read(_reusableProcessChunk, chunkLength);
ProcessSpeechChunk(_reusableProcessChunk, chunkLength);
preSpeechDataLength -= chunkLength;
}
}
// Forwards a chunk to the runner; length <= 0 (or out of range) means "whole array".
private void ProcessSpeechChunk(float[] audioChunk, int length = -1)
{
int sampleCount = (length <= 0 || length > audioChunk.Length) ? audioChunk.Length : length;
_activeRunner.ProcessAudioChunk(audioChunk, sampleCount);
}
// Transition Speaking → Listening (session mode) or STTProcessing (one-shot):
// closes the runner segment and resets per-segment counters.
private void EndSpeech()
{
if (_currentState != State.Speaking)
return;
_preSpeechCircularBuffer.Clear();
_activeRunner.EndSpeechSegment();
_consecutiveSilenceFrames = 0;
_currentRecordingTime = 0f;
if (_isListeningSession)
{
// Keep microphone + VAD running continuously while inference consumes queued segments.
SetState(State.Listening);
return;
}
StopMicrophone();
SetState(State.STTProcessing);
}
// Central state setter: no-op on same state; notifies OnStateChanged subscribers.
private void SetState(State newState)
{
if (_currentState == newState) return;
_currentState = newState;
OnStateChanged?.Invoke(newState);
}
// Watchdog while capturing: restarts the microphone if the device stopped
// recording (e.g. device hiccup) so the session keeps flowing.
private void CheckMicrophoneStatus()
{
if (!string.IsNullOrEmpty(_selectedMicrophone) && !IsMicrophoneRecording(_selectedMicrophone))
{
StartMicrophone();
}
}
// Platform-abstracted "is the mic recording" check; WebGL plugin failures read as false.
private static bool IsMicrophoneRecording(string deviceName)
{
#if UNITY_WEBGL && !UNITY_EDITOR
try
{
return WebGLMic_IsRecording() == 1;
}
catch
{
return false;
}
#else
return Microphone.IsRecording(deviceName);
#endif
}
// Platform-abstracted capture stop.
private static void EndMicrophone(string deviceName)
{
#if UNITY_WEBGL && !UNITY_EDITOR
try
{
WebGLMic_Stop();
}
catch
{
// Ignore if plugin call fails.
}
#else
Microphone.End(deviceName);
#endif
}
// Records the latest VAD probability (clamped to [0,1]) and refreshes the debug UI.
private void UpdateVadDebug(float probability)
{
_currentVadProbability = Mathf.Clamp01(probability);
UpdateVadProbabilityText();
}
// Throttled UI refresh of the VAD slider; fill turns green at/above the threshold.
private void UpdateVadProbabilityText()
{
if (m_VadProbabilitySlider == null)
return;
var interval = Mathf.Max(0.01f, VAD_TEXT_UPDATE_INTERVAL);
var now = Time.unscaledTime;
if (now < _nextVadTextUpdateTime)
return;
_nextVadTextUpdateTime = now + interval;
// Re-asserted every refresh so inspector edits cannot break the 0..1 display range.
m_VadProbabilitySlider.minValue = 0f;
m_VadProbabilitySlider.maxValue = 1f;
m_VadProbabilitySlider.wholeNumbers = false;
m_VadProbabilitySlider.value = _currentVadProbability;
var fill = ResolveVadProbabilityFillImage();
if (fill == null)
return;
fill.color = _currentVadProbability >= m_VadThreshold
? new Color32(0, 255, 0, 255)
: new Color32(255, 255, 255, 255);
}
// Lazily resolves (and caches) the slider's fill Image when not explicitly assigned.
private Image ResolveVadProbabilityFillImage()
{
if (m_VadProbabilityFill != null)
return m_VadProbabilityFill;
if (m_VadProbabilitySlider == null || m_VadProbabilitySlider.fillRect == null)
return null;
m_VadProbabilityFill = m_VadProbabilitySlider.fillRect.GetComponent<Image>();
return m_VadProbabilityFill;
}
// Averages frame count over FPS_UPDATE_INTERVAL (unscaled time) and shows it.
private void UpdateFpsText()
{
if (m_TextFps == null)
return;
_fpsElapsed += Time.unscaledDeltaTime;
_fpsFrameCount++;
var interval = Mathf.Max(0.05f, FPS_UPDATE_INTERVAL);
if (_fpsElapsed < interval)
return;
var fps = Mathf.RoundToInt(_fpsFrameCount / _fpsElapsed);
m_TextFps.text = $"FPS: {fps}";
_fpsElapsed = 0f;
_fpsFrameCount = 0;
}
// Runner callback: restores session state (restart mic, back to Listening) or
// Ready for one-shot mode, then publishes the sanitized transcript if non-blank.
private void OnFinalResultReceived(string final)
{
if (_isListeningSession)
{
if (!IsMicrophoneRecording(_selectedMicrophone))
StartMicrophone();
if (_currentState != State.Speaking)
SetState(State.Listening);
}
else
{
SetState(State.Ready);
}
var cleaned = SanitizeTranscriptText(final);
if (string.IsNullOrWhiteSpace(cleaned))
return;
OnSpeechTextReceived?.Invoke(cleaned);
}
// Strips U+FFFD replacement characters (decoder artifacts) from a transcript.
// Fast path: return the original string when none are present.
private static string SanitizeTranscriptText(string text)
{
if (string.IsNullOrEmpty(text))
return string.Empty;
if (text.IndexOf('\uFFFD') < 0)
return text;
return text.Replace("\uFFFD", string.Empty);
}
/// <summary>
/// Fixed-capacity float ring buffer. Writes overwrite the oldest samples when
/// full; reads throw when asked for more data than is buffered. Not thread-safe.
/// </summary>
private class CircularBuffer
{
private readonly float[] _buffer;
private int _head;   // next read index
private int _tail;   // next write index
private readonly int _capacity;
/// <summary>Number of samples currently buffered.</summary>
public int Count { get; private set; }
public CircularBuffer(int capacity)
{
_capacity = capacity;
_buffer = new float[capacity];
Clear();
}
// Appends the first `length` samples of `data`, evicting oldest samples on overflow.
public void Write(float[] data, int length)
{
for (int i = 0; i < length; i++)
{
_buffer[_tail] = data[i];
_tail = (_tail + 1) % _capacity;
if (Count == _capacity)
{
// Buffer is full: drop the oldest sample so the latest stream stays contiguous.
_head = (_head + 1) % _capacity;
}
else
{
Count++;
}
}
}
// Copies `length` samples into `destination` and consumes them.
// Throws InvalidOperationException when fewer than `length` samples are buffered.
public void Read(float[] destination, int length)
{
if (length > Count) throw new InvalidOperationException("Not enough data to read.");
for (int i = 0; i < length; i++)
{
destination[i] = _buffer[_head];
_head = (_head + 1) % _capacity;
}
Count -= length;
}
// Resets the buffer to empty (samples are abandoned, not zeroed).
public void Clear()
{
_head = 0;
_tail = 0;
Count = 0;
}
}
}