// Author: Sky-Kim — initial commit (6ac63e1)
using UnityEngine;
using System;
using System.Runtime.InteropServices;
using System.Threading.Tasks;
using Unity.InferenceEngine;
using UnityEngine.UI;
/// <summary>
/// Drives the microphone → VAD → ASR pipeline: captures audio (Unity Microphone
/// API, or a custom .jslib plugin on WebGL), runs TenVAD over fixed 256-sample
/// hops, and feeds detected speech segments into an <c>IASRRunner</c> for
/// transcription. Exposes a small state machine
/// (Initializing / Ready / Listening / Speaking / STTProcessing / Error)
/// through <see cref="OnStateChanged"/>, and final transcripts through
/// <see cref="OnSpeechTextReceived"/>.
/// </summary>
public class ASRManager : MonoBehaviour
{
// Gates Listen()/InitializeMicrophone(); never set false in this file's visible code.
private bool _platformMicrophoneSupported = true;
/// <summary>Lifecycle states of the capture/recognition pipeline.</summary>
public enum State
{
Initializing, Ready, Listening, Speaking, STTProcessing, Error
}
private State _currentState = State.Initializing;
/// <summary>Current pipeline state. Transitions raise <see cref="OnStateChanged"/>.</summary>
public State currentState => _currentState;
/// <summary>Backend choices surfaced to callers; mapped to <c>Unity.InferenceEngine.BackendType</c> by <see cref="ToRunnerBackend"/>.</summary>
public enum InferenceBackend
{
GPUCompute = 0,
CPU = 1
}
// Optional debug UI: slider + fill image visualize live VAD probability; text shows FPS.
[SerializeField] private Slider m_VadProbabilitySlider;
[SerializeField] private Image m_VadProbabilityFill;
[SerializeField] private Text m_TextFps;
// Must be a MonoBehaviour implementing IASRRunner; auto-resolved when left empty (see ResolveASRRunnerComponent).
[SerializeField] private MonoBehaviour m_AsrRunnerComponent;
[SerializeField, Range(0f, 1f)] private float m_VadThreshold = 0.5f;
// Dual-purpose: size of the pre-speech history buffer (in hops) AND the number of
// consecutive silent hops that terminates a speech segment (see ProcessAudioChunks).
[SerializeField, Min(1)] private int m_PrePostBufferFrames = 20;
// Hard cap on a single speech segment's length, in seconds.
[SerializeField, Min(0.1f)] private float m_MaxAudioStreamSeconds = 10f;
private IASRRunner _activeRunner;
private float _currentVadProbability;
private TenVADRunner _vad;
private string _selectedMicrophone;
private AudioClip _microphoneClip;
// Last sample position read from _microphoneClip (non-WebGL path only).
private int _lastPosition = 0;
// Back-to-back hops the VAD flagged as silence while Speaking.
private int _consecutiveSilenceFrames = 0;
// Elapsed seconds within the current speech segment.
private float _currentRecordingTime = 0f;
// True between Listen() and StopListening(); keeps mic + VAD running across segments.
private bool _isListeningSession;
private bool _webglMicrophoneInitialized;
#if UNITY_WEBGL && !UNITY_EDITOR
private bool _webglMicPluginAvailable = true;
private bool _webglMicPluginWarningLogged;
#endif
// VAD frame size in samples: 256 samples at 16 kHz = 16 ms per hop.
private const int HOP_SIZE = 256;
private const int TARGET_SAMPLE_RATE = 16000;
private const float FPS_UPDATE_INTERVAL = 0.25f;
private const float VAD_TEXT_UPDATE_INTERVAL = 0.05f;
// Caps VAD/ASR work per Update() so a backlog cannot blow the frame budget.
private const int MAX_CHUNKS_PER_FRAME = 24;
#if UNITY_WEBGL && !UNITY_EDITOR
// WebGL has no Unity Microphone API; these bind to a custom .jslib microphone plugin.
[DllImport("__Internal")]
private static extern int WebGLMic_Start(int sampleRate);
[DllImport("__Internal")]
private static extern void WebGLMic_Stop();
[DllImport("__Internal")]
private static extern int WebGLMic_GetSamples(float[] buffer, int maxSamples);
[DllImport("__Internal")]
private static extern int WebGLMic_IsRecording();
#endif
// Raw mic samples awaiting VAD processing (2 s capacity at 16 kHz).
private CircularBuffer _microphoneCircularBuffer;
// Short rolling history kept while Listening so speech onsets are not clipped.
private CircularBuffer _preSpeechCircularBuffer;
// Reusable scratch arrays to avoid per-frame allocations.
private float[] _reusableReadBuffer;
private float[] _reusableProcessChunk;
private short[] _reusableShortChunk;
private float _fpsElapsed;
private int _fpsFrameCount;
private float _nextVadTextUpdateTime;
// Event for state changes
public event Action<State> OnStateChanged;
/// <summary>Raised with the sanitized final transcript of each speech segment.</summary>
public event Action<string> OnSpeechTextReceived;
// Unity entry point: builds buffers, initializes the ASR runner and VAD,
// selects a microphone, then transitions to Ready (or Error on any failure).
// NOTE(review): async void is unavoidable for Unity's Start(), but exceptions
// thrown after the first await are only observable via the catch below.
private async void Start()
{
SetState(State.Initializing);
try
{
InitializeBuffers();
await InitializeASRRunner();
// Hop size is passed as UIntPtr — presumably a native size_t in the TenVAD binding; confirm against TenVADRunner.
_vad = new TenVADRunner((UIntPtr)HOP_SIZE, m_VadThreshold);
UpdateVadProbabilityText();
InitializeMicrophone();
SetState(State.Ready);
}
catch (Exception e)
{
Debug.LogError($"[ASRManager] Initialization failed: {e.Message}\n{e.StackTrace}");
SetState(State.Error);
}
}
// Unity editor callback (component added / Reset context menu): auto-wire the runner reference.
private void Reset()
{
if (m_AsrRunnerComponent == null)
m_AsrRunnerComponent = ResolveASRRunnerComponent();
}
#if UNITY_EDITOR
// Editor-only: re-clamp inspector values and auto-wire the runner reference on edit.
private void OnValidate()
{
m_VadThreshold = Mathf.Clamp01(m_VadThreshold);
m_PrePostBufferFrames = Mathf.Max(1, m_PrePostBufferFrames);
m_MaxAudioStreamSeconds = Mathf.Max(0.1f, m_MaxAudioStreamSeconds);
if (m_AsrRunnerComponent == null)
m_AsrRunnerComponent = ResolveASRRunnerComponent();
}
#endif
// Per-frame pump: only drains mic data while actively capturing (Listening/Speaking).
private void Update()
{
UpdateFpsText();
if (_currentState == State.Listening || _currentState == State.Speaking)
{
ReadMicrophoneData();
ProcessAudioChunks();
CheckMicrophoneStatus();
}
}
// Tear-down: unsubscribe from the runner, stop any live recording, dispose natives.
private void OnDestroy()
{
if (_activeRunner != null)
{
_activeRunner.OnFinalResult -= OnFinalResultReceived;
}
if (!string.IsNullOrEmpty(_selectedMicrophone) &&
IsMicrophoneRecording(_selectedMicrophone))
{
EndMicrophone(_selectedMicrophone);
}
_vad?.Dispose();
_activeRunner?.Dispose();
}
/// <summary>
/// Switches the inference backend and reinitializes the runner asynchronously.
/// Any in-progress listening session is stopped WITHOUT transcribing its current segment.
/// </summary>
/// <returns>False if no runner exists, a reinit is already in flight, or STT is mid-processing; true once the switch is queued.</returns>
public bool TrySetInferenceBackend(InferenceBackend backend)
{
if (_activeRunner == null || _isReinitializing)
return false;
if (_currentState == State.STTProcessing)
return false;
if (_currentState == State.Listening || _currentState == State.Speaking)
StopListening(processCurrentSegment: false);
_activeRunner.SetPreferredBackend(ToRunnerBackend(backend));
// Fire-and-forget: completion is surfaced via SetState inside ReinitializeRunnerAsync.
_ = ReinitializeRunnerAsync();
return true;
}
private bool _isReinitializing;
// Rebuilds the runner on the newly selected backend; guards against re-entry
// via _isReinitializing and always clears the flag in finally.
private async Task ReinitializeRunnerAsync()
{
if (_isReinitializing)
return;
_isReinitializing = true;
SetState(State.Initializing);
try
{
await _activeRunner.ReinitializeAsync();
SetState(State.Ready);
}
catch (Exception e)
{
Debug.LogError($"[ASRManager] Reinitialization failed: {e.Message}\n{e.StackTrace}");
SetState(State.Error);
}
finally
{
_isReinitializing = false;
}
}
// Maps the public enum onto the inference engine's backend type (GPUCompute is the default).
private static BackendType ToRunnerBackend(InferenceBackend backend)
{
return backend == InferenceBackend.CPU ? BackendType.CPU : BackendType.GPUCompute;
}
/// <summary>
/// Begins a listening session: starts the microphone and enters Listening.
/// </summary>
/// <returns>True if already capturing or capture was started; false when the
/// state forbids it (busy/error/initializing) or the microphone failed to start.</returns>
public bool Listen()
{
switch (_currentState)
{
case State.Listening:
case State.Speaking:
return true;
case State.STTProcessing:
case State.Error:
case State.Initializing:
return false;
}
if (!_platformMicrophoneSupported)
{
return false;
}
_isListeningSession = true;
StartMicrophone();
if (_microphoneClip == null && !IsMicrophoneRecording(_selectedMicrophone))
return false;
SetState(State.Listening);
return true;
}
/// <summary>
/// Ends the current listening session and stops the microphone.
/// </summary>
/// <param name="processCurrentSegment">When true and speech is in progress, the
/// current segment is submitted for transcription; otherwise it is discarded.</param>
/// <returns>True if a session was actually stopped; false when not capturing.</returns>
public bool StopListening(bool processCurrentSegment = false)
{
switch (_currentState)
{
case State.Ready:
case State.STTProcessing:
case State.Error:
case State.Initializing:
return false;
}
_isListeningSession = false;
if (_currentState == State.Listening)
{
StopMicrophone();
_consecutiveSilenceFrames = 0;
_currentRecordingTime = 0f;
_preSpeechCircularBuffer.Clear();
// NOTE(review): in Listening no StartSpeechSegment() has happened yet —
// the runner must tolerate CancelSpeechSegment() without an open segment; verify.
_activeRunner.CancelSpeechSegment();
SetState(State.Ready);
return true;
}
if (_currentState == State.Speaking)
{
if (processCurrentSegment)
{
_activeRunner.EndSpeechSegment();
}
else
{
_activeRunner.CancelSpeechSegment();
}
_preSpeechCircularBuffer.Clear();
StopMicrophone();
SetState(State.Ready);
_consecutiveSilenceFrames = 0;
_currentRecordingTime = 0f;
return true;
}
return false;
}
// Allocates ring buffers and scratch arrays once, sized from inspector settings.
private void InitializeBuffers()
{
var bufferFrames = Mathf.Max(1, m_PrePostBufferFrames);
// 2 seconds of headroom at the target rate for raw mic samples.
_microphoneCircularBuffer = new CircularBuffer(TARGET_SAMPLE_RATE * 2);
_preSpeechCircularBuffer = new CircularBuffer(HOP_SIZE * bufferFrames);
_reusableReadBuffer = new float[HOP_SIZE * 4];
_reusableProcessChunk = new float[HOP_SIZE];
_reusableShortChunk = new short[HOP_SIZE];
}
// Resolves, validates, subscribes to, and initializes the IASRRunner component.
// Throws (and Start() maps it to State.Error) when no runner can be found.
private async Task InitializeASRRunner()
{
if (m_AsrRunnerComponent == null)
m_AsrRunnerComponent = ResolveASRRunnerComponent();
if (m_AsrRunnerComponent == null)
{
// NOTE(review): ArgumentNullException's single-string ctor takes paramName, not a
// message — this prose string will surface as the parameter name in the exception text.
throw new ArgumentNullException("ASR Runner Component is not assigned in the Inspector.");
}
_activeRunner = m_AsrRunnerComponent as IASRRunner;
if (_activeRunner == null)
{
throw new InvalidCastException($"The component '{m_AsrRunnerComponent.GetType().Name}' must implement IASRRunner.");
}
_activeRunner.OnFinalResult += OnFinalResultReceived;
await _activeRunner.Initialize();
}
// Finds any MonoBehaviour implementing IASRRunner: first on this GameObject,
// then anywhere in loaded scenes (including inactive objects). Returns null if none.
private MonoBehaviour ResolveASRRunnerComponent()
{
// Prefer a runner on the same GameObject to avoid cross-scene mismatches.
var localBehaviours = GetComponents<MonoBehaviour>();
foreach (var behaviour in localBehaviours)
{
if (behaviour is IASRRunner)
return behaviour;
}
#if UNITY_2023_1_OR_NEWER
var allBehaviours = FindObjectsByType<MonoBehaviour>(FindObjectsInactive.Include, FindObjectsSortMode.None);
#else
var allBehaviours = FindObjectsOfType<MonoBehaviour>(true);
#endif
foreach (var behaviour in allBehaviours)
{
if (behaviour is IASRRunner)
return behaviour;
}
return null;
}
// Picks the capture device. WebGL uses a fixed pseudo-device name; elsewhere
// the first enumerated Microphone device is selected. Throws when unavailable.
private void InitializeMicrophone()
{
if (!_platformMicrophoneSupported)
throw new NotSupportedException("Microphone API is not available on this platform.");
#if UNITY_WEBGL && !UNITY_EDITOR
if (_webglMicrophoneInitialized)
return;
_selectedMicrophone = "WebGL Microphone";
_webglMicrophoneInitialized = true;
return;
#else
string[] devices = Microphone.devices;
if (devices.Length == 0)
throw new InvalidOperationException("No microphone found.");
_selectedMicrophone = devices[0];
#endif
}
// Starts (or restarts) capture on the selected device. Idempotent: returns
// immediately if already recording. Lazily (re)selects a device if needed.
private void StartMicrophone()
{
if (!_platformMicrophoneSupported)
return;
if (IsMicrophoneRecording(_selectedMicrophone))
return;
if (string.IsNullOrEmpty(_selectedMicrophone))
InitializeMicrophone();
if (string.IsNullOrEmpty(_selectedMicrophone))
{
Debug.LogError("[ASRManager] No microphone is selected.");
return;
}
#if UNITY_WEBGL && !UNITY_EDITOR
int startResult;
try
{
startResult = WebGLMic_Start(TARGET_SAMPLE_RATE);
}
catch
{
// Plugin function missing from the build; reported below via startResult.
startResult = 0;
}
if (startResult != 1)
{
Debug.LogError($"[ASRManager] Failed to start WebGL microphone. Result={startResult}");
return;
}
// Placeholder clip: WebGL samples arrive via WebGLMic_GetSamples, not this clip,
// but other code paths use a non-null _microphoneClip as "capture active".
if (_microphoneClip == null)
_microphoneClip = AudioClip.Create("webgl-microphone", TARGET_SAMPLE_RATE, 1, TARGET_SAMPLE_RATE, false);
#else
// Looping clip sized one second beyond the segment cap so wrap-around reads stay ahead of the writer.
_microphoneClip = Microphone.Start(_selectedMicrophone, true, Mathf.Max(1, Mathf.CeilToInt(m_MaxAudioStreamSeconds) + 1), TARGET_SAMPLE_RATE);
_lastPosition = 0;
#endif
}
// Stops capture and drops the clip reference. Safe to call when already stopped.
private void StopMicrophone()
{
if (_microphoneClip == null && !IsMicrophoneRecording(_selectedMicrophone))
return;
if (IsMicrophoneRecording(_selectedMicrophone))
EndMicrophone(_selectedMicrophone);
_microphoneClip = null;
}
// Drains newly captured samples into _microphoneCircularBuffer.
// WebGL: pulls directly from the plugin. Otherwise: tracks Microphone.GetPosition
// against _lastPosition on the looping clip, handling writer wrap-around.
private void ReadMicrophoneData()
{
if (_microphoneClip == null || !IsMicrophoneRecording(_selectedMicrophone))
return;
#if UNITY_WEBGL && !UNITY_EDITOR
int sampleCount = 0;
try
{
sampleCount = WebGLMic_GetSamples(_reusableReadBuffer, _reusableReadBuffer.Length);
}
catch (Exception ex)
{
// Plugin call failed (e.g. function absent): disable the path and warn exactly once.
if (_webglMicPluginAvailable)
{
_webglMicPluginAvailable = false;
if (!_webglMicPluginWarningLogged)
{
Debug.LogWarning($"[ASRManager] WebGL microphone plugin is not available: {ex.Message}");
_webglMicPluginWarningLogged = true;
}
}
return;
}
if (sampleCount <= 0)
return;
_microphoneCircularBuffer.Write(_reusableReadBuffer, sampleCount);
#else
int currentPosition = Microphone.GetPosition(_selectedMicrophone);
if (currentPosition == _lastPosition) return;
// Wrap-aware distance from last read to the writer's position on the looping clip.
int sampleCount = (currentPosition > _lastPosition)
? (currentPosition - _lastPosition)
: (_microphoneClip.samples - _lastPosition + currentPosition);
if (sampleCount > 0)
{
int remaining = sampleCount;
int readPosition = _lastPosition;
while (remaining > 0)
{
int readLength = Mathf.Min(remaining, _reusableReadBuffer.Length);
// NOTE(review): GetData always fills _reusableReadBuffer.Length samples from
// readPosition, even when readLength is smaller, and readPosition + buffer
// length can run past the clip end near the wrap point — confirm Unity's
// GetData wrap semantics for looping microphone clips.
_microphoneClip.GetData(_reusableReadBuffer, readPosition);
_microphoneCircularBuffer.Write(_reusableReadBuffer, readLength);
remaining -= readLength;
readPosition = (readPosition + readLength) % _microphoneClip.samples;
}
}
_lastPosition = currentPosition;
#endif
}
// Consumes buffered audio in HOP_SIZE chunks (bounded per frame), runs VAD on
// each, and advances the Listening/Speaking state machine accordingly.
private void ProcessAudioChunks()
{
var availableChunks = _microphoneCircularBuffer.Count / HOP_SIZE;
if (availableChunks <= 0)
return;
var frameChunkBudget = Mathf.Clamp(availableChunks, 1, Mathf.Max(1, MAX_CHUNKS_PER_FRAME));
int chunksProcessed = 0;
while (_microphoneCircularBuffer.Count >= HOP_SIZE && chunksProcessed < frameChunkBudget)
{
_microphoneCircularBuffer.Read(_reusableProcessChunk, HOP_SIZE);
// Convert float samples to 16-bit PCM for the VAD.
// NOTE(review): no clamping — samples outside [-1, 1] overflow the short cast.
for (int i = 0; i < HOP_SIZE; i++)
{
_reusableShortChunk[i] = (short)(_reusableProcessChunk[i] * 32767.0f);
}
_vad.Process(_reusableShortChunk, out float probability, out int flag);
UpdateVadDebug(probability);
bool voiceDetected = flag == 1;
switch (_currentState)
{
case State.Listening:
// Keep a rolling pre-speech history so the segment start isn't clipped.
_preSpeechCircularBuffer.Write(_reusableProcessChunk, HOP_SIZE);
if (voiceDetected)
{
StartSpeech();
}
break;
case State.Speaking:
ProcessSpeechChunk(_reusableProcessChunk);
_currentRecordingTime += (float)HOP_SIZE / TARGET_SAMPLE_RATE;
if (voiceDetected)
{
_consecutiveSilenceFrames = 0;
}
else
{
_consecutiveSilenceFrames++;
// End the segment after m_PrePostBufferFrames consecutive silent hops.
if (_consecutiveSilenceFrames >= Mathf.Max(1, m_PrePostBufferFrames))
{
EndSpeech();
}
}
// Also end when the segment reaches the configured hard length cap.
if (_currentRecordingTime >= Mathf.Max(0.1f, m_MaxAudioStreamSeconds))
{
EndSpeech();
}
break;
}
chunksProcessed++;
}
}
// Transition Listening → Speaking: open a runner segment and replay the
// buffered pre-speech history into it so the utterance onset is included.
private void StartSpeech()
{
SetState(State.Speaking);
_currentRecordingTime = 0f;
_consecutiveSilenceFrames = 0;
_activeRunner.StartSpeechSegment();
// Process pre-speech buffer data
int preSpeechDataLength = _preSpeechCircularBuffer.Count;
while (preSpeechDataLength > 0)
{
int chunkLength = Mathf.Min(HOP_SIZE, preSpeechDataLength);
_preSpeechCircularBuffer.Read(_reusableProcessChunk, chunkLength);
ProcessSpeechChunk(_reusableProcessChunk, chunkLength);
preSpeechDataLength -= chunkLength;
}
}
// Forwards a chunk to the runner; length <= 0 (or out of range) means "whole array".
private void ProcessSpeechChunk(float[] audioChunk, int length = -1)
{
int sampleCount = (length <= 0 || length > audioChunk.Length) ? audioChunk.Length : length;
_activeRunner.ProcessAudioChunk(audioChunk, sampleCount);
}
// Transition Speaking → Listening (session mode) or STTProcessing (one-shot):
// closes the runner segment and resets per-segment counters.
private void EndSpeech()
{
if (_currentState != State.Speaking)
return;
_preSpeechCircularBuffer.Clear();
_activeRunner.EndSpeechSegment();
_consecutiveSilenceFrames = 0;
_currentRecordingTime = 0f;
if (_isListeningSession)
{
// Keep microphone + VAD running continuously while inference consumes queued segments.
SetState(State.Listening);
return;
}
StopMicrophone();
SetState(State.STTProcessing);
}
// Central state setter: no-op on same state; notifies OnStateChanged subscribers.
private void SetState(State newState)
{
if (_currentState == newState) return;
_currentState = newState;
OnStateChanged?.Invoke(newState);
}
// Watchdog while capturing: restarts the microphone if the device stopped
// recording (e.g. device hiccup) so the session keeps flowing.
private void CheckMicrophoneStatus()
{
if (!string.IsNullOrEmpty(_selectedMicrophone) && !IsMicrophoneRecording(_selectedMicrophone))
{
StartMicrophone();
}
}
// Platform-abstracted "is the mic recording" check; WebGL plugin failures read as false.
private static bool IsMicrophoneRecording(string deviceName)
{
#if UNITY_WEBGL && !UNITY_EDITOR
try
{
return WebGLMic_IsRecording() == 1;
}
catch
{
return false;
}
#else
return Microphone.IsRecording(deviceName);
#endif
}
// Platform-abstracted capture stop.
private static void EndMicrophone(string deviceName)
{
#if UNITY_WEBGL && !UNITY_EDITOR
try
{
WebGLMic_Stop();
}
catch
{
// Ignore if plugin call fails.
}
#else
Microphone.End(deviceName);
#endif
}
// Records the latest VAD probability (clamped to [0,1]) and refreshes the debug UI.
private void UpdateVadDebug(float probability)
{
_currentVadProbability = Mathf.Clamp01(probability);
UpdateVadProbabilityText();
}
// Throttled UI refresh of the VAD slider; fill turns green at/above the threshold.
private void UpdateVadProbabilityText()
{
if (m_VadProbabilitySlider == null)
return;
var interval = Mathf.Max(0.01f, VAD_TEXT_UPDATE_INTERVAL);
var now = Time.unscaledTime;
if (now < _nextVadTextUpdateTime)
return;
_nextVadTextUpdateTime = now + interval;
// Re-asserted every refresh so inspector edits cannot break the 0..1 display range.
m_VadProbabilitySlider.minValue = 0f;
m_VadProbabilitySlider.maxValue = 1f;
m_VadProbabilitySlider.wholeNumbers = false;
m_VadProbabilitySlider.value = _currentVadProbability;
var fill = ResolveVadProbabilityFillImage();
if (fill == null)
return;
fill.color = _currentVadProbability >= m_VadThreshold
? new Color32(0, 255, 0, 255)
: new Color32(255, 255, 255, 255);
}
// Lazily resolves (and caches) the slider's fill Image when not explicitly assigned.
private Image ResolveVadProbabilityFillImage()
{
if (m_VadProbabilityFill != null)
return m_VadProbabilityFill;
if (m_VadProbabilitySlider == null || m_VadProbabilitySlider.fillRect == null)
return null;
m_VadProbabilityFill = m_VadProbabilitySlider.fillRect.GetComponent<Image>();
return m_VadProbabilityFill;
}
// Averages frame count over FPS_UPDATE_INTERVAL (unscaled time) and shows it.
private void UpdateFpsText()
{
if (m_TextFps == null)
return;
_fpsElapsed += Time.unscaledDeltaTime;
_fpsFrameCount++;
var interval = Mathf.Max(0.05f, FPS_UPDATE_INTERVAL);
if (_fpsElapsed < interval)
return;
var fps = Mathf.RoundToInt(_fpsFrameCount / _fpsElapsed);
m_TextFps.text = $"FPS: {fps}";
_fpsElapsed = 0f;
_fpsFrameCount = 0;
}
// Runner callback: restores session state (restart mic, back to Listening) or
// Ready for one-shot mode, then publishes the sanitized transcript if non-blank.
private void OnFinalResultReceived(string final)
{
if (_isListeningSession)
{
if (!IsMicrophoneRecording(_selectedMicrophone))
StartMicrophone();
if (_currentState != State.Speaking)
SetState(State.Listening);
}
else
{
SetState(State.Ready);
}
var cleaned = SanitizeTranscriptText(final);
if (string.IsNullOrWhiteSpace(cleaned))
return;
OnSpeechTextReceived?.Invoke(cleaned);
}
// Strips U+FFFD replacement characters (decoder artifacts) from a transcript.
// Fast path: return the original string when none are present.
private static string SanitizeTranscriptText(string text)
{
if (string.IsNullOrEmpty(text))
return string.Empty;
if (text.IndexOf('\uFFFD') < 0)
return text;
return text.Replace("\uFFFD", string.Empty);
}
/// <summary>
/// Fixed-capacity float ring buffer. Writes overwrite the oldest samples when
/// full; reads throw when asked for more data than is buffered. Not thread-safe.
/// </summary>
private class CircularBuffer
{
private readonly float[] _buffer;
private int _head;   // next read index
private int _tail;   // next write index
private readonly int _capacity;
/// <summary>Number of samples currently buffered.</summary>
public int Count { get; private set; }
public CircularBuffer(int capacity)
{
_capacity = capacity;
_buffer = new float[capacity];
Clear();
}
// Appends the first `length` samples of `data`, evicting oldest samples on overflow.
public void Write(float[] data, int length)
{
for (int i = 0; i < length; i++)
{
_buffer[_tail] = data[i];
_tail = (_tail + 1) % _capacity;
if (Count == _capacity)
{
// Buffer is full: drop the oldest sample so the latest stream stays contiguous.
_head = (_head + 1) % _capacity;
}
else
{
Count++;
}
}
}
// Copies `length` samples into `destination` and consumes them.
// Throws InvalidOperationException when fewer than `length` samples are buffered.
public void Read(float[] destination, int length)
{
if (length > Count) throw new InvalidOperationException("Not enough data to read.");
for (int i = 0; i < length; i++)
{
destination[i] = _buffer[_head];
_head = (_head + 1) % _capacity;
}
Count -= length;
}
// Resets the buffer to empty (samples are abandoned, not zeroed).
public void Clear()
{
_head = 0;
_tail = 0;
Count = 0;
}
}
}