# moshi_general/inference.py

import logging
from pathlib import Path

import numpy as np
import sentencepiece
import torch
import torchaudio
from moshi.models import loaders, LMGen

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
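
# Pipeline overview (see the methods below): raw audio is resampled to 24 kHz,
# encoded by the MIMI codec into 8 codebooks at 12.5 frames/s (1920 samples per
# frame), stepped through the language model one frame at a time, and the
# resulting tokens are decoded back into audio plus a text transcript.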


class InferenceRecipe:
    """Handles model inference for the Any-to-Any model."""

    def __init__(self, model_path: str, device: str = 'cuda'):
        """Initialize the model.

        Args:
            model_path (str): Path to a model directory with pre-downloaded files.
            device (str): Device to run on ('cuda' or 'cpu').
        """
        self.device = torch.device(device)
        self.model_path = Path(model_path)

        # Sample rate and frame rate follow the model config in loaders.py.
        self.sample_rate = 24000
        self.frame_rate = 12.5

        # Initialize all model components.
        logger.info(f"Initializing models from {model_path}")
        self.mimi, self.text_tokenizer, self.lm_gen = self._initialize_models()
        self.mimi = self.mimi.to(self.device)
        self.lm_gen = self.lm_gen.to(self.device)
        logger.info("Model initialization complete")

    def _initialize_models(self):
        """Initialize all required model components."""
        try:
            # Load the MIMI codec for audio encoding/decoding.
            mimi_path = self.model_path / loaders.MIMI_NAME
            if not mimi_path.exists():
                raise RuntimeError(f"MIMI model not found at {mimi_path}")
            logger.info(f"Loading MIMI model from {mimi_path}")
            mimi = loaders.get_mimi(str(mimi_path), device=self.device)
            mimi.set_num_codebooks(8)

            # Load the text tokenizer.
            tokenizer_path = self.model_path / loaders.TEXT_TOKENIZER_NAME
            if not tokenizer_path.exists():
                raise RuntimeError(f"Text tokenizer not found at {tokenizer_path}")
            logger.info(f"Loading text tokenizer from {tokenizer_path}")
            text_tokenizer = sentencepiece.SentencePieceProcessor(str(tokenizer_path))

            # Load the language model and wrap it in a streaming generator.
            moshi_path = self.model_path / loaders.MOSHI_NAME
            if not moshi_path.exists():
                raise RuntimeError(f"Language model not found at {moshi_path}")
            logger.info(f"Loading language model from {moshi_path}")
            moshi = loaders.get_moshi_lm(str(moshi_path), device=self.device)
            lm_gen = LMGen(moshi, temp=0.8, temp_text=0.7)

            return mimi, text_tokenizer, lm_gen
        except Exception as e:
            logger.error(f"Model initialization failed: {e}")
            raise
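
    # NOTE: mimi.encode consumes (batch, channels, time) tensors, so the loader
    # below reshapes the mono numpy input to (1, 1, T) and trims it to a whole
    # number of 1920-sample codec frames (24000 / 12.5).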

    def _load_audio(self, audio_array: np.ndarray, sample_rate: int):
        """Load and preprocess audio."""
        try:
            # Convert to a (1, 1, T) float tensor (batch, channels, time),
            # the layout mimi.encode and the 3-D slicing below expect.
            wav = torch.from_numpy(audio_array).float().unsqueeze(0).unsqueeze(0)

            # Resample if needed.
            if sample_rate != self.sample_rate:
                logger.info(f"Resampling from {sample_rate} to {self.sample_rate}")
                # Create the resampler on the same device as the input.
                resampler = torchaudio.transforms.Resample(
                    orig_freq=sample_rate,
                    new_freq=self.sample_rate,
                ).to(self.device)
                # Move wav to the device before resampling.
                wav = resampler(wav.to(self.device))
            else:
                # No resampling needed; still ensure wav is on the right device.
                wav = wav.to(self.device)

            # Ensure frame alignment.
            frame_size = int(self.sample_rate / self.frame_rate)
            orig_length = wav.shape[-1]
            wav = wav[:, :, :(wav.shape[-1] // frame_size) * frame_size]
            if wav.shape[-1] != orig_length:
                logger.info(f"Trimmed audio from {orig_length} to {wav.shape[-1]} samples for frame alignment")
            return wav
        except Exception as e:
            logger.error(f"Audio loading failed: {e}")
            raise
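
    # NOTE: generation runs over a fixed minimum horizon; short inputs are
    # padded with encoded silence so the model always sees at least
    # time_seconds * frame_rate frames (375 frames at the 30 s default).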

    def _pad_codes(self, all_codes, time_seconds=30):
        """Pad the code list with encoded silence up to `time_seconds`."""
        try:
            min_frames = int(time_seconds * self.frame_rate)
            frame_size = int(self.sample_rate / self.frame_rate)
            if len(all_codes) < min_frames:
                frames_to_add = min_frames - len(all_codes)
                logger.info(f"Padding {frames_to_add} frames to reach minimum length")
                with torch.no_grad(), self.mimi.streaming(batch_size=1):
                    # A single frame of silence, created on the correct device.
                    chunk = torch.zeros(1, 1, frame_size, dtype=torch.float32, device=self.device)
                    for _ in range(frames_to_add):
                        additional_code = self.mimi.encode(chunk)
                        all_codes.append(additional_code)
            return all_codes
        except Exception as e:
            logger.error(f"Code padding failed: {e}")
            raise
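
    # NOTE: encoding is streamed frame by frame inside mimi.streaming(); each
    # 1920-sample frame yields one code tensor of shape (1, 8, 1): batch 1,
    # 8 codebooks, 1 timestep (checked by the asserts here and in _generate).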

    def _encode_audio(self, wav: torch.Tensor):
        """Convert audio to codes."""
        try:
            frame_size = int(self.sample_rate / self.frame_rate)
            all_codes = []
            with torch.no_grad(), self.mimi.streaming(batch_size=1):
                for offset in range(0, wav.shape[-1], frame_size):
                    frame = wav[:, :, offset: offset + frame_size]
                    codes = self.mimi.encode(frame.to(self.device))
                    assert codes.shape[-1] == 1, f"Expected code shape (*, *, 1), got {codes.shape}"
                    all_codes.append(codes)
            logger.info(f"Encoded {len(all_codes)} frames")
            return all_codes
        except Exception as e:
            logger.error(f"Audio encoding failed: {e}")
            raise
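
    # NOTE: the warmup pushes one frame of silence through encode -> step ->
    # decode so that one-time costs (CUDA context setup, kernel selection) are
    # paid before real generation; streaming state is discarded when the
    # context managers exit.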

    def _warmup(self):
        """Run a warmup pass."""
        try:
            frame_size = int(self.sample_rate / self.frame_rate)
            # Create the tensor on the correct device from the start.
            chunk = torch.zeros(1, 1, frame_size, dtype=torch.float32, device=self.device)
            with torch.no_grad(), self.lm_gen.streaming(1), self.mimi.streaming(1):
                codes = self.mimi.encode(chunk)
                tokens = self.lm_gen.step(codes[:, :, 0:1])
                if tokens is not None:
                    _ = self.mimi.decode(tokens[:, 1:])
            if self.device.type == 'cuda':
                torch.cuda.synchronize()
            logger.info("Warmup pass completed")
        except Exception as e:
            logger.error(f"Warmup failed: {e}")
            raise
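
    # NOTE: each lm_gen.step output packs the streams as [text, audio...]:
    # index 0 along dim 1 is the text token, and the remaining entries are the
    # audio codebooks handed to mimi.decode. Token ids 0 and 3 appear to be
    # special/padding values and are skipped when assembling the transcript.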

    def _generate(self, all_codes):
        """Generate audio and text from codes."""
        try:
            out_wav_chunks = []
            text_output = []
            with torch.no_grad(), self.lm_gen.streaming(1), self.mimi.streaming(1):
                for i, code in enumerate(all_codes):
                    assert code.shape == (1, 8, 1), f"Expected code shape (1, 8, 1), got {code.shape}"
                    tokens_out = self.lm_gen.step(code.to(self.device))
                    if tokens_out is not None:
                        # Generate audio.
                        wav_chunk = self.mimi.decode(tokens_out[:, 1:])
                        out_wav_chunks.append(wav_chunk)
                        # Generate text if available.
                        text_token = tokens_out[0, 0, 0].item()
                        if text_token not in (0, 3):
                            _text = self.text_tokenizer.id_to_piece(text_token)
                            _text = _text.replace("▁", " ")
                            text_output.append(_text)
                    if (i + 1) % 100 == 0:
                        logger.info(f"Processed {i + 1}/{len(all_codes)} frames")
            wav = torch.cat(out_wav_chunks, dim=-1)
            text = ''.join(text_output)
            logger.info(f"Generated {wav.shape[-1]} samples of audio and {len(text)} characters of text")
            return wav, text
        except Exception as e:
            logger.error(f"Generation failed: {e}")
            raise

    def inference(self, audio_array: np.ndarray, sample_rate: int) -> dict:
        """Run inference on input audio.

        Args:
            audio_array (np.ndarray): Input audio as a numpy array.
            sample_rate (int): Sample rate of the input audio.

        Returns:
            dict: Contains the generated audio array and any transcribed text.
        """
        try:
            logger.info(f"Starting inference on {len(audio_array)} samples at {sample_rate} Hz on device {self.device}")
            # Load and preprocess audio (returned already on self.device).
            wav = self._load_audio(audio_array, sample_rate)
            # Convert to codes and pad with encoded silence.
            all_codes = self._encode_audio(wav)
            all_codes = self._pad_codes(all_codes)
            # Warmup pass.
            self._warmup()
            # Generate output.
            out_wav, text = self._generate(all_codes)
            # Convert output to numpy.
            output = out_wav.cpu().numpy().squeeze()
            logger.info("Inference completed successfully")
            return {
                "audio": output,
                "text": text,
            }
        except Exception as e:
            logger.error(f"Inference failed: {e}")
            raise


if __name__ == "__main__":
    # Example usage
    import librosa

    # Initialize the model.
    model = InferenceRecipe("/path/to/models", device="cuda")

    # Load test audio at its native sample rate.
    audio, sr = librosa.load("test.wav", sr=None)

    # Run inference.
    result = model.inference(audio, sr)
    print(f"Generated {len(result['audio'])} samples of audio")
    print(f"Generated text: {result['text']}")