import torch
from transformers import EncodecModel, AutoProcessor
class AudioTokenizer:
    """Wrap a pretrained EnCodec neural codec to convert raw audio into
    discrete token sequences and back into waveforms."""

    def __init__(self, model_id="facebook/encodec_24khz"):
        # Load a high-quality neural audio codec and its matching processor.
        self.model = EncodecModel.from_pretrained(model_id)
        self.processor = AutoProcessor.from_pretrained(model_id)
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model.to(self.device)
        # Inference-only wrapper: disable dropout / training-mode layers.
        self.model.eval()

    def encode(self, audio_array, sampling_rate=24000):
        """Turn raw audio into a sequence of discrete tokens.

        Args:
            audio_array: 1-D raw waveform (e.g. from ``librosa.load``).
            sampling_rate: sample rate of ``audio_array``; must match the
                codec's expected rate (24000 for facebook/encodec_24khz).

        Returns:
            Tensor of codebook indices with shape
            [num_chunks, batch, codebooks, length].
        """
        inputs = self.processor(
            raw_audio=audio_array,
            sampling_rate=sampling_rate,
            return_tensors="pt",
        ).to(self.device)
        with torch.no_grad():
            encoded = self.model.encode(**inputs)
        # NOTE(review): encoded.audio_scales is discarded here. That is fine
        # for the non-normalizing 24 kHz model, but lossy for chunk-normalizing
        # variants (e.g. encodec_48khz) — confirm before switching model_id.
        return encoded.audio_codes

    def decode(self, codes, audio_scales=None):
        """Turn tokens back into playable audio (for TTS).

        Args:
            codes: codebook indices as returned by :meth:`encode`.
            audio_scales: optional per-chunk scales from the encoder output.
                Defaults to no scaling for every chunk.

        Returns:
            Tensor of reconstructed audio samples.
        """
        if audio_scales is None:
            # Bug fix: the previous hard-coded [None] only covered a single
            # encoded chunk; supply one None scale per chunk instead.
            audio_scales = [None] * len(codes)
        with torch.no_grad():
            # Reconstruct the waveform from the discrete indices.
            decoded = self.model.decode(codes, audio_scales)
        return decoded.audio_values
# --- Example Usage ---
# import librosa
# audio, sr = librosa.load("hello.wav", sr=24000)
# tokenizer = AudioTokenizer()
# tokens = tokenizer.encode(audio)
# print(f"Audio converted to tokens: {tokens.shape}")