Mini-gpt-0.000001 / audio_tokenizer.py
AIencoder's picture
Create audio_tokenizer.py
7f1a2b7 verified
import torch
from transformers import EncodecModel, AutoProcessor
class AudioTokenizer:
    """Convert raw audio to discrete codec tokens and back using EnCodec.

    Wraps a pretrained ``EncodecModel`` together with its processor. The
    model is placed on CUDA when available, otherwise on the CPU; the chosen
    device is stored in ``self.device``.
    """

    def __init__(self, model_id: str = "facebook/encodec_24khz"):
        """Load the codec weights and processor identified by ``model_id``."""
        # Load a high-quality neural audio codec
        self.model = EncodecModel.from_pretrained(model_id)
        self.processor = AutoProcessor.from_pretrained(model_id)
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model.to(self.device)

    def encode(self, audio_array, sampling_rate: int = 24000):
        """Turn raw audio into a sequence of discrete tokens.

        Args:
            audio_array: Raw waveform handed to the processor as ``raw_audio``.
            sampling_rate: Sampling rate of ``audio_array`` in Hz, forwarded
                to the processor.

        Returns:
            The ``audio_codes`` produced by the model, on ``self.device``.
            NOTE(review): expected layout is
            ``[nb_frames, batch, codebooks, len]`` -- confirm against the
            installed transformers version.
        """
        inputs = self.processor(
            raw_audio=audio_array,
            sampling_rate=sampling_rate,
            return_tensors="pt",
        ).to(self.device)
        with torch.no_grad():
            encoded = self.model.encode(**inputs)
        return encoded.audio_codes

    def decode(self, codes, audio_scales=None):
        """Turn tokens back into playable audio (for TTS).

        Args:
            codes: Codes as returned by :meth:`encode`. Tensor inputs are
                moved to ``self.device`` so codes that were stored on
                another device can still be decoded.
            audio_scales: Optional sequence with one scale entry per leading
                entry of ``codes``. When omitted, one ``None`` per entry is
                used, which for a single-entry ``codes`` is exactly the
                previous hard-coded ``[None]``.
                NOTE(review): checkpoints that normalise their input
                presumably need the real scales from the encoder output --
                confirm for the checkpoint in use.

        Returns:
            The ``audio_values`` field of the model's decode output.
        """
        if isinstance(codes, torch.Tensor):
            codes = codes.to(self.device)
        if audio_scales is None:
            # One (absent) scale per entry so every chunk is paired up when
            # the model walks codes and scales together.
            audio_scales = [None] * len(codes)
        with torch.no_grad():
            # Reconstructs the waveform from the discrete indices
            decoded = self.model.decode(codes, audio_scales)
        return decoded.audio_values
# --- Example Usage ---
# import librosa
# audio, sr = librosa.load("hello.wav", sr=24000)
# tokenizer = AudioTokenizer()
# tokens = tokenizer.encode(audio)
# print(f"Audio converted to tokens: {tokens.shape}")