# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

# This code is modified from https://huggingface.co/m-a-p/MERT-v1-330M

import torch
from tqdm import tqdm
import numpy as np
from transformers import Wav2Vec2FeatureExtractor
from transformers import AutoModel
import torchaudio
import torchaudio.transforms as T
from sklearn.preprocessing import StandardScaler


def mert_encoder(model, processor, audio_path, hps):
    """Encode one audio file into a single MERT hidden-layer feature matrix.

    # mert default sr: 24000

    Args:
        model: pretrained MERT model (as returned by ``load_mert_model``).
        processor: matching ``Wav2Vec2FeatureExtractor``.
        audio_path: path of the audio file to encode.
        hps: hyper-parameter namespace; ``hps.mert_feature_layer`` selects
            which hidden state (0..24) to return.

    Returns:
        ``np.ndarray`` of shape (frame_len, 1024) from the selected layer.
    """
    with torch.no_grad():
        resample_rate = processor.sampling_rate
        device = next(model.parameters()).device

        input_audio, sampling_rate = torchaudio.load(audio_path)
        # NOTE(review): squeeze() assumes mono audio; a multi-channel file
        # would keep shape (channels, samples) — confirm upstream data.
        input_audio = input_audio.squeeze()

        # Resample only when the file's rate differs from the model's rate.
        if sampling_rate != resample_rate:
            resampler = T.Resample(sampling_rate, resample_rate)
            input_audio = resampler(input_audio)

        # {input_values: tensor, attention_mask: tensor}
        inputs = processor(
            input_audio, sampling_rate=resample_rate, return_tensors="pt"
        ).to(device)

        # hidden_states is a tuple of 25 tensors, each [1, frame_len, 1024].
        outputs = model(**inputs, output_hidden_states=True)

        # [1, frame len, 1024] -> [frame len, 1024]
        feature = outputs.hidden_states[hps.mert_feature_layer].squeeze()

    return feature.cpu().detach().numpy()


def mert_features_normalization(raw_mert_features):
    """Standardize per-utterance MERT features with a globally-fit scaler.

    Args:
        raw_mert_features: list of per-utterance feature arrays.

    Returns:
        list of arrays, each transformed by the shared ``StandardScaler``.
    """
    normalized_mert_features = list()

    mert_features = np.array(raw_mert_features)
    scaler = StandardScaler().fit(mert_features)
    # Bug fix: the original looped over `raw_mert_feature` (the loop variable
    # itself, undefined at loop entry) instead of the input list.
    for raw_mert_feature in raw_mert_features:
        normalized_mert_feature = scaler.transform(raw_mert_feature)
        normalized_mert_features.append(normalized_mert_feature)
    return normalized_mert_features


def get_mapped_mert_features(raw_mert_features, mapping_features, fast_mapping=True):
    """Resample frame-level MERT features from hop 320 to hop 256.

    Each MERT frame (hop 320 samples) is expanded and re-averaged so the
    resulting sequence aligns frame-for-frame with mel features computed at
    hop 256. After GCD reduction, every 4 source frames map to 5 target
    frames.

    Args:
        raw_mert_features: list of torch tensors, one per utterance, each of
            shape (frame_len, 1024).
        mapping_features: list of target-frame-rate features (e.g. mels),
            one per utterance; only their lengths are used.
        fast_mapping: unused; kept for interface compatibility.

    Returns:
        list of ``np.ndarray``, each of shape (target_len, 1024).
    """
    source_hop = 320
    target_hop = 256

    # Reduce the hop ratio by the GCD so the repeat/average factors are small.
    factor = np.gcd(source_hop, target_hop)
    source_hop //= factor
    target_hop //= factor
    print(
        "Mapping source's {} frames => target's {} frames".format(
            target_hop, source_hop
        )
    )

    mert_features = []
    for index, mapping_feat in enumerate(tqdm(mapping_features)):
        # mapping_feat: (mels_frame_len, n_mels)
        target_len = mapping_feat.shape[0]

        # (frame_len, 1024)
        raw_feats = raw_mert_features[index].cpu().numpy()
        source_len, width = raw_feats.shape

        # const ~= target_len * target_hop
        const = source_len * source_hop // target_hop * target_hop

        # (source_len * source_hop, dim)
        up_sampling_feats = np.repeat(raw_feats, source_hop, axis=0)
        # (const, dim) -> (const/target_hop, target_hop, dim) -> (const/target_hop, dim)
        down_sampling_feats = np.average(
            up_sampling_feats[:const].reshape(-1, target_hop, width), axis=1
        )

        err = abs(target_len - len(down_sampling_feats))
        if err > 3:
            # A large length mismatch indicates corrupted/misaligned data;
            # dump diagnostics and abort the run.
            print("index:", index)
            print("mels:", mapping_feat.shape)
            print("raw mert vector:", raw_feats.shape)
            print("up_sampling:", up_sampling_feats.shape)
            print("const:", const)
            print("down_sampling_feats:", down_sampling_feats.shape)
            exit()
        if len(down_sampling_feats) < target_len:
            # Pad by repeating the last frame: (1, dim) -> (err, dim)
            end = down_sampling_feats[-1][None, :].repeat(err, axis=0)
            down_sampling_feats = np.concatenate([down_sampling_feats, end], axis=0)

        # (target_len, dim)
        feats = down_sampling_feats[:target_len]
        mert_features.append(feats)

    return mert_features


def load_mert_model(hps):
    """Load the MERT model named by ``hps.mert_model`` and its preprocessor.

    Moves the model to CUDA when available. Returns ``(model, preprocessor)``.
    """
    print("Loading MERT Model: ", hps.mert_model)

    # Load model
    model_name = hps.mert_model
    model = AutoModel.from_pretrained(model_name, trust_remote_code=True)

    if torch.cuda.is_available():
        model = model.cuda()

    # model = model.eval()

    # Load the corresponding preprocessor config.
    preprocessor = Wav2Vec2FeatureExtractor.from_pretrained(
        model_name, trust_remote_code=True
    )
    return model, preprocessor


# loading the corresponding preprocessor config
# def load_preprocessor (model_name="m-a-p/MERT-v1-330M"):
#     print('load_preprocessor...')
#     preprocessor = Wav2Vec2FeatureExtractor.from_pretrained(model_name,trust_remote_code=True)
#     return preprocessor