Marcos12886 committed on
Commit
33c23f4
·
verified ·
1 Parent(s): 1e6dc54

model.py: probar cargando datasets hf

Browse files
Files changed (1) hide show
  1. model.py +124 -57
model.py CHANGED
@@ -1,104 +1,171 @@
1
- import torch
2
- import numpy as np
3
  import os
 
 
 
 
 
4
  from huggingface_hub import login, upload_folder
5
- from datasets import load_dataset, Audio
6
  from transformers.integrations import TensorBoardCallback
 
7
  from transformers import (
8
- Wav2Vec2FeatureExtractor, AutoModelForAudioClassification,
9
  Trainer, TrainingArguments,
10
  EarlyStoppingCallback
11
  )
12
- import json
13
- # SE USA FLOAT32 EN EL MODELO ORIGINAL
14
  MODEL = "ntu-spml/distilhubert" # modelo base utilizado, para usar otro basta con cambiar esto
15
  FEATURE_EXTRACTOR = Wav2Vec2FeatureExtractor.from_pretrained(MODEL)
16
  seed = 123
17
  MAX_DURATION = 1.00
18
- SAMPLING_RATE = FEATURE_EXTRACTOR.sampling_rate # 16000 # antes estaba float16
19
- token = os.getenv('MODEL_REPO_ID')
20
  config_file = "models_config.json"
21
  clasificador = "class"
22
  monitor = "mon"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
 
24
  def seed_everything():
25
- np.random.seed(seed)
26
  torch.manual_seed(seed)
27
  torch.cuda.manual_seed(seed)
28
  torch.backends.cudnn.deterministic = True
29
  torch.backends.cudnn.benchmark = False
30
  os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':16384:8'
31
 
32
- def preprocess_audio(audio_arrays, batch=True):
33
- if batch:
34
- audios = [x["array"] for x in audio_arrays["audio"]] # para usar aquí
35
- else:
36
- audios = [audio_arrays] # para usar en realtime.py
37
- inputs = Wav2Vec2FeatureExtractor.from_pretrained(MODEL)(
38
- raw_speech=audios,
39
- sampling_rate=SAMPLING_RATE,
40
- return_tensors="pt", # Devolver tensores de PyTorch
41
- max_length=int(SAMPLING_RATE * MAX_DURATION), # Necesario para truncation
42
- truncation=True, # Muchísimo más rápido.
43
- padding=True, # Vectores igual longitud
44
- do_normalize=True, # No afecta 1ª época, no sé si necesario
45
- # return_attention_mask=True, # Empeora 1ª época. No sé si necesario
46
- padding_value=0.0, # No afecta 1ª época, no sé si necesario
47
- float=32 # No afecta 1ª época, no sé si necesario
48
- )
49
- return inputs
50
 
51
- def load_and_prepare_dataset(dataset_path):
52
- dataset = load_dataset(dataset_path, split="train") # Split para que no ponga train de primeras
53
- # dataset = dataset.cast_column("audio", Audio(sampling_rate=SAMPLING_RATE)) # Da mejor accuracy pero creo que cambia el preprocesado.
54
- encoded_dataset = dataset.map(preprocess_audio, remove_columns=["audio"], batched=True) # num_proc hace q no vaya realtime
55
- labels = encoded_dataset.features["label"].names
56
- label2id = {label: str(i) for i, label in enumerate(labels)}
57
- id2label = {str(i): label for i, label in enumerate(labels)}
58
- encoded_dataset = encoded_dataset.train_test_split(test_size=0.2, seed=seed, stratify_by_column="label")
59
- return encoded_dataset, label2id, id2label
 
 
 
 
 
 
 
 
 
60
 
61
  def load_model(num_labels, label2id, id2label):
62
- model = AutoModelForAudioClassification.from_pretrained(
63
  MODEL,
64
  num_labels=num_labels,
65
  label2id=label2id,
66
- id2label=id2label
 
 
 
 
 
 
 
67
  )
 
68
  return model
69
 
70
  def model_params(dataset_path):
71
- login(token, add_to_git_credential=True)
72
- seed_everything()
73
- encoded_dataset, label2id, id2label = load_and_prepare_dataset(dataset_path)
74
- model = load_model(len(id2label), label2id, id2label)
75
- return model, encoded_dataset, id2label
76
 
77
  def compute_metrics(eval_pred):
78
- predictions = np.argmax(eval_pred.predictions, axis=1)
79
- references = eval_pred.label_ids
 
 
80
  return {
81
- "accuracy": np.mean(predictions == references),
 
 
 
82
  }
83
 
84
- def model_training(training_args, output_dir, dataset_path):
85
- model, encoded_dataset, _ = model_params(dataset_path)
86
- tensorboard_callback = TensorBoardCallback()
87
- early_stopping_callback = EarlyStoppingCallback(early_stopping_patience=3)
88
  trainer = Trainer(
89
  model=model,
90
  args=training_args,
91
  compute_metrics=compute_metrics,
92
- train_dataset=encoded_dataset["train"],
93
- eval_dataset=encoded_dataset["test"],
94
- callbacks=[tensorboard_callback, early_stopping_callback]
95
  )
96
  torch.cuda.empty_cache() # liberar memoria de la GPU
97
  trainer.train() # se pueden modificar los parámetros para continuar el train
 
98
  trainer.push_to_hub(token=token) # Subir modelo a mi cuenta. Necesario para hacer la predicción, no sé por qué.
99
  trainer.save_model(output_dir) # para subir el modelo a Hugging Face. Necesario para hacer la predicción, no sé por qué.
100
  os.makedirs(output_dir, exist_ok=True) # Crear carpeta con el modelo si no existe
101
- upload_folder(repo_id=f"A-POR-LOS-8000/{output_dir}",folder_path=output_dir, token=token) # subir modelo a organización
102
 
103
  def load_config(model_name):
104
  with open(config_file, 'r') as f:
@@ -109,9 +176,9 @@ def load_config(model_name):
109
  return model_config
110
 
111
  if __name__ == "__main__":
112
- config = load_config(clasificador) # PARA CAMBIAR MODELOS
113
- # config = load_config(monitor) # PARA CAMBI
114
  training_args = config["training_args"]
115
  output_dir = config["output_dir"]
116
  dataset_path = config["dataset_path"]
117
- model_training(training_args, output_dir, dataset_path)
 
 
 
1
  import os
2
+ import json
3
+ import random
4
+ import torch
5
+ import torchaudio
6
+ from torch.utils.data import Dataset, DataLoader
7
  from huggingface_hub import login, upload_folder
 
8
  from transformers.integrations import TensorBoardCallback
9
+ from sklearn.metrics import accuracy_score, precision_recall_fscore_support
10
  from transformers import (
11
+ Wav2Vec2FeatureExtractor, HubertConfig, HubertForSequenceClassification,
12
  Trainer, TrainingArguments,
13
  EarlyStoppingCallback
14
  )
15
+
 
16
  MODEL = "ntu-spml/distilhubert" # base checkpoint; change this string to fine-tune a different model
17
  FEATURE_EXTRACTOR = Wav2Vec2FeatureExtractor.from_pretrained(MODEL) # built once at import and reused for every clip
18
  seed = 123 # seed applied by seed_everything()
19
  MAX_DURATION = 1.00 # clip length (presumably seconds); times SAMPLING_RATE gives the fixed samples per clip
20
+ SAMPLING_RATE = FEATURE_EXTRACTOR.sampling_rate # 16000
21
+ token = os.getenv("HF_TOKEN") # Hugging Face token read from the environment; None when the variable is unset
22
  config_file = "models_config.json" # file opened by load_config()
23
  clasificador = "class" # model name accepted by load_config()
24
  monitor = "mon" # model name accepted by load_config()
25
+ batch_size = 16 # batch size used for both DataLoaders in create_dataloader()
26
+
class AudioDataset(Dataset):
    """Map-style dataset over audio files stored as ``<dataset_path>/<label>/<file>``.

    Each item is a dict with ``input_values`` (the feature-extracted waveform)
    and ``labels`` (the integer class id wrapped in a tensor).
    """

    def __init__(self, dataset_path, label2id):
        """Index every file found inside each label directory.

        Args:
            dataset_path: Root directory holding one sub-directory per label.
            label2id: Mapping from label directory name to integer class id.
        """
        self.dataset_path = dataset_path
        self.label2id = label2id
        self.file_paths = []
        self.labels = []
        for label_dir, label_id in self.label2id.items():
            label_path = os.path.join(self.dataset_path, label_dir)
            if not os.path.isdir(label_path):
                continue  # a label without a directory contributes no samples
            # Sorted so sample order does not depend on os.listdir() ordering,
            # which keeps index-based train/test splits reproducible.
            for file_name in sorted(os.listdir(label_path)):
                self.file_paths.append(os.path.join(label_path, file_name))
                self.labels.append(label_id)

    def __len__(self):
        """Return the number of indexed audio files."""
        return len(self.file_paths)

    def __getitem__(self, idx):
        """Load, preprocess and return the sample at position ``idx``."""
        return {
            "input_values": self.preprocess_audio(self.file_paths[idx]),
            "labels": torch.tensor(self.labels[idx]),
        }

    def preprocess_audio(self, audio_path):
        """Turn an audio file into a fixed-length model input.

        Steps: load, resample to ``SAMPLING_RATE`` if needed, down-mix to
        mono, peak-normalize, cut or zero-pad to
        ``SAMPLING_RATE * MAX_DURATION`` samples, then run the feature
        extractor.

        Args:
            audio_path: Path of the audio file to load.

        Returns:
            The extractor's ``input_values`` with singleton dimensions removed.
        """
        waveform, sample_rate = torchaudio.load(
            audio_path,
            normalize=True,  # converts samples to float32
        )
        if sample_rate != SAMPLING_RATE:
            resampler = torchaudio.transforms.Resample(sample_rate, SAMPLING_RATE)
            waveform = resampler(waveform)
        # torchaudio returns (channels, frames). Always collapse the channel
        # axis: the previous `shape[0] > 1` guard left mono clips 2-D, so the
        # length checks below measured the channel count instead of the
        # number of samples and mono audio was never truncated.
        waveform = waveform.mean(dim=0)
        # The epsilon keeps the division finite for silent clips
        # (author's note: accuracy is very poor without it).
        waveform = waveform / (torch.max(torch.abs(waveform)) + 1e-6)
        max_length = int(SAMPLING_RATE * MAX_DURATION)
        num_samples = waveform.shape[0]
        if num_samples > max_length:
            waveform = waveform[:max_length]
        else:
            waveform = torch.nn.functional.pad(waveform, (0, max_length - num_samples))
        # Length is already fixed above, so no truncation arguments are passed.
        inputs = FEATURE_EXTRACTOR(
            waveform,
            sampling_rate=SAMPLING_RATE,
            return_tensors="pt",
            padding=True,
        )
        return inputs.input_values.squeeze()
79
 
def seed_everything(seed_value=None):
    """Seed the RNGs this script relies on and request deterministic CUDA kernels.

    Args:
        seed_value: Seed to apply. When omitted, the module-level ``seed`` is used.
    """
    import random  # local import so this block does not rely on a file-level one

    if seed_value is None:
        seed_value = seed
    # The train/test split is shuffled with the `random` module, so it has to
    # be seeded as well or the split changes on every run.
    random.seed(seed_value)
    torch.manual_seed(seed_value)
    torch.cuda.manual_seed(seed_value)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':16384:8'
86
 
def build_label_mappings(dataset_path):
    """Derive label<->id mappings from the sub-directories of ``dataset_path``.

    Directory names are sorted before ids are assigned, so a given dataset
    always yields the same ids regardless of the order in which the
    filesystem lists its entries. Plain files are ignored.

    Args:
        dataset_path: Root directory containing one sub-directory per label.

    Returns:
        A ``(label2id, id2label)`` tuple of dicts with consecutive ids from 0.
    """
    label_dirs = sorted(
        entry
        for entry in os.listdir(dataset_path)
        if os.path.isdir(os.path.join(dataset_path, entry))
    )
    label2id = {name: idx for idx, name in enumerate(label_dirs)}
    id2label = {idx: name for name, idx in label2id.items()}
    return label2id, id2label
 
 
 
 
 
 
 
 
97
 
def create_dataloader(dataset_path, test_size=0.2, num_workers=12, shuffle=True, pin_memory=True):
    """Build train and test DataLoaders from a label-per-directory dataset.

    Args:
        dataset_path: Root directory containing one sub-directory per label.
        test_size: Fraction of the samples assigned to the test subset.
        num_workers: Worker processes used by each DataLoader.
        shuffle: Whether the training DataLoader reshuffles every epoch.
        pin_memory: Forwarded to both DataLoaders.

    Returns:
        ``(train_dataloader, test_dataloader, label2id, id2label)``.
    """
    label2id, id2label = build_label_mappings(dataset_path)
    dataset = AudioDataset(dataset_path, label2id)
    indices = list(range(len(dataset)))
    # A dedicated generator seeded with the module-level `seed` keeps the
    # split identical across runs, independent of the global RNG state.
    random.Random(seed).shuffle(indices)
    split_idx = int(len(indices) * (1 - test_size))
    train_dataset = torch.utils.data.Subset(dataset, indices[:split_idx])
    test_dataset = torch.utils.data.Subset(dataset, indices[split_idx:])
    loader_kwargs = {
        "batch_size": batch_size,
        "num_workers": num_workers,
        "pin_memory": pin_memory,
    }
    train_dataloader = DataLoader(train_dataset, shuffle=shuffle, **loader_kwargs)
    # Evaluation gains nothing from shuffling; a fixed order keeps runs comparable.
    test_dataloader = DataLoader(test_dataset, shuffle=False, **loader_kwargs)
    return train_dataloader, test_dataloader, label2id, id2label
116
 
def load_model(num_labels, label2id, id2label):
    """Load the base checkpoint with a classification head for the given labels.

    Args:
        num_labels: Number of target classes.
        label2id: Mapping from label name to class id, stored in the config.
        id2label: Mapping from class id to label name, stored in the config.

    Returns:
        The model, moved to CUDA when available and to the CPU otherwise.
    """
    hubert_config = HubertConfig.from_pretrained(
        MODEL,
        num_labels=num_labels,
        label2id=label2id,
        id2label=id2label,
        finetuning_task="audio-classification",
    )
    # TODO: review the remaining from_pretrained parameters for possible optimizations.
    classifier = HubertForSequenceClassification.from_pretrained(
        MODEL,
        config=hubert_config,
        torch_dtype=torch.float32,  # author's note: no effect on the first epoch, kept explicit
    )
    target_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    classifier.to(target_device)
    return classifier
133
 
def model_params(dataset_path):
    """Create the data loaders for ``dataset_path`` and a model sized to its labels.

    Returns:
        ``(model, train_dataloader, test_dataloader, id2label)``.
    """
    loaders_and_mappings = create_dataloader(dataset_path)
    train_dataloader, test_dataloader, label2id, id2label = loaders_and_mappings
    class_count = len(id2label)
    model = load_model(class_count, label2id, id2label)
    return model, train_dataloader, test_dataloader, id2label
 
 
138
 
def compute_metrics(eval_pred):
    """Compute accuracy and weighted precision/recall/F1 for an evaluation pass.

    Args:
        eval_pred: Object exposing ``predictions`` (per-class scores) and
            ``label_ids`` (reference class ids).

    Returns:
        Dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
    """
    y_pred = torch.tensor(eval_pred.predictions).argmax(dim=-1)
    y_true = torch.tensor(eval_pred.label_ids)
    accuracy = accuracy_score(y_true, y_pred)
    weighted_scores = precision_recall_fscore_support(y_true, y_pred, average='weighted')
    # The fourth element (support) is not reported.
    precision, recall, f1 = weighted_scores[:3]
    return dict(accuracy=accuracy, precision=precision, recall=recall, f1=f1)
150
 
def main(training_args, output_dir, dataset_path):
    """Fine-tune the model, save it locally, then publish it to the Hugging Face Hub.

    Args:
        training_args: Passed unchanged as ``Trainer(args=...)``
            (presumably a ``TrainingArguments`` built by ``load_config`` - confirm).
        output_dir: Local directory that receives the final model.
        dataset_path: Root directory of the label-per-directory dataset.
    """
    seed_everything()
    model, train_dataloader, test_dataloader, _ = model_params(dataset_path)
    trainer = Trainer(
        model=model,
        args=training_args,
        compute_metrics=compute_metrics,
        # Trainer builds its own loaders, so only the underlying datasets are handed over.
        train_dataset=train_dataloader.dataset,
        eval_dataset=test_dataloader.dataset,
        callbacks=[TensorBoardCallback(), EarlyStoppingCallback(early_stopping_patience=3)],
    )
    torch.cuda.empty_cache()  # free cached GPU memory before training
    trainer.train()  # arguments can be passed here to resume a previous run
    # Persist the result locally before touching the network, so a failed
    # login or push does not leave output_dir empty after training.
    os.makedirs(output_dir, exist_ok=True)
    trainer.save_model(output_dir)
    login(token, add_to_git_credential=True)
    trainer.push_to_hub(token=token)  # author's note: required for prediction to work
    # Uploading output_dir to the A-POR-LOS-8000 organization with
    # upload_folder() is currently disabled.
169
 
170
  def load_config(model_name):
171
  with open(config_file, 'r') as f:
 
176
  return model_config
177
 
if __name__ == "__main__":
    # Selects which configuration entry to train; pass `clasificador`
    # instead of `monitor` to train the other model.
    config = load_config(monitor)
    training_args, output_dir, dataset_path = (
        config["training_args"],
        config["output_dir"],
        config["dataset_path"],
    )
    main(training_args, output_dir, dataset_path)