```python
# src/nlp_utils.py
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from transformers import AutoModel


class BetoClassifier(nn.Module):
    """BETO (Spanish BERT) encoder with a dropout + linear classification head."""

    def __init__(self, n_classes, model_name="dccuchile/bert-base-spanish-wwm-cased"):
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        self.drop = nn.Dropout(p=0.3)
        self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # pooler_output: the [CLS] representation passed through BERT's pooling layer
        return self.out(self.drop(output.pooler_output))


class MemeDataset(Dataset):
    """Wraps a DataFrame so each row yields tokenized text plus its label."""

    def __init__(self, df, tokenizer, max_len, label_col):
        self.df = df.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.label_col = label_col  # dynamic label column, chosen per task

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        text = str(self.df.loc[index, 'text_clean'])
        label = int(self.df.loc[index, self.label_col])
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            # flatten() drops the batch dimension added by return_tensors='pt'
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long),
        }
```
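As a quick sanity check, here is a minimal sketch of how the two classes above might be wired together. The DataFrame contents, the `label` column name, `n_classes=3`, `max_len=128`, and the batch size are illustrative assumptions, not values taken from the source.

```python
# Illustrative usage sketch; assumes the classes above are in scope
# (e.g. imported from src.nlp_utils). All data shown is made up.
import pandas as pd
import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer

df = pd.DataFrame({
    "text_clean": ["que risa este meme", "no entiendo el chiste"],
    "label": [1, 0],  # hypothetical task labels
})

tokenizer = AutoTokenizer.from_pretrained("dccuchile/bert-base-spanish-wwm-cased")
dataset = MemeDataset(df, tokenizer, max_len=128, label_col="label")
loader = DataLoader(dataset, batch_size=2, shuffle=True)

model = BetoClassifier(n_classes=3)  # assumed class count for this sketch
model.eval()
with torch.no_grad():
    batch = next(iter(loader))
    logits = model(batch["input_ids"], batch["attention_mask"])
    print(logits.shape)  # torch.Size([2, 3]): one logit vector per example
```

Because `label_col` is a constructor argument, the same `MemeDataset` can serve several tasks over one DataFrame by pointing it at a different label column per task.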