```python
# src/nlp_utils.py
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from transformers import AutoModel


class BetoClassifier(nn.Module):
    """BETO (Spanish BERT) encoder with a dropout + linear classification head."""

    def __init__(self, n_classes, model_name="dccuchile/bert-base-spanish-wwm-cased"):
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        self.drop = nn.Dropout(p=0.3)
        self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # pooler_output: the [CLS] representation passed through BERT's pooling layer
        return self.out(self.drop(output.pooler_output))


class MemeDataset(Dataset):
    """Wraps a DataFrame so each row yields tokenized text plus its label."""

    def __init__(self, df, tokenizer, max_len, label_col):
        self.df = df.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.label_col = label_col  # dynamic label column, chosen per task

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        text = str(self.df.loc[index, 'text_clean'])
        label = int(self.df.loc[index, self.label_col])
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            # flatten() drops the batch dimension added by return_tensors='pt'
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long),
        }
```
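As a quick sanity check, here is a minimal sketch of how the two classes above might be wired together. The DataFrame contents, the `label` column name, `n_classes=3`, `max_len=128`, and the batch size are illustrative assumptions, not values taken from the source.

```python
# Illustrative usage sketch; assumes the classes above are in scope
# (e.g. imported from src.nlp_utils). All data shown is made up.
import pandas as pd
import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer

df = pd.DataFrame({
    "text_clean": ["que risa este meme", "no entiendo el chiste"],
    "label": [1, 0],  # hypothetical task labels
})

tokenizer = AutoTokenizer.from_pretrained("dccuchile/bert-base-spanish-wwm-cased")
dataset = MemeDataset(df, tokenizer, max_len=128, label_col="label")
loader = DataLoader(dataset, batch_size=2, shuffle=True)

model = BetoClassifier(n_classes=3)  # assumed class count for this sketch
model.eval()
with torch.no_grad():
    batch = next(iter(loader))
    logits = model(batch["input_ids"], batch["attention_mask"])
    print(logits.shape)  # torch.Size([2, 3]): one logit vector per example
```

Because `label_col` is a constructor argument, the same `MemeDataset` can serve several tasks over one DataFrame by pointing it at a different label column per task.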