BERTIN-roBERTa-base-Spanish_SemEval18_Emodetection
This is a BERTIN-roBERTa-base-Spanish model fine-tuned on ~3,500 Spanish tweets annotated with 11 emotion categories, taken from SemEval-2018 Task 1 (Affect in Tweets), Subtask 5: Emotion Classification (E-c).
Run the classifier on the test set of the competition:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import DataLoader
import torch
import pandas as pd
# Run on the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Tokenizer of the base BERTIN checkpoint; sequences are capped at 512 tokens.
tokenizer = AutoTokenizer.from_pretrained(
    "bertin-project/bertin-roberta-base-spanish",
    model_max_length=512,
)
class RobertaClass(torch.nn.Module):
    """BERTIN RoBERTa encoder with a dropout + linear head for 11 labels.

    The attribute names ``l1``/``l2``/``l3`` must not be renamed: they define
    the keys of the state dict that is loaded from the saved checkpoint.
    """

    def __init__(self):
        super().__init__()
        # return_dict=False makes the encoder return a plain tuple, which
        # forward() unpacks positionally.
        self.l1 = AutoModel.from_pretrained(
            "bertin-project/bertin-roberta-base-spanish",
            return_dict=False,
        )
        self.l2 = torch.nn.Dropout(0.3)
        # 768 encoder features in -> one score per emotion category out.
        self.l3 = torch.nn.Linear(768, 11)

    def forward(self, input_ids, attention_mask):
        """Return one un-normalised score per label for each input sequence.

        Args:
            input_ids: token ids produced by the tokenizer.
            attention_mask: mask marking real tokens versus padding.

        Returns:
            The output of the final linear layer; no activation is applied
            here, so callers get raw logits.
        """
        # Only the second element of the encoder's tuple output is used (the
        # pooled representation, per the Transformers RobertaModel docs).
        _, output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        output_2 = self.l2(output_1)
        output = self.l3(output_2)
        return output
# Path to the fine-tuned weights (a plain state dict saved with torch.save).
model_name = "bertin-roberta-base-spanish_semeval18_emodetection/pytorch_model.bin"

model = RobertaClass()
# Load the weights before wrapping in DataParallel: the wrapper would prefix
# every parameter name with "module." and the checkpoint keys would not match.
checkpoint_location = torch.device(device)
model.load_state_dict(torch.load(model_name, map_location=checkpoint_location))
model.eval()  # inference mode: disables dropout

model = torch.nn.DataParallel(model)
model.to(device)
# Names of the 11 emotion label columns in the dataset.
twnames = [
    'anger', 'anticipation', 'disgust', 'fear', 'joy', 'love',
    'optimism', 'pessimism', 'sadness', 'surprise', 'trust',
]

testset_raw = load_dataset(
    'sem_eval_2018_task_1',
    'subtask5.spanish',
    split='test',
)

# Keep only the tweet text: drop the label columns and the ID column.
columns_to_drop = twnames + ["ID"]
testset = testset_raw.remove_columns(columns_to_drop)

# Tokenize in batches, padding/truncating every tweet to the tokenizer's
# maximum length, then discard the raw text column.
testset_tokenized = testset.map(
    lambda e: tokenizer(e['Tweet'], truncation=True, padding='max_length'),
    batched=True,
).remove_columns("Tweet")

# Expose the model inputs as torch tensors.
testset_tokenized.set_format(type='torch', columns=['input_ids', 'attention_mask'])
# Tab-separated output file: one row per tweet, one column per emotion label.
outfile = "predicted_2018-E-c-Es-test-gold.txt"
MAX_LEN = 512  # not referenced by the code below; kept for compatibility
VALID_BATCH_SIZE = 8

inference_params = {
    'batch_size': VALID_BATCH_SIZE,
    'shuffle': False,  # keep predictions in the same order as the test set
}
inference_loader = DataLoader(testset_tokenized, **inference_params)

# Start from an empty file, because each batch below is appended to it.
with open(outfile, "w"):
    pass

# No gradients are needed for inference.
with torch.no_grad():
    for data in inference_loader:
        outputs = model(
            input_ids=data['input_ids'],
            attention_mask=data['attention_mask'],
        )
        # Sigmoid turns each logit into an independent per-label probability.
        fin_outputs = torch.sigmoid(outputs).cpu().numpy().tolist()
        pd.DataFrame(fin_outputs).to_csv(
            outfile,
            index=False,
            header=False,
            sep="\t",
            mode='a',
        )