import os

os.environ["TOKENIZERS_PARALLELISM"] = "false"  # silence the tokenizers fork warning
os.environ["WANDB_DISABLED"] = "true"           # disable Weights & Biases logging
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from transformers import (
    AutoTokenizer,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
    AutoModelForSequenceClassification,
)
from datasets import Dataset
#######################################
########## FinBERT training ###########
#######################################
class args:
    # lightweight namespace holding the base checkpoint to fine-tune
    model = 'ProsusAI/finbert'
# The CSV has no header row: the first column is the sentiment label,
# the second the sentence (Financial PhraseBank-style data).
df = pd.read_csv('all-data.csv',
                 names=['labels', 'messages'],
                 encoding='ISO-8859-1')
df = df[['messages', 'labels']]

# Encode the string labels as integers. LabelEncoder sorts alphabetically,
# so for negative/neutral/positive data the mapping is 0/1/2.
le = LabelEncoder()
df['labels'] = le.fit_transform(df['labels'])
# Hold out 10% for testing, then 20% of the remainder for validation,
# i.e. roughly a 72/18/10 train/validation/test split.
X, y = df['messages'].values, df['labels'].values
xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size=0.1)
xtrain, xvalid, ytrain, yvalid = train_test_split(xtrain, ytrain, test_size=0.2)
train_dataset_raw = Dataset.from_dict({'text': xtrain, 'labels': ytrain})
valid_dataset_raw = Dataset.from_dict({'text': xvalid, 'labels': yvalid})

tokenizer = AutoTokenizer.from_pretrained(args.model)

def tokenize_fn(examples):
    # truncation=True caps sequences at the model's maximum length (512 for BERT)
    return tokenizer(examples['text'], truncation=True)

train_dataset = train_dataset_raw.map(tokenize_fn, batched=True)
valid_dataset = valid_dataset_raw.map(tokenize_fn, batched=True)

# Pad dynamically per batch instead of padding everything to a fixed length.
data_collator = DataCollatorWithPadding(tokenizer)
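# Illustrative sanity check (not part of the original script): the collator
# pads each batch only to the longest sequence in that batch.
sample = [{k: train_dataset[i][k] for k in ('input_ids', 'attention_mask', 'labels')}
          for i in range(4)]
print(data_collator(sample)['input_ids'].shape)  # (4, longest_sequence_in_this_batch)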
# ProsusAI/finbert already ships a 3-way sentiment head, matching the three
# encoded label classes here; the head is re-tuned during training.
model = AutoModelForSequenceClassification.from_pretrained(args.model)
train_args = TrainingArguments(
    './Finbert Trained/',
    per_device_train_batch_size=16,
    per_device_eval_batch_size=2 * 16,
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    warmup_ratio=0.1,
    do_eval=True,
    do_train=True,
    do_predict=True,
    evaluation_strategy='epoch',
    save_strategy="no",
)
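# Optional addition (not in the original script): a compute_metrics hook so
# each per-epoch evaluation reports accuracy alongside the loss. To enable it,
# pass compute_metrics=compute_metrics to the Trainer below.
import numpy as np
from sklearn.metrics import accuracy_score

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {'accuracy': accuracy_score(labels, preds)}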
trainer = Trainer(
    model=model,
    args=train_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
)
trainer.train()
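# The split above creates xtest/ytest but never evaluates on them (despite
# do_predict=True); a held-out evaluation could look like this sketch:
test_dataset = Dataset.from_dict({'text': xtest, 'labels': ytest}).map(tokenize_fn, batched=True)
print(trainer.evaluate(test_dataset))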
# save the fine-tuned model weights and config
model.save_pretrained('fine_tuned_FinBERT')
# save the tokenizer (note: to a subdirectory of the model folder)
tokenizer.save_pretrained("fine_tuned_FinBERT/tokenizer/")
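# Inference sketch (illustrative, not part of the original script). The
# tokenizer was saved to a subdirectory above, so it is loaded from there.
# Decoding the argmax index through the LabelEncoder avoids relying on the
# saved config's id2label names, which still come from the base checkpoint
# and may not match the alphabetical encoding used during training.
import torch

ft_tokenizer = AutoTokenizer.from_pretrained('fine_tuned_FinBERT/tokenizer/')
ft_model = AutoModelForSequenceClassification.from_pretrained('fine_tuned_FinBERT')

inputs = ft_tokenizer('Quarterly revenue rose 12% year over year.', return_tensors='pt')
with torch.no_grad():
    logits = ft_model(**inputs).logits
pred = logits.argmax(dim=-1).item()
print(le.inverse_transform([pred])[0])  # map the index back to the original string label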