# Source: deployment_final_project / training_bert.py
# Uploaded by: dfinel
# Last commit: 514c521 (verified) - "Update training_bert.py"
import pandas as pd
import numpy as np
import re
from transformers import Trainer
import torch
from sklearn.model_selection import GroupShuffleSplit
from transformers import AutoTokenizer,AutoModelForSequenceClassification
from transformers import TrainingArguments
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
def remove_links(review):
    """Strip http:// and https:// URLs from a review.

    Args:
        review: Review text. Non-string values (for example the NaN pandas
            uses for a missing cell, or None) are returned unchanged
            instead of raising.

    Returns:
        The text with every URL removed, or ``review`` itself when it is
        not a string.
    """
    if not isinstance(review, str):
        # re.sub raises TypeError on NaN/None, which would abort Series.apply.
        return review
    # A URL is taken to run from the scheme up to the next whitespace.
    pattern = r'\bhttps?://\S+'
    return re.sub(pattern, '', review)
# Load the raw reviews, then clean the free-text column in a single chain:
# first drop anything shaped like an HTML tag, then remove URLs.
df = pd.read_csv('/Users/danfinel/Downloads/Reviews.csv')
df['Text'] = (
    df['Text']
    .str.replace(r'<[^>]*>', '', regex=True)
    .apply(remove_links)
)
# Stage 1: split the first 100000 rows into train and a held-out remainder
# (test_size=.40), using ProductId as the grouping key.
splitter_temp = GroupShuffleSplit(n_splits=1, test_size=.40, random_state=42)
split_temp = splitter_temp.split(df[:100000], groups=df[:100000]['ProductId'])
train_inds, temp_inds = next(split_temp)
train, temp = df.iloc[train_inds], df.iloc[temp_inds]

# Stage 2: halve the held-out remainder into validation and test, again
# grouped by ProductId.
splitter_val = GroupShuffleSplit(n_splits=1, test_size=.50, random_state=42)
split_val = splitter_val.split(temp, groups=temp['ProductId'])
val_inds, test_inds = next(split_val)
val, test = temp.iloc[val_inds], temp.iloc[test_inds]
# Separate the target column (Score) from the remaining columns per split.
X_train, y_train = train.drop(columns='Score'), train['Score']
X_val, y_val = val.drop(columns='Score'), val['Score']
X_test, y_test = test.drop(columns='Score'), test['Score']
# --- Run configuration -----------------------------------------------------
base_model = 'bert-base-cased'   # checkpoint name given to from_pretrained

# Data-side settings.
max_length = 64
nbr_samples = 10000              # row cap applied to the train and val frames

# Optimisation settings consumed by TrainingArguments.
learning_rate = 2e-5
batch_size = 32
epochs = 5

# Tokenizer and model come from the same checkpoint; num_labels=1 requests a
# single output value per example.
tokenizer_regr = AutoTokenizer.from_pretrained(base_model)
model_regr = AutoModelForSequenceClassification.from_pretrained(
    base_model,
    num_labels=1,
)
# Build the frames handed to `datasets`: the first nbr_samples rows of each
# split, without ProductId, plus the score as a float `label` column.
# .copy() makes each frame independent of X_train / X_val, so deleting and
# adding columns below operates on a real frame rather than on a slice of the
# parent (which pandas flags with SettingWithCopyWarning).
X_train_bert = X_train.iloc[:nbr_samples].copy()
del X_train_bert['ProductId']
X_train_bert['label'] = y_train.iloc[:nbr_samples].astype(float)

X_val_bert = X_val.iloc[:nbr_samples].copy()
del X_val_bert['ProductId']
X_val_bert['label'] = y_val.iloc[:nbr_samples].astype(float)
from datasets import Dataset
# Wrap the train and validation frames as `datasets.Dataset` objects
# (train first, then validation - same order as the targets on the left).
ds_train_regr, ds_val_regr = (
    Dataset.from_pandas(frame) for frame in (X_train_bert, X_val_bert)
)
def preprocess_function_regr(examples):
    """Tokenize the ``"Text"`` entry of ``examples`` with ``tokenizer_regr``.

    Sequences are truncated and padded to the module-level ``max_length`` so
    every example ends up with the same length.

    Args:
        examples: Mapping with a ``"Text"`` entry, as supplied by
            ``Dataset.map``.

    Returns:
        Whatever ``tokenizer_regr`` returns for that text.
    """
    # Use the shared max_length setting instead of repeating the literal 64,
    # so changing the constant actually changes the tokenisation.
    return tokenizer_regr(
        examples["Text"],
        truncation=True,
        max_length=max_length,
        padding='max_length',
    )
# Tokenize both datasets and drop the raw text column afterwards.
# batched=True hands the tokenizer a list of texts per call instead of one
# text at a time; the resulting columns are the same, with far fewer calls.
ds_train_regr_tok = ds_train_regr.map(
    preprocess_function_regr,
    batched=True,
    remove_columns=['Text'],
)
ds_val_regr_tok = ds_val_regr.map(
    preprocess_function_regr,
    batched=True,
    remove_columns=['Text'],
)
def compute_metrics_for_regression(eval_pred):
    """Compute regression metrics for one evaluation pass.

    Args:
        eval_pred: Pair ``(predictions, labels)``.

    Returns:
        Dict with ``"mse"``, ``"mae"``, ``"r2"`` and ``"accuracy"``, where
        accuracy is the fraction of predictions whose squared error is
        below 0.25 (absolute error below 0.5).
    """
    logits, labels = eval_pred
    # Force BOTH arrays to column vectors. If predictions arrived as (N,)
    # while labels are (N, 1), the subtraction below would broadcast to
    # (N, N) and silently corrupt the accuracy figure.
    logits = np.asarray(logits).reshape(-1, 1)
    labels = np.asarray(labels).reshape(-1, 1)

    mse = mean_squared_error(labels, logits)
    mae = mean_absolute_error(labels, logits)
    r2 = r2_score(labels, logits)

    # Vectorised hit rate: squared error < 0.25 is the same as |error| < 0.5.
    squared_errors = (logits - labels) ** 2
    accuracy = float(np.mean(squared_errors < 0.25))

    return {"mse": mse, "mae": mae, "r2": r2, "accuracy": accuracy}
# Trainer output goes to the parent directory.
output_dir = ".."
training_args = TrainingArguments(
    output_dir=output_dir,
    # Optimisation settings.
    num_train_epochs=epochs,
    learning_rate=learning_rate,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # Evaluate and save on the same "epoch" schedule, and reload the best
    # checkpoint at the end, ranked by the "accuracy" entry that
    # compute_metrics_for_regression returns.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)
class RegressionTrainer(Trainer):
    """Trainer variant whose loss is the mean-squared error between the
    model's first output column and the labels.

    NOTE(review): the script below instantiates the stock ``Trainer``, so
    this class is currently unused - confirm whether that is intended.
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Return the MSE loss for one batch.

        Args:
            model: Model to run; called as ``model(**inputs)``.
            inputs: Batch dict. Its ``"labels"`` entry is popped (the dict is
                mutated) so the model is called without labels.
            return_outputs: When true, return ``(loss, outputs)``.
            num_items_in_batch: Accepted, and ignored, because newer
                ``Trainer`` versions pass it to ``compute_loss``; without it
                those versions fail with a TypeError.

        Returns:
            The loss tensor, or ``(loss, outputs)`` if ``return_outputs``.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        # First element of the outputs, first column: one value per example.
        logits = outputs[0][:, 0]
        loss = torch.nn.functional.mse_loss(logits, labels)
        return (loss, outputs) if return_outputs else loss
# NOTE(review): the stock Trainer is used here, not the RegressionTrainer
# defined earlier in this file - confirm that this is intended.
trainer = Trainer(
    model=model_regr,
    args=training_args,
    train_dataset=ds_train_regr_tok,
    eval_dataset=ds_val_regr_tok,
    compute_metrics=compute_metrics_for_regression,
)
trainer.train()

# Save tokenizer and model side by side in the current directory.
# The former `from_pt=True` argument was dropped: it is an option for loading
# weights, not for saving them, so it had no meaning here.
tokenizer_regr.save_pretrained('.')
model_regr.save_pretrained('.')