"""Fine-tune a BERT sequence-classification head as a single-output regressor.

The script reads a reviews CSV, strips HTML tags and URLs from the ``Text``
column, makes a grouped train/validation/test split keyed on ``ProductId``,
tokenizes the text and fine-tunes ``bert-base-cased`` with ``num_labels=1``
to predict the ``Score`` column.  The tokenizer and the model are written to
the current directory once training has finished.
"""
import re

import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import GroupShuffleSplit
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import Trainer
from transformers import TrainingArguments


def remove_links(review):
    """Return *review* with every ``http://`` / ``https://`` URL removed.

    Args:
        review: The review text (a ``str``).

    Returns:
        The text with each URL (scheme up to the next whitespace) deleted.
    """
    pattern = r'\bhttps?://\S+'
    return re.sub(pattern, '', review)


# ---------------------------------------------------------------------------
# Data loading and cleaning
# ---------------------------------------------------------------------------
df = pd.read_csv('/Users/danfinel/Downloads/Reviews.csv')
# Drop anything that looks like an HTML tag, then strip URLs.
df['Text'] = df['Text'].str.replace(r'<[^>]*>', '', regex=True)
df['Text'] = df['Text'].apply(remove_links)

# ---------------------------------------------------------------------------
# Grouped train / validation / test split
# ---------------------------------------------------------------------------
# Splitting by ProductId keeps all reviews of one product in the same subset.
# Only the first 100000 rows are used; the returned indices are positional,
# so df.iloc[...] addresses the same rows that were handed to the splitter.
splitter_temp = GroupShuffleSplit(test_size=.40, n_splits=1, random_state=42)
split_temp = splitter_temp.split(df[:100000], groups=df[:100000]['ProductId'])
train_inds, temp_inds = next(split_temp)

train = df.iloc[train_inds]
temp = df.iloc[temp_inds]

# The held-out part is split in half (again by product) into val and test.
splitter_val = GroupShuffleSplit(test_size=.50, n_splits=1, random_state=42)
split_val = splitter_val.split(temp, groups=temp['ProductId'])
val_inds, test_inds = next(split_val)

val = temp.iloc[val_inds]
test = temp.iloc[test_inds]

X_train = train.drop(columns='Score')
y_train = train.Score

X_val = val.drop(columns='Score')
y_val = val.Score

X_test = test.drop(columns='Score')
y_test = test.Score

# ---------------------------------------------------------------------------
# Model configuration
# ---------------------------------------------------------------------------
base_model = 'bert-base-cased'
learning_rate = 2e-5
max_length = 64
batch_size = 32
epochs = 5
nbr_samples = 10000  # upper bound on rows taken from each of train and val

tokenizer_regr = AutoTokenizer.from_pretrained(base_model)
# num_labels=1 gives the classification head a single output unit.
model_regr = AutoModelForSequenceClassification.from_pretrained(
    base_model, num_labels=1
)

# .copy() makes these independent frames, so the column deletion and the
# label assignment below do not operate on a slice of X_train / X_val.
X_train_bert = X_train.iloc[:nbr_samples].copy()
del X_train_bert['ProductId']
X_train_bert['label'] = y_train.iloc[:nbr_samples].astype(float)

X_val_bert = X_val.iloc[:nbr_samples].copy()
del X_val_bert['ProductId']
X_val_bert['label'] = y_val.iloc[:nbr_samples].astype(float)

ds_train_regr = Dataset.from_pandas(X_train_bert)
ds_val_regr = Dataset.from_pandas(X_val_bert)


def preprocess_function_regr(examples):
    """Tokenize the ``Text`` field, truncating/padding to ``max_length``.

    Args:
        examples: A mapping with a ``"Text"`` entry, as passed by
            ``Dataset.map``.

    Returns:
        The tokenizer output for ``examples["Text"]``.
    """
    return tokenizer_regr(
        examples["Text"],
        truncation=True,
        max_length=max_length,  # module-level setting instead of a literal 64
        padding='max_length',
    )


ds_train_regr_tok = ds_train_regr.map(
    preprocess_function_regr, remove_columns=['Text']
)
ds_val_regr_tok = ds_val_regr.map(
    preprocess_function_regr, remove_columns=['Text']
)


def compute_metrics_for_regression(eval_pred):
    """Compute MSE, MAE, R² and a tolerance-based accuracy.

    Args:
        eval_pred: A ``(predictions, labels)`` pair of array-likes.

    Returns:
        A dict with the keys ``"mse"``, ``"mae"``, ``"r2"`` and
        ``"accuracy"``.  ``"accuracy"`` is the fraction of samples whose
        squared error is below 0.25, i.e. whose absolute error is below 0.5.
    """
    logits, labels = eval_pred
    # Force both to column vectors so the subtraction below is element-wise
    # and can never broadcast an (N,) against an (N, 1) into an (N, N) matrix.
    logits = np.asarray(logits).reshape(-1, 1)
    labels = np.asarray(labels).reshape(-1, 1)

    mse = mean_squared_error(labels, logits)
    mae = mean_absolute_error(labels, logits)
    r2 = r2_score(labels, logits)

    squared_errors = (logits - labels).flatten() ** 2
    accuracy = float(np.mean(squared_errors < 0.25))

    return {"mse": mse, "mae": mae, "r2": r2, "accuracy": accuracy}


# NOTE(review): ".." places checkpoints in the parent directory - confirm
# that this is the intended location.
output_dir = ".."

# NOTE(review): newer transformers releases spell `evaluation_strategy` as
# `eval_strategy`; keep whichever name the installed version accepts.
training_args = TrainingArguments(
    output_dir=output_dir,
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epochs,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    metric_for_best_model="accuracy",
    load_best_model_at_end=True,
    weight_decay=0.01,
)


class RegressionTrainer(Trainer):
    """``Trainer`` whose loss is the MSE between the single logit and the label."""

    def compute_loss(self, model, inputs, return_outputs=False,
                     num_items_in_batch=None):
        """Return the mean-squared-error loss for one batch.

        Args:
            model: The model being trained.
            inputs: Batch dict; its ``"labels"`` entry is removed before the
                forward pass so the model does not compute a loss of its own.
            return_outputs: When true, return ``(loss, outputs)``.
            num_items_in_batch: Accepted for compatibility with ``Trainer``
                versions that pass it; not used by this loss.

        Returns:
            The loss tensor, or ``(loss, outputs)`` if ``return_outputs``.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        # First output is the logits; take the only column.
        logits = outputs[0][:, 0]
        loss = torch.nn.functional.mse_loss(logits, labels)
        return (loss, outputs) if return_outputs else loss


# Use the regression trainer defined above so its MSE loss is actually applied
# (a plain Trainer would leave RegressionTrainer unused).
trainer = RegressionTrainer(
    model=model_regr,
    args=training_args,
    train_dataset=ds_train_regr_tok,
    eval_dataset=ds_val_regr_tok,
    compute_metrics=compute_metrics_for_regression,
)

trainer.train()

tokenizer_regr.save_pretrained('.')
# `from_pt` is an option for loading checkpoints with `from_pretrained`; it
# has no meaning when saving, so it is not passed here.
model_regr.save_pretrained('.')