import re

import pandas as pd
import torch
from datasets import Dataset
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GroupShuffleSplit
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)

def remove_links(review):
    # Strip http/https URLs, which carry no useful sentiment signal.
    pattern = r'\bhttps?://\S+'
    return re.sub(pattern, '', review)
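# Example: remove_links("see https://example.com for details") -> "see  for details"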

# Reviews.csv is expected to contain at least 'Text', 'Score' and 'ProductId'
# columns (e.g. the Amazon Fine Food Reviews dataset).
df = pd.read_csv('/Users/danfinel/Downloads/Reviews.csv')
df['Text'] = df['Text'].str.replace(r'<[^>]*>', '', regex=True)  # drop HTML tags
df['Text'] = df['Text'].apply(remove_links)


# Split on ProductId groups so reviews of the same product never leak across
# train/validation/test. First carve 40% of the first 100k rows into a temp pool...
splitter_temp = GroupShuffleSplit(test_size=0.40, n_splits=1, random_state=42)
split_temp = splitter_temp.split(df[:100000], groups=df[:100000]['ProductId'])
train_inds, temp_inds = next(split_temp)

train = df.iloc[train_inds]
temp = df.iloc[temp_inds]

# ...then split the temp pool 50/50 into validation and test.
splitter_val = GroupShuffleSplit(test_size=0.50, n_splits=1, random_state=42)
split_val = splitter_val.split(temp, groups=temp['ProductId'])
val_inds, test_inds = next(split_val)

val = temp.iloc[val_inds]
test = temp.iloc[test_inds]

X_train = train.drop(columns='Score')
y_train = train.Score

X_val = val.drop(columns='Score')
y_val = val.Score

X_test = test.drop(columns='Score')
y_test = test.Score
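
# Optional sanity check: GroupShuffleSplit keeps every ProductId on a single
# side of each split, so the three sets should share no products.
assert set(train['ProductId']).isdisjoint(temp['ProductId'])
assert set(val['ProductId']).isdisjoint(test['ProductId'])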


base_model = 'bert-base-cased'
learning_rate = 2e-5
max_length = 64       # tokens per review after truncation/padding
batch_size = 32
epochs = 5
nbr_samples = 10000   # subsample for faster fine-tuning
tokenizer_regr = AutoTokenizer.from_pretrained(base_model)
# num_labels=1 puts a single regression head on top of BERT.
model_regr = AutoModelForSequenceClassification.from_pretrained(base_model, num_labels=1)

# .copy() avoids pandas' SettingWithCopyWarning when mutating the slices below.
X_train_bert = X_train.iloc[:nbr_samples].copy()
del X_train_bert['ProductId']
X_train_bert['label'] = y_train.iloc[:nbr_samples].astype(float)

X_val_bert = X_val.iloc[:nbr_samples].copy()
del X_val_bert['ProductId']
X_val_bert['label'] = y_val.iloc[:nbr_samples].astype(float)

# preserve_index=False keeps the stray pandas index out of the dataset.
ds_train_regr = Dataset.from_pandas(X_train_bert, preserve_index=False)
ds_val_regr = Dataset.from_pandas(X_val_bert, preserve_index=False)

def preprocess_function_regr(examples):
    # Tokenize with the max_length configured above rather than a hardcoded value.
    return tokenizer_regr(examples["Text"], truncation=True, max_length=max_length, padding='max_length')

ds_train_regr_tok = ds_train_regr.map(preprocess_function_regr, remove_columns=['Text'])
ds_val_regr_tok = ds_val_regr.map(preprocess_function_regr, remove_columns=['Text'])



def compute_metrics_for_regression(eval_pred):
    logits, labels = eval_pred
    labels = labels.reshape(-1, 1)

    mse = mean_squared_error(labels, logits)
    mae = mean_absolute_error(labels, logits)
    r2 = r2_score(labels, logits)
    # "Accuracy" here counts predictions within 0.5 stars of the true score
    # (i.e. squared error < 0.25).
    single_squared_errors = ((logits - labels).flatten() ** 2).tolist()
    accuracy = sum(1 for e in single_squared_errors if e < 0.25) / len(single_squared_errors)

    return {"mse": mse, "mae": mae, "r2": r2, "accuracy": accuracy}



output_dir = ".."

training_args = TrainingArguments(
    output_dir=output_dir,
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epochs,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    metric_for_best_model="accuracy",   # the within-0.5-stars metric above
    load_best_model_at_end=True,
    weight_decay=0.01,
)

class RegressionTrainer(Trainer):
    # **kwargs absorbs extra arguments (e.g. num_items_in_batch) that newer
    # transformers versions pass to compute_loss.
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        # Plain MSE between the single regression logit and the star score.
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs[0][:, 0]
        loss = torch.nn.functional.mse_loss(logits, labels)
        return (loss, outputs) if return_outputs else loss
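
# Note: with num_labels=1, recent transformers versions already fall back to
# an MSE loss internally (problem_type="regression"), so this subclass mainly
# makes the regression objective explicit.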

# Use the custom RegressionTrainer so the MSE loss above is actually applied.
trainer = RegressionTrainer(
    model=model_regr,
    args=training_args,
    train_dataset=ds_train_regr_tok,
    eval_dataset=ds_val_regr_tok,
    compute_metrics=compute_metrics_for_regression,
)

trainer.train()

tokenizer_regr.save_pretrained('.')
model_regr.save_pretrained('.')
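
# A minimal inference sketch (not part of the original pipeline; assumes the
# fine-tuned model_regr and tokenizer_regr above): feed one review through the
# model and read the single regression logit as the predicted star score.
def predict_score(text, model=model_regr, tokenizer=tokenizer_regr):
    model.eval()
    device = next(model.parameters()).device
    inputs = tokenizer(text, truncation=True, max_length=max_length,
                       padding='max_length', return_tensors='pt').to(device)
    with torch.no_grad():
        logits = model(**inputs).logits
    return logits[0, 0].item()  # a float roughly in the 1-5 star range

# Example usage:
# predict_score("Great coffee, would definitely buy again!")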