annabellatian committed on
Commit
7491ff5
1 Parent(s): 68ecba8

Upload eval_pipeline.py

Files changed (1)
  1. eval_pipeline.py +159 -76
eval_pipeline.py CHANGED
@@ -1,86 +1,169 @@
 import pandas as pd
 from sklearn.model_selection import train_test_split
-from google.colab import drive
 import torch
 from torch.utils.data import Dataset, DataLoader
 from transformers import BertTokenizer, BertForSequenceClassification, AdamW
-from sklearn.metrics import accuracy_score, classification_report
 
-dataset_path = ""
 model_path = ""
-
-news_df = pd.read_csv(dataset_path)
-
-X = news_df['title']
-y = news_df['labels']
-
-X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
-X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=1)  # 0.25 x 0.8 = 0.2
-
-tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-
-
-def tokenize_data(texts, tokenizer, max_len=128):
-    return tokenizer(
-        list(texts),
-        padding=True,
-        truncation=True,
-        max_length=max_len,
-        return_tensors="pt"
-    )
-
-# Tokenize the training and test datasets
-train_encodings = tokenize_data(X_train, tokenizer)
-test_encodings = tokenize_data(X_test, tokenizer)
-
-# Create a custom Dataset class
-class NewsDataset(Dataset):
-    def __init__(self, encodings, labels):
-        self.encodings = encodings
-        self.labels = labels
-
-    def __len__(self):
-        return len(self.labels)
-
-    def __getitem__(self, idx):
-        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
-        item['labels'] = torch.tensor(self.labels[idx])
-        return item
-
-train_dataset = NewsDataset(train_encodings, y_train.tolist())
-test_dataset = NewsDataset(test_encodings, y_test.tolist())
-
-# Load DataLoader for batching
-train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
-test_loader = DataLoader(test_dataset, batch_size=16)
-
-model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
-model.load_state_dict(torch.load(model_path))
-
-device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
-model.to(device)
-
-# Define optimizer and scheduler
-# optimizer = AdamW(model.parameters(), lr=5e-5)
-# num_training_steps = len(train_loader) * 4  # Assume 4 epochs
-# lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)
-
-# Evaluate the model
-def evaluate_model(model, test_loader):
-    model.eval()
-    y_true, y_pred = [], []
-    with torch.no_grad():
-        for batch in test_loader:
-            batch = {k: v.to(device) for k, v in batch.items()}
-            outputs = model(**batch)
-            logits = outputs.logits
-            predictions = torch.argmax(logits, dim=-1)
-            y_true.extend(batch['labels'].tolist())
-            y_pred.extend(predictions.tolist())
-    return y_true, y_pred
-
-y_true, y_pred = evaluate_model(model, test_loader)
 
 # Print evaluation metrics
 print(f"Accuracy: {accuracy_score(y_true, y_pred):.4f}")
-print("Classification Report:\n", classification_report(y_true, y_pred))
 
 import pandas as pd
 from sklearn.model_selection import train_test_split
+from sklearn.metrics import accuracy_score, classification_report
 import torch
 from torch.utils.data import Dataset, DataLoader
 from transformers import BertTokenizer, BertForSequenceClassification, AdamW
+from transformers import get_scheduler
+# from google.colab import drive
+from datasets import load_dataset
 
+data_path = ""
 model_path = ""
+data_files = {"train": "train_data.csv", "validation": "val_data.csv", "test": "test_data.csv"}
+
+dataset_train = load_dataset(data_path, data_files=data_files, split="train")
+dataset_val = load_dataset(data_path, data_files=data_files, split="validation")
+dataset_test = load_dataset(data_path, data_files=data_files, split="test")
+
+train_loader = DataLoader(dataset_train, batch_size=16, shuffle=True)
+test_loader = DataLoader(dataset_test, batch_size=16)
+
+class CustomModel:
+    def __init__(self, model_name="bert-base-uncased", num_labels=2, lr=5e-5, epochs=4, max_len=128):
+        """
+        Initialize the custom model with tokenizer, optimizer, scheduler, and training parameters.
+
+        Args:
+            model_name (str): Name of the pretrained BERT model.
+            num_labels (int): Number of labels for the classification task.
+            lr (float): Learning rate for the optimizer.
+            epochs (int): Number of epochs for training.
+            max_len (int): Maximum token length for sequences.
+        """
+        self.model_name = model_name
+        self.num_labels = num_labels
+        self.epochs = epochs
+        self.max_len = max_len
+
+        # Load tokenizer and model
+        self.tokenizer = BertTokenizer.from_pretrained(model_name)
+        self.model = BertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
+
+        # Define optimizer
+        self.optimizer = AdamW(self.model.parameters(), lr=lr)
+
+        # Scheduler placeholder, created later by setup_scheduler()
+        self.scheduler = None
+
+        # Device setup
+        self.device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
+        self.model.to(self.device)
+
+    def setup_scheduler(self, train_loader):
+        """
+        Set up a learning rate scheduler based on the training data.
+
+        Args:
+            train_loader (DataLoader): Training data loader.
+        """
+        num_training_steps = len(train_loader) * self.epochs
+        self.scheduler = get_scheduler(
+            "linear", optimizer=self.optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
+        )
+
+    def tokenize_batch(self, texts):
+        """
+        Tokenize a batch of text inputs.
+
+        Args:
+            texts (list[str]): List of text strings to tokenize.
+
+        Returns:
+            dict: Tokenized inputs with attention masks and input IDs.
+        """
+        return self.tokenizer(
+            texts,
+            padding=True,
+            truncation=True,
+            max_length=self.max_len,
+            return_tensors="pt"
+        )
+
+    def train(self, train_loader):
+        """
+        Train the model with raw text inputs and labels.
+
+        Args:
+            train_loader (DataLoader): Training data loader containing text and labels.
+        """
+        self.model.train()
+        for epoch in range(self.epochs):
+            epoch_loss = 0
+            for batch in train_loader:
+                texts, labels = batch['title'], batch['labels']  # each batch is a dict with raw titles and a label tensor
+                labels = labels.to(self.device)
+
+                # Tokenize the batch
+                tokenized_inputs = self.tokenize_batch(texts)
+                tokenized_inputs = {key: val.to(self.device) for key, val in tokenized_inputs.items()}
+                tokenized_inputs['labels'] = labels
+
+                # Forward pass and optimization
+                outputs = self.model(**tokenized_inputs)
+                loss = outputs.loss
+                loss.backward()
+                self.optimizer.step()
+                if self.scheduler is not None: self.scheduler.step()  # no-op until setup_scheduler() is called
+                self.optimizer.zero_grad()
+                epoch_loss += loss.item()
+            print(f"Epoch {epoch + 1}/{self.epochs}, Loss: {epoch_loss / len(train_loader):.4f}")
+
+    def evaluate(self, test_loader):
+        """
+        Evaluate the model with raw text inputs and labels.
+
+        Args:
+            test_loader (DataLoader): Test data loader containing text and labels.
+
+        Returns:
+            Tuple: True labels and predicted labels.
+        """
+        self.model.eval()
+        y_true, y_pred = [], []
+        with torch.no_grad():
+            for batch in test_loader:
+                texts, labels = batch['title'], batch['labels']  # each batch is a dict with raw titles and a label tensor
+                labels = labels.to(self.device)
+
+                # Tokenize the batch
+                tokenized_inputs = self.tokenize_batch(texts)
+                tokenized_inputs = {key: val.to(self.device) for key, val in tokenized_inputs.items()}
+
+                # Forward pass
+                outputs = self.model(**tokenized_inputs)
+                logits = outputs.logits
+                predictions = torch.argmax(logits, dim=-1)
+                y_true.extend(labels.tolist())
+                y_pred.extend(predictions.tolist())
+        return y_true, y_pred
+
+    def save_model(self, save_path):
+        """
+        Save the model locally in Hugging Face format.
+
+        Args:
+            save_path (str): Path to save the model.
+        """
+        self.model.save_pretrained(save_path)
+        self.tokenizer.save_pretrained(save_path)
+
+    def push_model(self, repo_name):
+        """
+        Push the model to the Hugging Face Hub.
+
+        Args:
+            repo_name (str): Repository name on Hugging Face Hub.
+        """
+        self.model.push_to_hub(repo_name)
+        self.tokenizer.push_to_hub(repo_name)
+
+custom_model = CustomModel(model_name=model_path, num_labels=2, lr=5e-5, epochs=4)
+# custom_model.setup_scheduler(train_loader)
+# custom_model.train(train_loader)
+y_true, y_pred = custom_model.evaluate(test_loader)
 
 # Print evaluation metrics
 print(f"Accuracy: {accuracy_score(y_true, y_pred):.4f}")
+print("Classification Report:\n", classification_report(y_true, y_pred))
+
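For context, a minimal sketch of how the refactored pipeline might be driven once the empty placeholders are filled in. Every concrete value below (the "csv" builder, the file name, the checkpoint directory) is an illustrative assumption, not part of the commit; CustomModel is the class defined in the diff above.

# Hypothetical driver script; all paths and names are placeholders.
from torch.utils.data import DataLoader
from datasets import load_dataset
from sklearn.metrics import accuracy_score, classification_report

# The commit leaves data_path empty; one option is the "csv" builder with explicit files.
dataset_test = load_dataset("csv", data_files={"test": "test_data.csv"}, split="test")
test_loader = DataLoader(dataset_test, batch_size=16)

# Point model_name at a directory written earlier by save_model() (assumed name).
custom_model = CustomModel(model_name="./bert-news-checkpoint", num_labels=2)
y_true, y_pred = custom_model.evaluate(test_loader)
print(f"Accuracy: {accuracy_score(y_true, y_pred):.4f}")
print("Classification Report:\n", classification_report(y_true, y_pred))

To fine-tune before evaluating, call setup_scheduler(train_loader) ahead of train(train_loader), since train() steps the scheduler that setup_scheduler() creates.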