andyqin18 committed on
Commit
90ae92b
1 Parent(s): 9b3af2e

Created a notebook

Browse files
milestone3/.ipynb_checkpoints/finetune_notebook-checkpoint.ipynb ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "id": "80baea1a",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "# 1 Prepare dataset\n",
11
+ "# 2 Load pretrained Tokenizer, call it with dataset -> encoding\n",
12
+ "# 3 Build PyTorch Dataset with encodings\n",
13
+ "# 4 Load pretrained model\n",
14
+ "# 5 a) Load Trainer and train it\n",
15
+ "# b) or use native PyTorch training pipeline\n",
16
+ "from pathlib import Path\n",
17
+ "from sklearn.model_selection import train_test_split\n",
18
+ "import torch\n",
19
+ "from torch.utils.data import Dataset\n",
20
+ "from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification\n",
21
+ "from transformers import Trainer, TrainingArguments\n",
22
+ "\n",
23
+ "model_name = \"distilbert-base-uncased\"\n",
24
+ "\n",
25
+ "def read_imdb_split(split_dir): # helper function to get text and label\n",
26
+ " split_dir = Path(split_dir)\n",
27
+ " texts = []\n",
28
+ " labels = []\n",
29
+ " for label_dir in [\"pos\", \"neg\"]:\n",
30
+ " thres = 0\n",
31
+ " for text_file in (split_dir/label_dir).iterdir():\n",
32
+ " if thres < 100:\n",
33
+ " f = open(text_file, encoding='utf8')\n",
34
+ " texts.append(f.read())\n",
35
+ " labels.append(0 if label_dir == \"neg\" else 1)\n",
36
+ " thres += 1\n",
37
+ "\n",
38
+ " return texts, labels\n",
39
+ "\n",
40
+ "train_texts, train_labels = read_imdb_split(\"aclImdb/train\")\n",
41
+ "test_texts, test_labels = read_imdb_split(\"aclImdb/test\")\n",
42
+ "\n",
43
+ "train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=.2)\n",
44
+ "\n",
45
+ "\n",
46
+ "class IMDBDataset(Dataset):\n",
47
+ " def __init__(self, encodings, labels):\n",
48
+ " self.encodings = encodings\n",
49
+ " self.labels = labels\n",
50
+ "\n",
51
+ " def __getitem__(self, idx):\n",
52
+ " item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}\n",
53
+ " item[\"labels\"] = torch.tensor(self.labels[idx])\n",
54
+ " return item\n",
55
+ " \n",
56
+ " def __len__(self):\n",
57
+ " return len(self.labels)\n",
58
+ " \n",
59
+ "tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)\n",
60
+ "\n",
61
+ "train_encodings = tokenizer(train_texts, truncation=True, padding=True)\n",
62
+ "val_encodings = tokenizer(val_texts, truncation=True, padding=True)\n",
63
+ "test_encodings = tokenizer(test_texts, truncation=True, padding=True)\n",
64
+ "\n",
65
+ "train_dataset = IMDBDataset(train_encodings, train_labels)\n",
66
+ "val_dataset = IMDBDataset(val_encodings, val_labels)\n",
67
+ "test_dataset = IMDBDataset(test_encodings, test_labels)\n",
68
+ "\n",
69
+ "training_args = TrainingArguments(\n",
70
+ " output_dir='./results',\n",
71
+ " num_train_epochs=2,\n",
72
+ " per_device_train_batch_size=16,\n",
73
+ " per_device_eval_batch_size=64,\n",
74
+ " warmup_steps=500,\n",
75
+ " learning_rate=5e-5,\n",
76
+ " weight_decay=0.01,\n",
77
+ " logging_dir='./logs',\n",
78
+ " logging_steps=10\n",
79
+ ")\n",
80
+ "\n",
81
+ "model = DistilBertForSequenceClassification.from_pretrained(model_name)\n",
82
+ "trainer = Trainer(\n",
83
+ " model=model,\n",
84
+ " args=training_args,\n",
85
+ " train_dataset=train_dataset,\n",
86
+ " eval_dataset=val_dataset\n",
87
+ ")\n",
88
+ "\n",
89
+ "trainer.train() \n",
90
+ "\n",
91
+ "\n",
92
+ "\n"
93
+ ]
94
+ }
95
+ ],
96
+ "metadata": {
97
+ "kernelspec": {
98
+ "display_name": "Python 3 (ipykernel)",
99
+ "language": "python",
100
+ "name": "python3"
101
+ },
102
+ "language_info": {
103
+ "codemirror_mode": {
104
+ "name": "ipython",
105
+ "version": 3
106
+ },
107
+ "file_extension": ".py",
108
+ "mimetype": "text/x-python",
109
+ "name": "python",
110
+ "nbconvert_exporter": "python",
111
+ "pygments_lexer": "ipython3",
112
+ "version": "3.10.6"
113
+ }
114
+ },
115
+ "nbformat": 4,
116
+ "nbformat_minor": 5
117
+ }
milestone3/finetune_notebook.ipynb ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "id": "80baea1a",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "# 1 Prepare dataset\n",
11
+ "# 2 Load pretrained Tokenizer, call it with dataset -> encoding\n",
12
+ "# 3 Build PyTorch Dataset with encodings\n",
13
+ "# 4 Load pretrained model\n",
14
+ "# 5 a) Load Trainer and train it\n",
15
+ "# b) or use native PyTorch training pipeline\n",
16
+ "from pathlib import Path\n",
17
+ "from sklearn.model_selection import train_test_split\n",
18
+ "import torch\n",
19
+ "from torch.utils.data import Dataset\n",
20
+ "from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification\n",
21
+ "from transformers import Trainer, TrainingArguments\n",
22
+ "\n",
23
+ "model_name = \"distilbert-base-uncased\"\n",
24
+ "\n",
25
+ "def read_imdb_split(split_dir): # helper function to get text and label\n",
26
+ " split_dir = Path(split_dir)\n",
27
+ " texts = []\n",
28
+ " labels = []\n",
29
+ " for label_dir in [\"pos\", \"neg\"]:\n",
30
+ " thres = 0\n",
31
+ " for text_file in (split_dir/label_dir).iterdir():\n",
32
+ " if thres < 100:\n",
33
+ " f = open(text_file, encoding='utf8')\n",
34
+ " texts.append(f.read())\n",
35
+ " labels.append(0 if label_dir == \"neg\" else 1)\n",
36
+ " thres += 1\n",
37
+ "\n",
38
+ " return texts, labels\n",
39
+ "\n",
40
+ "train_texts, train_labels = read_imdb_split(\"aclImdb/train\")\n",
41
+ "test_texts, test_labels = read_imdb_split(\"aclImdb/test\")\n",
42
+ "\n",
43
+ "train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=.2)\n",
44
+ "\n",
45
+ "\n",
46
+ "class IMDBDataset(Dataset):\n",
47
+ " def __init__(self, encodings, labels):\n",
48
+ " self.encodings = encodings\n",
49
+ " self.labels = labels\n",
50
+ "\n",
51
+ " def __getitem__(self, idx):\n",
52
+ " item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}\n",
53
+ " item[\"labels\"] = torch.tensor(self.labels[idx])\n",
54
+ " return item\n",
55
+ " \n",
56
+ " def __len__(self):\n",
57
+ " return len(self.labels)\n",
58
+ " \n",
59
+ "tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)\n",
60
+ "\n",
61
+ "train_encodings = tokenizer(train_texts, truncation=True, padding=True)\n",
62
+ "val_encodings = tokenizer(val_texts, truncation=True, padding=True)\n",
63
+ "test_encodings = tokenizer(test_texts, truncation=True, padding=True)\n",
64
+ "\n",
65
+ "train_dataset = IMDBDataset(train_encodings, train_labels)\n",
66
+ "val_dataset = IMDBDataset(val_encodings, val_labels)\n",
67
+ "test_dataset = IMDBDataset(test_encodings, test_labels)\n",
68
+ "\n",
69
+ "training_args = TrainingArguments(\n",
70
+ " output_dir='./results',\n",
71
+ " num_train_epochs=2,\n",
72
+ " per_device_train_batch_size=16,\n",
73
+ " per_device_eval_batch_size=64,\n",
74
+ " warmup_steps=500,\n",
75
+ " learning_rate=5e-5,\n",
76
+ " weight_decay=0.01,\n",
77
+ " logging_dir='./logs',\n",
78
+ " logging_steps=10\n",
79
+ ")\n",
80
+ "\n",
81
+ "model = DistilBertForSequenceClassification.from_pretrained(model_name)\n",
82
+ "trainer = Trainer(\n",
83
+ " model=model,\n",
84
+ " args=training_args,\n",
85
+ " train_dataset=train_dataset,\n",
86
+ " eval_dataset=val_dataset\n",
87
+ ")\n",
88
+ "\n",
89
+ "trainer.train() \n",
90
+ "\n",
91
+ "\n",
92
+ "\n"
93
+ ]
94
+ }
95
+ ],
96
+ "metadata": {
97
+ "kernelspec": {
98
+ "display_name": "Python 3 (ipykernel)",
99
+ "language": "python",
100
+ "name": "python3"
101
+ },
102
+ "language_info": {
103
+ "codemirror_mode": {
104
+ "name": "ipython",
105
+ "version": 3
106
+ },
107
+ "file_extension": ".py",
108
+ "mimetype": "text/x-python",
109
+ "name": "python",
110
+ "nbconvert_exporter": "python",
111
+ "pygments_lexer": "ipython3",
112
+ "version": "3.10.6"
113
+ }
114
+ },
115
+ "nbformat": 4,
116
+ "nbformat_minor": 5
117
+ }