makiisthebes committed on
Commit e8bc872
1 Parent(s): b4b7e8a

Model, parameters and utils for evaluating.

Files changed (3)
  1. current_best_acc.pt +3 -0
  2. main.py +267 -0
  3. my_utils.py +1111 -0
current_best_acc.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0481c84e636c920f346d01cf25dc191ca8d23e0b0944e87a83197b0389dffe68
+ size 6185383
main.py ADDED
@@ -0,0 +1,267 @@
+ # %matplotlib inline
+ # optuna-dashboard sqlite:///db.sqlite3
+ # https://github.com/optuna/optuna-dashboard
+
+ import optuna  # Used for hyperparameter tuning.
+ import gc
+ import torch
+ from torch.utils import data
+ import torchvision.datasets as datasets
+ from torchvision import transforms
+ from torch import nn
+ import random
+ import my_utils as mu
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+
+ train_trans = transforms.Compose([
+     # Data augmentation, to effectively enlarge the training set and improve generalization.
+     # transforms.FiveCrop(size=(32, 32)),  # might remove this.
+     transforms.RandomPerspective(distortion_scale=0.6, p=0.4),
+     transforms.GaussianBlur(kernel_size=(5, 11), sigma=(0.1, 0.2)),
+     transforms.RandomRotation(degrees=(-8, 8)),
+     transforms.ToTensor(),
+     # Per-channel CIFAR-10 mean/std, currently disabled:
+     # transforms.Normalize((0.49139968, 0.48215827, 0.44653124), (0.24703233, 0.24348505, 0.26158768))
+ ])
+
+ test_trans = transforms.Compose([
+     # Only tensor conversion is needed at test time.
+     transforms.ToTensor(),
+     # transforms.Normalize((0.49139968, 0.48215827, 0.44653124), (0.24703233, 0.24348505, 0.26158768))
+ ])
+
+
+ # Required for data to be in Tensor form, not PIL Image.
+ trans = [transforms.ToTensor()]
+ trans = transforms.Compose(trans)
+
+ cifar_trainset = datasets.CIFAR10(root='./data', train=True, download=True, transform=train_trans)
+ cifar_testset = datasets.CIFAR10(root='./data', train=False, download=True, transform=test_trans)
+ # 60,000 32x32 colour images in 10 classes.
+
+ # batch_size = 15
+ # data_iter = data.DataLoader(cifar_trainset, batch_size, shuffle=True)
+ # test_iter = data.DataLoader(cifar_testset, batch_size, shuffle=True)
+
+ print("Read dataset and create dataloaders - 5%")
+
+
+ def SpatialAveragePool(X):
+     return torch.mean(X, dim=[2, 3])
+
+
+ def init_weights(m):
+     if type(m) == nn.Linear or type(m) == nn.Conv2d:
+         torch.nn.init.xavier_uniform_(m.weight)
+
+
+ loss = nn.CrossEntropyLoss()
+
+
+ class MakiNet(nn.Module):
+     def __init__(self, conv_arch, num_classes, dropout_rate=0.0001):
+         # conv_arch: list of (num_conv, in_channels, out_channels) triples, one per block.
+         super(MakiNet, self).__init__()
+         self.out_classes = num_classes
+         self.conv_arch = conv_arch
+         k = 0
+         for i, (num_conv, in_channels, out_channels) in enumerate(conv_arch):
+             self.add_module(f"maki_block{i}", MakiBlock(num_conv, in_channels, out_channels, dropout_rate))
+             # input_channels = out_channels
+             # k = out_channels * (32-(2*len(conv_arch))) * (32-(2*len(conv_arch)))
+             k = out_channels
+         print(f"{k} classifier input features")
+
+         self.classifier = nn.Sequential(
+             nn.Flatten(),
+             nn.Linear(k, 75),
+             nn.Dropout(p=dropout_rate),
+             nn.ReLU(),
+             nn.Linear(75, 25),
+             nn.Dropout(p=dropout_rate),
+             nn.ReLU(),
+             nn.Linear(25, num_classes)
+         )
+
+     def forward(self, x):
+         out = x
+         # print(f"number of blocks: {len(self.conv_arch)}")
+         for i in range(len(self.conv_arch)):
+             out = self._modules[f"maki_block{i}"](out)
+         out = SpatialAveragePool(out)
+         out = self.classifier(out)
+         return out
+
+
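+ # Usage sketch (illustrative only, not part of the training run): each block's
+ # out_channels must match the next block's in_channels, and global average pooling
+ # reduces the final feature map to (batch, out_channels) before the classifier.
+ # net = MakiNet([[3, 3, 120], [3, 120, 100], [3, 100, 80]], num_classes=10)
+ # x = torch.randn(2, 3, 32, 32)  # a CIFAR-10-shaped batch
+ # net(x).shape                   # torch.Size([2, 10])
+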
+ class MakiBlock(nn.Module):
+     def __init__(self, num_conv, input_channels, output_channels, dropout_rate):
+         super(MakiBlock, self).__init__()
+         self.num_convs = num_conv
+         self.linear = nn.Linear(input_channels, num_conv, bias=False)
+         self.relu = nn.ReLU()
+         # self.max = nn.MaxPool2d(kernel_size=3, padding=1, stride=1)
+         # self.avg = nn.AvgPool2d(kernel_size=3, padding=1, stride=1)
+         self.dropout = nn.Dropout(p=dropout_rate)
+
+         for i in range(num_conv):
+             # add convolution layer
+             self.add_module(f"conv{i}", nn.Conv2d(input_channels, output_channels, kernel_size=5, padding=1, stride=1))
+             # add batch norm layer
+             self.add_module(f"batch_norm{i}", nn.BatchNorm2d(output_channels))
+
+     def forward(self, x):
+         # Weight each conv branch by a gate computed from the spatially pooled input.
+         x = x.to(device)
+         # Initial MLP part.
+         avg_out = SpatialAveragePool(x)
+         avg_out = avg_out.to(device)
+
+         lin_out = self.linear(avg_out)
+         lin_out = self.dropout(lin_out)
+
+         a = self.relu(lin_out)  # gate vector: one non-negative weight per branch.
+
+         total_output = []
+         for j in range(self.num_convs):
+             out = self._modules[f"conv{j}"](x)
+             out = self._modules[f"batch_norm{j}"](out)
+             out = self.dropout(out)
+             out = self.relu(out)
+             # out = self.max(out)  # removed, as it proved ineffective.
+             s = a[:, j].unsqueeze(-1).unsqueeze(-1).unsqueeze(-1) * out
+             total_output.append(s)
+
+         total_output = torch.stack(total_output, dim=0)
+         out = torch.sum(total_output, dim=0)
+         return out
+
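+ # Net effect of a MakiBlock: output = sum_j a[:, j] * ReLU(dropout(BN_j(conv_j(x)))),
+ # i.e. a weighted sum of parallel conv branches, where the gates a come from a small
+ # MLP applied to the spatially averaged input. All branches see the same input x,
+ # so the spatial size shrinks only once per block.
+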
+ print("Create the model - 40%")
+
+
+ # def train_model(net, train_iter, test_iter, num_epochs, lr, wd=1e-9, device=device, param_dict=None):
+ def train_model(trial, study, device=device):
+     gc.collect()
+     torch.cuda.empty_cache()
+     gpu_memory = torch.cuda.memory_allocated(device='cuda:0')
+     print(f"GPU memory allocated: {gpu_memory}")
+     # To be completed.
+     try:
+         batch_size = trial.suggest_int("batch_size", 32, 256)
+         train_iter = data.DataLoader(cifar_trainset, batch_size, shuffle=True)
+         test_iter = data.DataLoader(cifar_testset, batch_size, shuffle=True)
+
+         # dropout_rate = trial.suggest_float("dropout_rate", 1e-5, 1e-1, log=True)
+         dropout_rate = 0.15
+
+         # number_of_layers = trial.suggest_int("number_of_layers", 3, 8)
+         # number_of_channels = trial.suggest_int("number_of_channels", 50, 200)
+         # number_of_layers = 4
+         # number_of_channels = 10
+
+         # num_conv = trial.suggest_int("num_conv", 3, 5)  # 3; try 12 next after this.
+         num_conv = 3
+         # model_arch = [
+         #     [num_conv, 3, number_of_channels],  # num_conv, in_channels, out_channels
+         # ]
+         # for i in range(number_of_layers):
+         #     model_arch.append([num_conv, number_of_channels, number_of_channels])
+
+         model_arch = [
+             [3, 3, 120],  # num_conv, in_channels, out_channels
+             [3, 120, 100],
+             [3, 100, 80],
+         ]
+         net = MakiNet(model_arch, 10, dropout_rate=dropout_rate)
+         net.to(device)
+         # state_dict = torch.load(f"optuna_coursework_multi_arch_hyper_maki_net_4_10_0.713.params")
+         # net.load_state_dict(state_dict)
+         net.apply(init_weights)
+
+         lr = trial.suggest_float("lr", 1e-5, 9e-1, log=True)
+         # wd = trial.suggest_float("wd", 1e-9, 1e-1, log=True)
+         wd = 0
+         optimizer = torch.optim.SGD(net.parameters(), lr=lr, weight_decay=wd)
+         loss = nn.CrossEntropyLoss()
+         timer = mu.Timer()
+
+         num_epochs = trial.suggest_int("num_epochs", 20, 50)  # 10, 40
+
+         metric = mu.Accumulator(3)  # train_loss, train_acc, num_examples
+         train_loss = 0
+         train_acc = 0
+         for epoch in range(num_epochs):
+             print(f"Epoch: {epoch}/{num_epochs}")
+             for i, (X, y) in enumerate(train_iter):
+                 timer.start()
+                 net.train()
+                 optimizer.zero_grad()
+                 X, y = X.to(device), y.to(device)
+                 y_hat = net(X)
+                 l = loss(y_hat, y)
+                 l.backward()
+                 optimizer.step()
+                 with torch.no_grad():
+                     metric.add(l * X.shape[0], mu.accuracy(y_hat, y), X.shape[0])
+                 timer.stop()
+                 train_loss, train_acc = metric[0] / metric[2], metric[1] / metric[2]
+                 if (i + 1) % 50 == 0:
+                     print(f'batch {i+1}, train loss {train_loss:.3f}, train acc {train_acc:.3f}')
+             test_acc = mu.evaluate_accuracy_gpu(net, test_iter)
+             print(f'Test Accuracy for epoch {epoch+1} is {test_acc:.3f}')
+
+             if epoch == 5 and test_acc <= 0.1:
+                 # Prune the trial if test accuracy is still at chance level after 5 epochs, to save time.
+                 raise optuna.exceptions.TrialPruned()
+
+         test_acc = mu.evaluate_accuracy_gpu(net, test_iter)
+         test_acc_delta = test_acc - train_acc
+         if test_acc_delta < -0.25:
+             # Overfitting by more than 25 points; prune.
+             raise optuna.exceptions.TrialPruned()
+
+         try:
+             if test_acc > study.best_trials[0].values[0]:
+                 torch.save(net.state_dict(), f"attempt6{test_acc:.3f}.params")
+         except IndexError:
+             print("No best trial yet.")
+             torch.save(net.state_dict(), f"attempt6{test_acc:.3f}.params")
+     except optuna.exceptions.TrialPruned:
+         # Let pruned trials propagate instead of being swallowed below.
+         raise
+     except Exception as e:
+         print("Exception occurred")
+         raise optuna.exceptions.OptunaError(f"Exception occurred during training. {e}")
+     return test_acc  # , train_loss, test_acc_delta
+
+
+ if __name__ == "__main__":
+
+     # model_arch = [
+     #     [6, 3, 12],  # num_conv, in_channels, out_channels; max 73% without augmentation
+     #     [4, 12, 15],
+     #     [4, 15, 12],
+     #     [5, 12, 9],
+     # ]
+     #
+     # model_arch2 = [
+     #     [3, 3, 5],  # num_conv, in_channels, out_channels
+     #     [3, 5, 6],
+     #     [3, 6, 3],
+     #     # [2, 6, 3]
+     # ]
+
+     # study = optuna.create_study(study_name="attempt6", storage="sqlite:///db.sqlite3", directions=["maximize"])  # maximise the test accuracy.
+     study = optuna.load_study(study_name="attempt6", storage="sqlite:///db.sqlite3")
+     # print(f"Study Attributes: {study.user_attrs}")
+     # print(f"{study.best_trials[0].values[0]} - {study.best_trials[0].params}")
+
+     study.optimize(lambda trial: train_model(trial, study, device), n_trials=1000)  # 2 hours.
+
+     # All runs up to run 25 used the simple transform, without data augmentation.
+
+     # optuna-dashboard sqlite:///db.sqlite3
+
+     # test_acc, train_loss, test_acc_delta
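+
+ # Evaluation sketch for the committed checkpoint (assumption, not verified by this
+ # commit: current_best_acc.pt is taken to hold a MakiNet state_dict matching the
+ # three-block architecture above):
+ # net = MakiNet([[3, 3, 120], [3, 120, 100], [3, 100, 80]], 10)
+ # net.load_state_dict(torch.load("current_best_acc.pt", map_location=device))
+ # net.to(device)
+ # test_iter = data.DataLoader(cifar_testset, batch_size=128)
+ # print(f"test acc: {mu.evaluate_accuracy_gpu(net, test_iter):.3f}")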
my_utils.py ADDED
@@ -0,0 +1,1111 @@
+ # This file is generated automatically through:
+ # d2lbook build lib
+ # Don't edit it directly
+
+ # Defined in file: ./chapter_preface/index.md
+ import collections
+ from collections import defaultdict
+ from IPython import display
+ import math
+ from matplotlib import pyplot as plt
+ import os
+ import pandas as pd
+ import random
+ import re
+ import shutil
+ import sys
+ import tarfile
+ import time
+ import requests
+ import zipfile
+ import hashlib
+ d2l = sys.modules[__name__]
+
+
+ # Defined in file: ./chapter_preface/index.md
+ import numpy as np
+ import torch
+ import torchvision
+ from torch import nn
+ from torch.nn import functional as F
+ from torch.utils import data
+ from torchvision import transforms
+
+
+ # Defined in file: ./chapter_preliminaries/pandas.md
+ def mkdir_if_not_exist(path):  #@save
+     """Make a directory if it does not exist."""
+     if not isinstance(path, str):
+         path = os.path.join(*path)
+     if not os.path.exists(path):
+         os.makedirs(path)
+
+
+ # Defined in file: ./chapter_preliminaries/calculus.md
+ def use_svg_display():  #@save
+     """Use the svg format to display a plot in Jupyter."""
+     display.set_matplotlib_formats('svg')
+
+
+ # Defined in file: ./chapter_preliminaries/calculus.md
+ def set_figsize(figsize=(3.5, 2.5)):  #@save
+     """Set the figure size for matplotlib."""
+     use_svg_display()
+     d2l.plt.rcParams['figure.figsize'] = figsize
+
+
+ # Defined in file: ./chapter_preliminaries/calculus.md
+ def set_axes(axes, xlabel, ylabel, xlim, ylim, xscale, yscale, legend):
+     """Set the axes for matplotlib."""
+     axes.set_xlabel(xlabel)
+     axes.set_ylabel(ylabel)
+     axes.set_xscale(xscale)
+     axes.set_yscale(yscale)
+     axes.set_xlim(xlim)
+     axes.set_ylim(ylim)
+     if legend:
+         axes.legend(legend)
+     axes.grid()
+
+
+ # Defined in file: ./chapter_preliminaries/calculus.md
+ def plot(X, Y=None, xlabel=None, ylabel=None, legend=None, xlim=None,
+          ylim=None, xscale='linear', yscale='linear',
+          fmts=('-', 'm--', 'g-.', 'r:'), figsize=(3.5, 2.5), axes=None):
+     """Plot data points."""
+     if legend is None:
+         legend = []
+
+     set_figsize(figsize)
+     axes = axes if axes else d2l.plt.gca()
+
+     # Return True if `X` (tensor or list) has 1 axis
+     def has_one_axis(X):
+         return (hasattr(X, "ndim") and X.ndim == 1 or isinstance(X, list)
+                 and not hasattr(X[0], "__len__"))
+
+     if has_one_axis(X):
+         X = [X]
+     if Y is None:
+         X, Y = [[]] * len(X), X
+     elif has_one_axis(Y):
+         Y = [Y]
+     if len(X) != len(Y):
+         X = X * len(Y)
+     axes.cla()
+     for x, y, fmt in zip(X, Y, fmts):
+         if len(x):
+             axes.plot(x, y, fmt)
+         else:
+             axes.plot(y, fmt)
+     set_axes(axes, xlabel, ylabel, xlim, ylim, xscale, yscale, legend)
+
+
+ # Defined in file: ./chapter_linear-networks/linear-regression.md
+ class Timer:  #@save
+     """Record multiple running times."""
+     def __init__(self):
+         self.times = []
+         self.start()
+
+     def start(self):
+         """Start the timer."""
+         self.tik = time.time()
+
+     def stop(self):
+         """Stop the timer and record the time in a list."""
+         self.times.append(time.time() - self.tik)
+         return self.times[-1]
+
+     def avg(self):
+         """Return the average time."""
+         return sum(self.times) / len(self.times)
+
+     def sum(self):
+         """Return the sum of time."""
+         return sum(self.times)
+
+     def cumsum(self):
+         """Return the accumulated time."""
+         return np.array(self.times).cumsum().tolist()
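+ # Usage sketch: Timer starts on construction; stop() returns the last interval.
+ # t = Timer(); time.sleep(0.1); t.stop()  # ~0.1 seconds, also appended to t.times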
+
+
+ # Defined in file: ./chapter_linear-networks/linear-regression-scratch.md
+ def synthetic_data(w, b, num_examples):  #@save
+     """Generate y = Xw + b + noise."""
+     X = d2l.normal(0, 1, (num_examples, len(w)))
+     y = d2l.matmul(X, w) + b
+     y += d2l.normal(0, 0.01, y.shape)
+     return X, d2l.reshape(y, (-1, 1))
+
+
+ # Defined in file: ./chapter_linear-networks/linear-regression-scratch.md
+ def linreg(X, w, b):  #@save
+     """The linear regression model."""
+     return d2l.matmul(X, w) + b
+
+
+ # Defined in file: ./chapter_linear-networks/linear-regression-scratch.md
+ def squared_loss(y_hat, y):  #@save
+     """Squared loss."""
+     return (y_hat - d2l.reshape(y, y_hat.shape)) ** 2 / 2
+
+
+ # Defined in file: ./chapter_linear-networks/linear-regression-scratch.md
+ def sgd(params, lr, batch_size):  #@save
+     """Minibatch stochastic gradient descent."""
+     for param in params:
+         param.data.sub_(lr*param.grad/batch_size)
+         param.grad.data.zero_()
+
+
+ # Defined in file: ./chapter_linear-networks/linear-regression-concise.md
+ def load_array(data_arrays, batch_size, is_train=True):  #@save
+     """Construct a PyTorch data iterator."""
+     dataset = data.TensorDataset(*data_arrays)
+     return data.DataLoader(dataset, batch_size, shuffle=is_train)
+
+
+ # Defined in file: ./chapter_linear-networks/image-classification-dataset.md
+ def get_fashion_mnist_labels(labels):  #@save
+     """Return text labels for the Fashion-MNIST dataset."""
+     text_labels = ['t-shirt', 'trouser', 'pullover', 'dress', 'coat',
+                    'sandal', 'shirt', 'sneaker', 'bag', 'ankle boot']
+     return [text_labels[int(i)] for i in labels]
+
+
+ # Defined in file: ./chapter_linear-networks/image-classification-dataset.md
+ def show_images(imgs, num_rows, num_cols, titles=None, scale=1.5):  #@save
+     """Plot a list of images."""
+     figsize = (num_cols * scale, num_rows * scale)
+     _, axes = d2l.plt.subplots(num_rows, num_cols, figsize=figsize)
+     axes = axes.flatten()
+     for i, (ax, img) in enumerate(zip(axes, imgs)):
+         ax.imshow(d2l.numpy(img))
+         ax.axes.get_xaxis().set_visible(False)
+         ax.axes.get_yaxis().set_visible(False)
+         if titles:
+             ax.set_title(titles[i])
+     return axes
+
+
+ # Defined in file: ./chapter_linear-networks/image-classification-dataset.md
+ def get_dataloader_workers():  #@save
+     """Use 4 processes to read the data."""
+     return 4
+
+
+ # Defined in file: ./chapter_linear-networks/image-classification-dataset.md
+ def load_data_fashion_mnist(batch_size, resize=None):  #@save
+     """Download the Fashion-MNIST dataset and then load it into memory."""
+     trans = [transforms.ToTensor()]
+     if resize:
+         trans.insert(0, transforms.Resize(resize))
+     trans = transforms.Compose(trans)
+     mnist_train = torchvision.datasets.FashionMNIST(
+         root="../data", train=True, transform=trans, download=True)
+     mnist_test = torchvision.datasets.FashionMNIST(
+         root="../data", train=False, transform=trans, download=True)
+     return (data.DataLoader(mnist_train, batch_size, shuffle=True,
+                             num_workers=get_dataloader_workers()),
+             data.DataLoader(mnist_test, batch_size, shuffle=False,
+                             num_workers=get_dataloader_workers()))
+
+
+ # Defined in file: ./chapter_linear-networks/softmax-regression-scratch.md
+ def accuracy(y_hat, y):  #@save
+     """Compute the number of correct predictions."""
+     if len(y_hat.shape) > 1 and y_hat.shape[1] > 1:
+         y_hat = d2l.argmax(y_hat, axis=1)
+     cmp = d2l.astype(y_hat, y.dtype) == y
+     return float(d2l.reduce_sum(d2l.astype(cmp, y.dtype)))
+
+
+ # Defined in file: ./chapter_linear-networks/softmax-regression-scratch.md
+ def evaluate_accuracy(net, data_iter):  #@save
+     """Compute the accuracy for a model on a dataset."""
+     if isinstance(net, torch.nn.Module):
+         net.eval()  # Set the model to evaluation mode
+     metric = Accumulator(2)  # No. of correct predictions, no. of predictions
+     for _, (X, y) in enumerate(data_iter):
+         metric.add(accuracy(net(X), y), d2l.size(y))
+     return metric[0] / metric[1]
+
+
+ # Defined in file: ./chapter_linear-networks/softmax-regression-scratch.md
+ class Accumulator:  #@save
+     """For accumulating sums over `n` variables."""
+     def __init__(self, n):
+         self.data = [0.0] * n
+
+     def add(self, *args):
+         self.data = [a + float(b) for a, b in zip(self.data, args)]
+
+     def reset(self):
+         self.data = [0.0] * len(self.data)
+
+     def __getitem__(self, idx):
+         return self.data[idx]
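+ # Usage sketch: acc = Accumulator(2); acc.add(1, 10); acc.add(2, 20); then acc[0] == 3.0 and acc[1] == 30.0.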
+
+
+ # Defined in file: ./chapter_linear-networks/softmax-regression-scratch.md
+ def train_epoch_ch3(net, train_iter, loss, updater):  #@save
+     """The training loop defined in Chapter 3."""
+     # Set the model to training mode
+     if isinstance(net, torch.nn.Module):
+         net.train()
+     # Sum of training loss, sum of training accuracy, no. of examples
+     metric = Accumulator(3)
+     for X, y in train_iter:
+         # Compute gradients and update parameters
+         y_hat = net(X)
+         l = loss(y_hat, y)
+         if isinstance(updater, torch.optim.Optimizer):
+             updater.zero_grad()
+             l.backward()
+             updater.step()
+             metric.add(float(l) * len(y), accuracy(y_hat, y),
+                        y.size().numel())
+         else:
+             l.sum().backward()
+             updater(X.shape[0])
+             metric.add(float(l.sum()), accuracy(y_hat, y), y.numel())
+     # Return training loss and training accuracy
+     return metric[0] / metric[2], metric[1] / metric[2]
+
+
+ # Defined in file: ./chapter_linear-networks/softmax-regression-scratch.md
+ class Animator:  #@save
+     """For plotting data in animation."""
+     def __init__(self, xlabel=None, ylabel=None, legend=None, xlim=None,
+                  ylim=None, xscale='linear', yscale='linear',
+                  fmts=('-', 'm--', 'g-.', 'r:'), nrows=1, ncols=1,
+                  figsize=(3.5, 2.5)):
+         # Incrementally plot multiple lines
+         if legend is None:
+             legend = []
+         d2l.use_svg_display()
+         self.fig, self.axes = d2l.plt.subplots(nrows, ncols, figsize=figsize)
+         if nrows * ncols == 1:
+             self.axes = [self.axes, ]
+         # Use a lambda function to capture arguments
+         self.config_axes = lambda: d2l.set_axes(
+             self.axes[0], xlabel, ylabel, xlim, ylim, xscale, yscale, legend)
+         self.X, self.Y, self.fmts = None, None, fmts
+
+     def add(self, x, y):
+         # Add multiple data points into the figure
+         if not hasattr(y, "__len__"):
+             y = [y]
+         n = len(y)
+         if not hasattr(x, "__len__"):
+             x = [x] * n
+         if not self.X:
+             self.X = [[] for _ in range(n)]
+         if not self.Y:
+             self.Y = [[] for _ in range(n)]
+         for i, (a, b) in enumerate(zip(x, y)):
+             if a is not None and b is not None:
+                 self.X[i].append(a)
+                 self.Y[i].append(b)
+         self.axes[0].cla()
+         for x, y, fmt in zip(self.X, self.Y, self.fmts):
+             self.axes[0].plot(x, y, fmt)
+         self.config_axes()
+         display.display(self.fig)
+         display.clear_output(wait=True)
+
+
+ # Defined in file: ./chapter_linear-networks/softmax-regression-scratch.md
+ def train_ch3(net, train_iter, test_iter, loss, num_epochs, updater):  #@save
+     """Train a model (defined in Chapter 3)."""
+     animator = Animator(xlabel='epoch', xlim=[1, num_epochs], ylim=[0.3, 0.9],
+                         legend=['train loss', 'train acc', 'test acc'])
+     for epoch in range(num_epochs):
+         train_metrics = train_epoch_ch3(net, train_iter, loss, updater)
+         test_acc = evaluate_accuracy(net, test_iter)
+         animator.add(epoch + 1, train_metrics + (test_acc,))
+     train_loss, train_acc = train_metrics
+     assert train_loss < 0.5, train_loss
+     assert train_acc <= 1 and train_acc > 0.7, train_acc
+     assert test_acc <= 1 and test_acc > 0.7, test_acc
+
+
+ # Defined in file: ./chapter_linear-networks/softmax-regression-scratch.md
+ def predict_ch3(net, test_iter, n=6):  #@save
+     """Predict labels (defined in Chapter 3)."""
+     for X, y in test_iter:
+         break
+     trues = d2l.get_fashion_mnist_labels(y)
+     preds = d2l.get_fashion_mnist_labels(d2l.argmax(net(X), axis=1))
+     titles = [true + '\n' + pred for true, pred in zip(trues, preds)]
+     d2l.show_images(d2l.reshape(X[0:n], (n, 28, 28)), 1, n, titles=titles[0:n])
+
+
+ # Defined in file: ./chapter_multilayer-perceptrons/underfit-overfit.md
+ def evaluate_loss(net, data_iter, loss):  #@save
+     """Evaluate the loss of a model on the given dataset."""
+     metric = d2l.Accumulator(2)  # Sum of losses, no. of examples
+     for X, y in data_iter:
+         l = loss(net(X), y)
+         metric.add(d2l.reduce_sum(l), d2l.size(l))
+     return metric[0] / metric[1]
+
+
+ # Defined in file: ./chapter_multilayer-perceptrons/kaggle-house-price.md
+ DATA_HUB = dict()  #@save
+ DATA_URL = 'http://d2l-data.s3-accelerate.amazonaws.com/'  #@save
+
+
+ # Defined in file: ./chapter_multilayer-perceptrons/kaggle-house-price.md
+ DATA_URL = 'http://d2l-data.s3-accelerate.amazonaws.com/'  #@save
+
+
+ # Defined in file: ./chapter_multilayer-perceptrons/kaggle-house-price.md
+ def download(name, cache_dir=os.path.join('..', 'data')):  #@save
+     """Download a file inserted into DATA_HUB, return the local filename."""
+     assert name in DATA_HUB, f"{name} does not exist in {DATA_HUB}."
+     url, sha1_hash = DATA_HUB[name]
+     d2l.mkdir_if_not_exist(cache_dir)
+     fname = os.path.join(cache_dir, url.split('/')[-1])
+     if os.path.exists(fname):
+         sha1 = hashlib.sha1()
+         with open(fname, 'rb') as f:
+             while True:
+                 data = f.read(1048576)
+                 if not data:
+                     break
+                 sha1.update(data)
+         if sha1.hexdigest() == sha1_hash:
+             return fname  # Hit cache
+     print(f'Downloading {fname} from {url}...')
+     r = requests.get(url, stream=True, verify=True)
+     with open(fname, 'wb') as f:
+         f.write(r.content)
+     return fname
+
+
+ # Defined in file: ./chapter_multilayer-perceptrons/kaggle-house-price.md
+ def download_extract(name, folder=None):  #@save
+     """Download and extract a zip/tar file."""
+     fname = download(name)
+     base_dir = os.path.dirname(fname)
+     data_dir, ext = os.path.splitext(fname)
+     if ext == '.zip':
+         fp = zipfile.ZipFile(fname, 'r')
+     elif ext in ('.tar', '.gz'):
+         fp = tarfile.open(fname, 'r')
+     else:
+         assert False, 'Only zip/tar files can be extracted.'
+     fp.extractall(base_dir)
+     return os.path.join(base_dir, folder) if folder else data_dir
+
+
+ # Defined in file: ./chapter_multilayer-perceptrons/kaggle-house-price.md
+ def download_all():  #@save
+     """Download all files in the DATA_HUB."""
+     for name in DATA_HUB:
+         download(name)
+
+
+ # Defined in file: ./chapter_multilayer-perceptrons/kaggle-house-price.md
+ DATA_HUB['kaggle_house_train'] = (  #@save
+     DATA_URL + 'kaggle_house_pred_train.csv',
+     '585e9cc93e70b39160e7921475f9bcd7d31219ce')
+
+
+ # Defined in file: ./chapter_multilayer-perceptrons/kaggle-house-price.md
+ DATA_HUB['kaggle_house_test'] = (  #@save
+     DATA_URL + 'kaggle_house_pred_test.csv',
+     'fa19780a7b011d9b009e8bff8e99922a8ee2eb90')
+
+
+ # Defined in file: ./chapter_deep-learning-computation/use-gpu.md
+ def try_gpu(i=0):  #@save
+     """Return gpu(i) if exists, otherwise return cpu()."""
+     if torch.cuda.device_count() >= i + 1:
+         return torch.device(f'cuda:{i}')
+     return torch.device('cpu')
+
+
+ # Defined in file: ./chapter_deep-learning-computation/use-gpu.md
+ def try_all_gpus():  #@save
+     """Return all available GPUs, or [cpu(),] if no GPU exists."""
+     ctxes = [torch.device(f'cuda:{i}')
+              for i in range(torch.cuda.device_count())]
+     return ctxes if ctxes else [torch.device('cpu')]
+
+
+ # Defined in file: ./chapter_convolutional-neural-networks/conv-layer.md
+ def corr2d(X, K):  #@save
+     """Compute 2D cross-correlation."""
+     h, w = K.shape
+     Y = d2l.zeros((X.shape[0] - h + 1, X.shape[1] - w + 1))
+     for i in range(Y.shape[0]):
+         for j in range(Y.shape[1]):
+             Y[i, j] = d2l.reduce_sum((X[i: i + h, j: j + w] * K))
+     return Y
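+ # Usage sketch: corr2d(torch.tensor([[0., 1., 2.], [3., 4., 5.], [6., 7., 8.]]),
+ #                      torch.tensor([[0., 1.], [2., 3.]])) -> tensor([[19., 25.], [37., 43.]])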
+
+
+ # Defined in file: ./chapter_convolutional-neural-networks/lenet.md
+ def evaluate_accuracy_gpu(net, data_iter, device=None):  #@save
+     net.eval()  # Set the model to evaluation mode
+     if not device:
+         device = next(iter(net.parameters())).device
+     metric = d2l.Accumulator(2)  # num_corrected_examples, num_examples
+     for X, y in data_iter:
+         X, y = X.to(device), y.to(device)
+         metric.add(d2l.accuracy(net(X), y), d2l.size(y))
+     return metric[0] / metric[1]
+
+
+ # Defined in file: ./chapter_convolutional-neural-networks/lenet.md
+ def train_ch6(net, train_iter, test_iter, num_epochs, lr,
+               device=d2l.try_gpu()):
+     """Train and evaluate a model with CPU or GPU."""
+     def init_weights(m):
+         if type(m) == nn.Linear or type(m) == nn.Conv2d:
+             torch.nn.init.xavier_uniform_(m.weight)
+     net.apply(init_weights)
+     print('training on', device)
+     net.to(device)
+     optimizer = torch.optim.SGD(net.parameters(), lr=lr)
+     loss = nn.CrossEntropyLoss()
+     animator = d2l.Animator(xlabel='epoch', xlim=[0, num_epochs],
+                             legend=['train loss', 'train acc', 'test acc'])
+     timer = d2l.Timer()
+     for epoch in range(num_epochs):
+         metric = d2l.Accumulator(3)  # train_loss, train_acc, num_examples
+         for i, (X, y) in enumerate(train_iter):
+             timer.start()
+             net.train()
+             optimizer.zero_grad()
+             X, y = X.to(device), y.to(device)
+             y_hat = net(X)
+             l = loss(y_hat, y)
+             l.backward()
+             optimizer.step()
+             with torch.no_grad():
+                 metric.add(l*X.shape[0], d2l.accuracy(y_hat, y), X.shape[0])
+             timer.stop()
+             train_loss, train_acc = metric[0]/metric[2], metric[1]/metric[2]
+             if (i+1) % 50 == 0:
+                 animator.add(epoch + i/len(train_iter),
+                              (train_loss, train_acc, None))
+         test_acc = evaluate_accuracy_gpu(net, test_iter)
+         animator.add(epoch+1, (None, None, test_acc))
+     print(f'loss {train_loss:.3f}, train acc {train_acc:.3f}, '
+           f'test acc {test_acc:.3f}')
+     print(f'{metric[2] * num_epochs / timer.sum():.1f} examples/sec '
+           f'on {str(device)}')
+
+
+ # Defined in file: ./chapter_convolutional-modern/resnet.md
+ class Residual(nn.Module):  #@save
+     def __init__(self, input_channels, num_channels,
+                  use_1x1conv=False, strides=1):
+         super().__init__()
+         self.conv1 = nn.Conv2d(input_channels, num_channels,
+                                kernel_size=3, padding=1, stride=strides)
+         self.conv2 = nn.Conv2d(num_channels, num_channels,
+                                kernel_size=3, padding=1)
+         if use_1x1conv:
+             self.conv3 = nn.Conv2d(input_channels, num_channels,
+                                    kernel_size=1, stride=strides)
+         else:
+             self.conv3 = None
+         self.bn1 = nn.BatchNorm2d(num_channels)
+         self.bn2 = nn.BatchNorm2d(num_channels)
+         self.relu = nn.ReLU(inplace=True)
+
+     def forward(self, X):
+         Y = F.relu(self.bn1(self.conv1(X)))
+         Y = self.bn2(self.conv2(Y))
+         if self.conv3:
+             X = self.conv3(X)
+         Y += X
+         return F.relu(Y)
+
+
+ # Defined in file: ./chapter_recurrent-neural-networks/text-preprocessing.md
+ d2l.DATA_HUB['time_machine'] = (d2l.DATA_URL + 'timemachine.txt',
+                                 '090b5e7e70c295757f55df93cb0a180b9691891a')
+
+
+ # Defined in file: ./chapter_recurrent-neural-networks/text-preprocessing.md
+ def read_time_machine():  #@save
+     """Load the time machine book into a list of sentences."""
+     with open(d2l.download('time_machine'), 'r') as f:
+         lines = f.readlines()
+     return [re.sub('[^A-Za-z]+', ' ', line.strip().lower())
+             for line in lines]
+
+
+ # Defined in file: ./chapter_recurrent-neural-networks/text-preprocessing.md
+ def tokenize(lines, token='word'):  #@save
+     """Split sentences into word or char tokens."""
+     if token == 'word':
+         return [line.split(' ') for line in lines]
+     elif token == 'char':
+         return [list(line) for line in lines]
+     else:
+         print('ERROR: unknown token type ' + token)
+
+
+ # Defined in file: ./chapter_recurrent-neural-networks/text-preprocessing.md
+ class Vocab:  #@save
+     def __init__(self, tokens, min_freq=0, reserved_tokens=None):
+         if reserved_tokens is None:
+             reserved_tokens = []
+         # Sort according to frequencies
+         counter = count_corpus(tokens)
+         self.token_freqs = sorted(counter.items(), key=lambda x: x[0])
+         self.token_freqs.sort(key=lambda x: x[1], reverse=True)
+         self.unk, uniq_tokens = 0, ['<unk>'] + reserved_tokens
+         uniq_tokens += [token for token, freq in self.token_freqs
+                         if freq >= min_freq and token not in uniq_tokens]
+         self.idx_to_token, self.token_to_idx = [], dict()
+         for token in uniq_tokens:
+             self.idx_to_token.append(token)
+             self.token_to_idx[token] = len(self.idx_to_token) - 1
+
+     def __len__(self):
+         return len(self.idx_to_token)
+
+     def __getitem__(self, tokens):
+         if not isinstance(tokens, (list, tuple)):
+             return self.token_to_idx.get(tokens, self.unk)
+         return [self.__getitem__(token) for token in tokens]
+
+     def to_tokens(self, indices):
+         if not isinstance(indices, (list, tuple)):
+             return self.idx_to_token[indices]
+         return [self.idx_to_token[index] for index in indices]
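+ # Usage sketch: vocab = Vocab(tokenize(['the time machine'])); vocab['the'] gives that
+ # token's index, vocab.to_tokens(vocab['the']) == 'the', and unseen tokens map to
+ # vocab.unk (index 0, '<unk>').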
+
+
+ # Defined in file: ./chapter_recurrent-neural-networks/text-preprocessing.md
+ def count_corpus(sentences):  #@save
+     # Flatten a list of token lists into a list of tokens
+     tokens = [tk for line in sentences for tk in line]
+     return collections.Counter(tokens)
+
+
+ # Defined in file: ./chapter_recurrent-neural-networks/text-preprocessing.md
+ def load_corpus_time_machine(max_tokens=-1):  #@save
+     lines = read_time_machine()
+     tokens = tokenize(lines, 'char')
+     vocab = Vocab(tokens)
+     corpus = [vocab[tk] for line in tokens for tk in line]
+     if max_tokens > 0:
+         corpus = corpus[:max_tokens]
+     return corpus, vocab
+
+
+ # Defined in file: ./chapter_recurrent-neural-networks/language-models-and-dataset.md
+ def seq_data_iter_random(corpus, batch_size, num_steps):  #@save
+     # Offset the iterator over the data for uniform starts
+     corpus = corpus[random.randint(0, num_steps):]
+     # Subtract 1 extra since we need to account for label
+     num_examples = ((len(corpus) - 1) // num_steps)
+     example_indices = list(range(0, num_examples * num_steps, num_steps))
+     random.shuffle(example_indices)
+
+     def data(pos):
+         # This returns a sequence of length `num_steps` starting from `pos`
+         return corpus[pos: pos + num_steps]
+
+     # Discard half-empty batches
+     num_batches = num_examples // batch_size
+     for i in range(0, batch_size * num_batches, batch_size):
+         # `batch_size` indicates the random examples read each time
+         batch_indices = example_indices[i:(i+batch_size)]
+         X = [data(j) for j in batch_indices]
+         Y = [data(j + 1) for j in batch_indices]
+         yield d2l.tensor(X), d2l.tensor(Y)
+
+
+ # Defined in file: ./chapter_recurrent-neural-networks/language-models-and-dataset.md
+ def seq_data_iter_consecutive(corpus, batch_size, num_steps):  #@save
+     # Offset for the iterator over the data for uniform starts
+     offset = random.randint(0, num_steps)
+     # Slice out data: ignore `num_steps` and just wrap around
+     num_indices = ((len(corpus) - offset - 1) // batch_size) * batch_size
+     Xs = d2l.tensor(corpus[offset:offset+num_indices])
+     Ys = d2l.tensor(corpus[offset+1:offset+1+num_indices])
+     Xs, Ys = Xs.reshape(batch_size, -1), Ys.reshape(batch_size, -1)
+     num_batches = Xs.shape[1] // num_steps
+     for i in range(0, num_batches * num_steps, num_steps):
+         X = Xs[:, i:(i+num_steps)]
+         Y = Ys[:, i:(i+num_steps)]
+         yield X, Y
+
+
+ # Defined in file: ./chapter_recurrent-neural-networks/language-models-and-dataset.md
+ class SeqDataLoader:  #@save
+     """An iterator to load sequence data."""
+     def __init__(self, batch_size, num_steps, use_random_iter, max_tokens):
+         if use_random_iter:
+             self.data_iter_fn = d2l.seq_data_iter_random
+         else:
+             self.data_iter_fn = d2l.seq_data_iter_consecutive
+         self.corpus, self.vocab = d2l.load_corpus_time_machine(max_tokens)
+         self.batch_size, self.num_steps = batch_size, num_steps
+
+     def __iter__(self):
+         return self.data_iter_fn(self.corpus, self.batch_size, self.num_steps)
+
+
+ # Defined in file: ./chapter_recurrent-neural-networks/language-models-and-dataset.md
+ def load_data_time_machine(batch_size, num_steps,  #@save
+                            use_random_iter=False, max_tokens=10000):
+     data_iter = SeqDataLoader(
+         batch_size, num_steps, use_random_iter, max_tokens)
+     return data_iter, data_iter.vocab
+
+
+ # Defined in file: ./chapter_recurrent-neural-networks/rnn-scratch.md
+ class RNNModelScratch:  #@save
+     """An RNN model based on scratch implementations."""
+     def __init__(self, vocab_size, num_hiddens, device,
+                  get_params, init_state, forward):
+         self.vocab_size, self.num_hiddens = vocab_size, num_hiddens
+         self.params = get_params(vocab_size, num_hiddens, device)
+         self.init_state, self.forward_fn = init_state, forward
+
+     def __call__(self, X, state):
+         X = F.one_hot(X.T.long(), self.vocab_size).type(torch.float32)
+         return self.forward_fn(X, state, self.params)
+
+     def begin_state(self, batch_size, device):
+         return self.init_state(batch_size, self.num_hiddens, device)
+
+
+ # Defined in file: ./chapter_recurrent-neural-networks/rnn-scratch.md
+ def predict_ch8(prefix, num_predicts, model, vocab, device):  #@save
+     state = model.begin_state(batch_size=1, device=device)
+     outputs = [vocab[prefix[0]]]
+     get_input = lambda: torch.tensor([outputs[-1]], device=device).reshape(1, 1)
+     for y in prefix[1:]:  # Warm up state with prefix
+         _, state = model(get_input(), state)
+         outputs.append(vocab[y])
+     for _ in range(num_predicts):  # Predict num_predicts steps
+         Y, state = model(get_input(), state)
+         outputs.append(int(Y.argmax(dim=1).reshape(1)))
+     return ''.join([vocab.idx_to_token[i] for i in outputs])
+
+
+ # Defined in file: ./chapter_recurrent-neural-networks/rnn-scratch.md
+ def grad_clipping(model, theta):  #@save
+     if isinstance(model, nn.Module):
+         params = [p for p in model.parameters() if p.requires_grad]
+     else:
+         params = model.params
+     norm = torch.sqrt(sum(torch.sum((p.grad ** 2)) for p in params))
+     if norm > theta:
+         for param in params:
+             param.grad[:] *= theta / norm
+
+
+ # Defined in file: ./chapter_recurrent-neural-networks/rnn-scratch.md
+ def train_epoch_ch8(model, train_iter, loss, updater, device, use_random_iter):  #@save
+     state, timer = None, d2l.Timer()
+     metric = d2l.Accumulator(2)  # loss_sum, num_examples
+     for X, Y in train_iter:
+         if state is None or use_random_iter:
+             # Initialize state when either it is the first iteration or
+             # using random sampling.
+             state = model.begin_state(batch_size=X.shape[0], device=device)
+         else:
+             for s in state:
+                 s.detach_()
+         y = Y.T.reshape(-1)
+         X, y = X.to(device), y.to(device)
+         py, state = model(X, state)
+         l = loss(py, y.long()).mean()
+         if isinstance(updater, torch.optim.Optimizer):
+             updater.zero_grad()
+             l.backward()
+             grad_clipping(model, 1)
+             updater.step()
+         else:
+             l.backward()
+             grad_clipping(model, 1)
+             updater(batch_size=1)  # Since the mean was already taken
+         metric.add(l * d2l.size(y), d2l.size(y))
+     return math.exp(metric[0]/metric[1]), metric[1]/timer.stop()
+
+
+ # Defined in file: ./chapter_recurrent-neural-networks/rnn-scratch.md
+ def train_ch8(model, train_iter, vocab, lr, num_epochs, device,
+               use_random_iter=False):
+     # Initialize
+     loss = nn.CrossEntropyLoss()
+     animator = d2l.Animator(xlabel='epoch', ylabel='perplexity',
+                             legend=['train'], xlim=[1, num_epochs])
+     if isinstance(model, nn.Module):
+         trainer = torch.optim.SGD(model.parameters(), lr)
+         updater = lambda batch_size: trainer.step()
+     else:
+         updater = lambda batch_size: d2l.sgd(model.params, lr, batch_size)
+     predict = lambda prefix: predict_ch8(prefix, 50, model, vocab, device)
+     # Train and check the progress.
+     for epoch in range(num_epochs):
+         ppl, speed = train_epoch_ch8(
+             model, train_iter, loss, updater, device, use_random_iter)
+         if epoch % 10 == 0:
+             print(predict('time traveller'))
+             animator.add(epoch+1, [ppl])
+     print(f'perplexity {ppl:.1f}, {speed:.1f} tokens/sec on {str(device)}')
+     print(predict('time traveller'))
+     print(predict('traveller'))
+
+
+ # Defined in file: ./chapter_recurrent-modern/machine-translation-and-dataset.md
+ d2l.DATA_HUB['fra-eng'] = (d2l.DATA_URL + 'fra-eng.zip',
+                            '94646ad1522d915e7b0f9296181140edcf86a4f5')
+
+
+ # Defined in file: ./chapter_recurrent-modern/machine-translation-and-dataset.md
+ def read_data_nmt():
+     data_dir = d2l.download_extract('fra-eng')
+     with open(os.path.join(data_dir, 'fra.txt'), 'r') as f:
+         return f.read()
+
+
+ # Defined in file: ./chapter_recurrent-modern/machine-translation-and-dataset.md
+ def preprocess_nmt(text):
+     def no_space(char, prev_char):
+         return char in set(',.!') and prev_char != ' '
+
+     text = text.replace('\u202f', ' ').replace('\xa0', ' ').lower()
+     out = [' ' + char if i > 0 and no_space(char, text[i-1]) else char
+            for i, char in enumerate(text)]
+     return ''.join(out)
+
+
+ # Defined in file: ./chapter_recurrent-modern/machine-translation-and-dataset.md
+ def tokenize_nmt(text, num_examples=None):
+     source, target = [], []
+     for i, line in enumerate(text.split('\n')):
+         if num_examples and i > num_examples:
+             break
+         parts = line.split('\t')
+         if len(parts) == 2:
+             source.append(parts[0].split(' '))
+             target.append(parts[1].split(' '))
+     return source, target
+
+
+ # Defined in file: ./chapter_recurrent-modern/machine-translation-and-dataset.md
+ def truncate_pad(line, num_steps, padding_token):
+     if len(line) > num_steps:
+         return line[:num_steps]  # Trim
+     return line + [padding_token] * (num_steps - len(line))  # Pad
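+ # Usage sketch: truncate_pad([1, 2, 3], 5, 0) -> [1, 2, 3, 0, 0]; truncate_pad([1, 2, 3], 2, 0) -> [1, 2]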
+
+
+ # Defined in file: ./chapter_recurrent-modern/machine-translation-and-dataset.md
+ def build_array(lines, vocab, num_steps, is_source):
+     lines = [vocab[l] for l in lines]
+     if not is_source:
+         lines = [[vocab['<bos>']] + l + [vocab['<eos>']] for l in lines]
+     array = torch.tensor([truncate_pad(
+         l, num_steps, vocab['<pad>']) for l in lines])
+     valid_len = (array != vocab['<pad>']).sum(dim=1)
+     return array, valid_len
+
+
+ # Defined in file: ./chapter_recurrent-modern/machine-translation-and-dataset.md
+ def load_data_nmt(batch_size, num_steps, num_examples=1000):
+     text = preprocess_nmt(read_data_nmt())
+     source, target = tokenize_nmt(text, num_examples)
+     src_vocab = d2l.Vocab(source, min_freq=3,
+                           reserved_tokens=['<pad>', '<bos>', '<eos>'])
+     tgt_vocab = d2l.Vocab(target, min_freq=3,
+                           reserved_tokens=['<pad>', '<bos>', '<eos>'])
+     src_array, src_valid_len = build_array(
+         source, src_vocab, num_steps, True)
+     tgt_array, tgt_valid_len = build_array(
+         target, tgt_vocab, num_steps, False)
+     data_arrays = (src_array, src_valid_len, tgt_array, tgt_valid_len)
+     data_iter = d2l.load_array(data_arrays, batch_size)
+     return src_vocab, tgt_vocab, data_iter
+
+
+ # Defined in file: ./chapter_recurrent-modern/encoder-decoder.md
+ class Encoder(nn.Module):
+     """The base encoder interface for the encoder-decoder architecture."""
+     def __init__(self, **kwargs):
+         super(Encoder, self).__init__(**kwargs)
+
+     def forward(self, X, *args):
+         raise NotImplementedError
+
+
+ # Defined in file: ./chapter_recurrent-modern/encoder-decoder.md
+ class Decoder(nn.Module):
+     """The base decoder interface for the encoder-decoder architecture."""
+     def __init__(self, **kwargs):
+         super(Decoder, self).__init__(**kwargs)
+
+     def init_state(self, enc_outputs, *args):
+         raise NotImplementedError
+
+     def forward(self, X, state):
+         raise NotImplementedError
+
+
+ # Defined in file: ./chapter_recurrent-modern/encoder-decoder.md
+ class EncoderDecoder(nn.Module):
+     """The base class for the encoder-decoder architecture."""
+     def __init__(self, encoder, decoder, **kwargs):
+         super(EncoderDecoder, self).__init__(**kwargs)
+         self.encoder = encoder
+         self.decoder = decoder
+
+     def forward(self, enc_X, dec_X, *args):
+         enc_outputs = self.encoder(enc_X, *args)
+         dec_state = self.decoder.init_state(enc_outputs, *args)
+         return self.decoder(dec_X, dec_state)
+
+
+ # Defined in file: ./chapter_recurrent-modern/seq2seq.md
+ class Seq2SeqEncoder(d2l.Encoder):
+     def __init__(self, vocab_size, embed_size, num_hiddens, num_layers,
+                  dropout=0, **kwargs):
+         super(Seq2SeqEncoder, self).__init__(**kwargs)
+         self.embedding = nn.Embedding(vocab_size, embed_size)
+         self.rnn = nn.LSTM(embed_size, num_hiddens, num_layers, dropout=dropout)
+
+     def forward(self, X, *args):
+         X = self.embedding(X)  # X shape: (batch_size, seq_len, embed_size)
+         # RNN needs the first axis to be the timestep, i.e., seq_len
+         X = X.permute(1, 0, 2)
+         out, state = self.rnn(X)  # When state is not given, it defaults to zeros
+         # out shape: (seq_len, batch_size, num_hiddens)
+         # state shape: (num_layers, batch_size, num_hiddens),
+         # where "state" contains the hidden state and the memory cell
+         return out, state
+
+
+ # Defined in file: ./chapter_recurrent-modern/seq2seq.md
+ class Seq2SeqDecoder(d2l.Decoder):
+     def __init__(self, vocab_size, embed_size, num_hiddens, num_layers,
+                  dropout=0, **kwargs):
+         super(Seq2SeqDecoder, self).__init__(**kwargs)
+         self.embedding = nn.Embedding(vocab_size, embed_size)
+         self.rnn = nn.LSTM(embed_size, num_hiddens, num_layers, dropout=dropout)
+         self.dense = nn.Linear(num_hiddens, vocab_size)
+
+     def init_state(self, enc_outputs, *args):
+         return enc_outputs[1]
+
+     def forward(self, X, state):
+         X = self.embedding(X).permute(1, 0, 2)
+         out, state = self.rnn(X, state)
+         # Make the batch the first dimension to simplify loss computation
+         out = self.dense(out).permute(1, 0, 2)
+         return out, state
+
+
+ # Defined in file: ./chapter_recurrent-modern/seq2seq.md
+ def sequence_mask(X, valid_len, value=0):
+     output = X.clone()
+     for count, matrix in enumerate(output):
+         matrix[int(valid_len[count]):] = value
+     return output
+
+
+ # Defined in file: ./chapter_recurrent-modern/seq2seq.md
+ class MaskedSoftmaxCELoss(nn.CrossEntropyLoss):
+     # pred shape: (batch_size, seq_len, vocab_size)
+     # label shape: (batch_size, seq_len)
+     # valid_len shape: (batch_size, )
+     def forward(self, pred, label, valid_len):
+         weights = torch.ones_like(label)
+         weights = sequence_mask(weights, valid_len)
+         self.reduction = 'none'
+         unweighted_loss = super(MaskedSoftmaxCELoss, self).forward(pred.permute(0, 2, 1), label)
+         weighted_loss = (unweighted_loss * weights).mean(dim=1)
+         return weighted_loss
+
+
+ # Defined in file: ./chapter_recurrent-modern/seq2seq.md
+ def train_s2s_ch9(model, data_iter, lr, num_epochs, device):
+     def xavier_init_weights(m):
+         if type(m) == nn.Linear:
+             torch.nn.init.xavier_uniform_(m.weight)
+         if type(m) == nn.LSTM:
+             for param in m._flat_weights_names:
+                 if "weight" in param:
+                     torch.nn.init.xavier_uniform_(m._parameters[param])
+     model.apply(xavier_init_weights)
+     model.to(device)
+     optimizer = torch.optim.Adam(model.parameters(), lr=lr)
+     loss = MaskedSoftmaxCELoss()
+     model.train()
+     animator = d2l.Animator(xlabel='epoch', ylabel='loss',
+                             xlim=[1, num_epochs], ylim=[0, 0.25])
+     for epoch in range(1, num_epochs + 1):
+         timer = d2l.Timer()
+         metric = d2l.Accumulator(2)  # loss_sum, num_tokens
+         for batch in data_iter:
+             X, X_vlen, Y, Y_vlen = [x.to(device) for x in batch]
+             Y_input, Y_label, Y_vlen = Y[:, :-1], Y[:, 1:], Y_vlen - 1
+             Y_hat, _ = model(X, Y_input, X_vlen, Y_vlen)
+             l = loss(Y_hat, Y_label, Y_vlen)
+             l.sum().backward()  # Make the loss scalar for backward()
+             d2l.grad_clipping(model, 1)
+             num_tokens = Y_vlen.sum()
+             optimizer.step()
+             with torch.no_grad():
+                 metric.add(l.sum(), num_tokens)
+         if epoch % 10 == 0:
+             animator.add(epoch, (metric[0]/metric[1],))
+     print(f'loss {metric[0] / metric[1]:.3f}, {metric[1] / timer.stop():.1f} '
+           f'tokens/sec on {str(device)}')
+
+
+ # Defined in file: ./chapter_recurrent-modern/seq2seq.md
+ def predict_s2s_ch9(model, src_sentence, src_vocab, tgt_vocab, num_steps,
+                     device):
+     src_tokens = src_vocab[src_sentence.lower().split(' ')]
+     enc_valid_len = torch.tensor([len(src_tokens)], device=device)
+     src_tokens = d2l.truncate_pad(src_tokens, num_steps, src_vocab['<pad>'])
+     enc_X = torch.tensor(src_tokens, dtype=torch.long, device=device)
+     # Add the batch size dimension
+     enc_outputs = model.encoder(torch.unsqueeze(enc_X, dim=0),
+                                 enc_valid_len)
+     dec_state = model.decoder.init_state(enc_outputs, enc_valid_len)
+     dec_X = torch.unsqueeze(torch.tensor([tgt_vocab['<bos>']], dtype=torch.long, device=device), dim=0)
+     predict_tokens = []
+     for _ in range(num_steps):
+         Y, dec_state = model.decoder(dec_X, dec_state)
+         # The token with the highest score is used as the next timestep input
+         dec_X = Y.argmax(dim=2)
+         py = dec_X.squeeze(dim=0).type(torch.int32).item()
+         if py == tgt_vocab['<eos>']:
+             break
+         predict_tokens.append(py)
+     return ' '.join(tgt_vocab.to_tokens(predict_tokens))
+
+
+ # Defined in file: ./chapter_attention-mechanisms/attention.md
+ def masked_softmax(X, valid_len):
+     """Perform softmax by filtering out some elements."""
+     # X: 3-D tensor, valid_len: 1-D or 2-D tensor
+     if valid_len is None:
+         return nn.functional.softmax(X, dim=-1)
+     else:
+         shape = X.shape
+         if valid_len.dim() == 1:
+             valid_len = torch.repeat_interleave(valid_len, repeats=shape[1],
+                                                 dim=0)
+         else:
+             valid_len = valid_len.reshape(-1)
+         # Fill masked elements with a large negative, whose exp is 0
+         X = d2l.sequence_mask(X.reshape(-1, shape[-1]), valid_len, value=-1e6)
+         return nn.functional.softmax(X.reshape(shape), dim=-1)
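+ # Usage sketch: masked_softmax(torch.rand(2, 2, 4), torch.tensor([2, 3])) returns row
+ # distributions in which, e.g., entries past position 2 in the first example are ~0.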
+
+
+ # Defined in file: ./chapter_attention-mechanisms/attention.md
+ class DotProductAttention(nn.Module):
+     def __init__(self, dropout, **kwargs):
+         super(DotProductAttention, self).__init__(**kwargs)
+         self.dropout = nn.Dropout(dropout)
+
+     # `query`: (`batch_size`, #queries, `d`)
+     # `key`: (`batch_size`, #kv_pairs, `d`)
+     # `value`: (`batch_size`, #kv_pairs, `dim_v`)
+     # `valid_len`: either (`batch_size`, ) or (`batch_size`, xx)
+     def forward(self, query, key, value, valid_len=None):
+         d = query.shape[-1]
+         # Swap the last two dimensions of key with transpose(1, 2)
+         scores = torch.bmm(query, key.transpose(1, 2)) / math.sqrt(d)
+         attention_weights = self.dropout(masked_softmax(scores, valid_len))
+         return torch.bmm(attention_weights, value)
+
+
+ # Defined in file: ./chapter_attention-mechanisms/attention.md
+ class MLPAttention(nn.Module):
+     def __init__(self, key_size, query_size, units, dropout, **kwargs):
+         super(MLPAttention, self).__init__(**kwargs)
+         self.W_k = nn.Linear(key_size, units, bias=False)
+         self.W_q = nn.Linear(query_size, units, bias=False)
+         self.v = nn.Linear(units, 1, bias=False)
+         self.dropout = nn.Dropout(dropout)
+
+     def forward(self, query, key, value, valid_len):
+         # Project queries with W_q and keys with W_k (W_q expects query_size
+         # inputs and W_k expects key_size inputs).
+         query, key = self.W_q(query), self.W_k(key)
+         # Expand query to (`batch_size`, #queries, 1, units), and key to
+         # (`batch_size`, 1, #kv_pairs, units). Then add them with broadcasting
+         features = query.unsqueeze(2) + key.unsqueeze(1)
+         scores = self.v(features).squeeze(-1)
+         attention_weights = self.dropout(masked_softmax(scores, valid_len))
+         return torch.bmm(attention_weights, value)
+
+
+ # Defined in file: ./chapter_optimization/optimization-intro.md
+ def annotate(text, xy, xytext):  #@save
+     d2l.plt.gca().annotate(text, xy=xy, xytext=xytext,
+                            arrowprops=dict(arrowstyle='->'))
+
+
+ # Defined in file: ./chapter_optimization/gd.md
+ def train_2d(trainer, steps=20):  #@save
+     """Optimize a 2-dim objective function with a customized trainer."""
+     # s1 and s2 are internal state variables and will
+     # be used later in the chapter
+     x1, x2, s1, s2 = -5, -2, 0, 0
+     results = [(x1, x2)]
+     for i in range(steps):
+         x1, x2, s1, s2 = trainer(x1, x2, s1, s2)
+         results.append((x1, x2))
+     return results
+
+
+ # Defined in file: ./chapter_optimization/gd.md
+ def show_trace_2d(f, results):  #@save
+     """Show the trace of 2D variables during optimization."""
+     d2l.set_figsize()
+     d2l.plt.plot(*zip(*results), '-o', color='#ff7f0e')
+     x1, x2 = d2l.meshgrid(d2l.arange(-5.5, 1.0, 0.1),
+                           d2l.arange(-3.0, 1.0, 0.1))
+     d2l.plt.contour(x1, x2, f(x1, x2), colors='#1f77b4')
+     d2l.plt.xlabel('x1')
+     d2l.plt.ylabel('x2')
+
+
+ # Alias defined in config.ini
+
+ ones = torch.ones
+ zeros = torch.zeros
+ tensor = torch.tensor
+ arange = torch.arange
+ meshgrid = torch.meshgrid
+ sin = torch.sin
+ sinh = torch.sinh
+ cos = torch.cos
+ cosh = torch.cosh
+ tanh = torch.tanh
+ linspace = torch.linspace
+ exp = torch.exp
+ log = torch.log
+ normal = torch.normal
+ matmul = torch.matmul
+ int32 = torch.int32
+ float32 = torch.float32
+ concat = torch.cat
+ stack = torch.stack
+ abs = torch.abs
+ numpy = lambda x, *args, **kwargs: x.detach().numpy(*args, **kwargs)
+ size = lambda x, *args, **kwargs: x.numel(*args, **kwargs)
+ reshape = lambda x, *args, **kwargs: x.reshape(*args, **kwargs)
+ to = lambda x, *args, **kwargs: x.to(*args, **kwargs)
+ reduce_sum = lambda x, *args, **kwargs: x.sum(*args, **kwargs)
+ argmax = lambda x, *args, **kwargs: x.argmax(*args, **kwargs)
+ astype = lambda x, *args, **kwargs: x.type(*args, **kwargs)
+ transpose = lambda x, *args, **kwargs: x.t(*args, **kwargs)