# This file is generated automatically through:
#    d2lbook build lib
# Don't edit it directly

# Defined in file: ./chapter_preface/index.md
import collections
from collections import defaultdict
from IPython import display
import math
from matplotlib import pyplot as plt
import os
import pandas as pd
import random
import re
import shutil
import sys
import tarfile
import time
import requests
import zipfile
import hashlib
# `d2l` is an alias for this very module, so `d2l.foo` below refers to the
# module-level name `foo` defined in this file.
d2l = sys.modules[__name__]


# Defined in file: ./chapter_preface/index.md
import numpy as np
import torch
import torchvision
from torch import nn
from torch.nn import functional as F
from torch.utils import data
from torchvision import transforms


# Defined in file: ./chapter_preliminaries/pandas.md
def mkdir_if_not_exist(path):  #@save
    """Make a directory if it does not exist.

    Args:
        path: A path string, or a sequence of path components that is
            joined with `os.path.join`.
    """
    if not isinstance(path, str):
        path = os.path.join(*path)
    if not os.path.exists(path):
        # `exist_ok=True` closes the window in which another process creates
        # the directory between the check above and this call.
        os.makedirs(path, exist_ok=True)


# Defined in file: ./chapter_preliminaries/calculus.md
def use_svg_display():  #@save
    """Use the svg format to display a plot in Jupyter."""
    display.set_matplotlib_formats('svg')


# Defined in file: ./chapter_preliminaries/calculus.md
def set_figsize(figsize=(3.5, 2.5)):  #@save
    """Set the figure size for matplotlib.

    Also switches the display format to svg via `use_svg_display`.
    """
    use_svg_display()
    d2l.plt.rcParams['figure.figsize'] = figsize


# Defined in file: ./chapter_preliminaries/calculus.md
def set_axes(axes, xlabel, ylabel, xlim, ylim, xscale, yscale, legend):
    """Set the axes for matplotlib.

    Applies labels, scales and limits to `axes`, adds a legend when
    `legend` is non-empty, and turns the grid on.
    """
    axes.set_xlabel(xlabel)
    axes.set_ylabel(ylabel)
    axes.set_xscale(xscale)
    axes.set_yscale(yscale)
    axes.set_xlim(xlim)
    axes.set_ylim(ylim)
    if legend:
        axes.legend(legend)
    axes.grid()


# Defined in file: ./chapter_preliminaries/calculus.md
def plot(X, Y=None, xlabel=None, ylabel=None, legend=None, xlim=None,
         ylim=None, xscale='linear', yscale='linear',
         fmts=('-', 'm--', 'g-.', 'r:'), figsize=(3.5, 2.5), axes=None):
    """Plot data points.

    `X` and `Y` may each be a single series or a list of series. When `Y`
    is omitted, `X` is plotted against its indices. One format string from
    `fmts` is consumed per series.
    """
    if legend is None:
        legend = []

    set_figsize(figsize)
    axes = axes if axes else d2l.plt.gca()

    # Return True if `X` (tensor or list) has 1 axis
    def has_one_axis(X):
        return (hasattr(X, "ndim") and X.ndim == 1 or isinstance(X, list)
                and not hasattr(X[0], "__len__"))

    if has_one_axis(X):
        X = [X]
    if Y is None:
        # Only one argument was given: treat it as the y-values.
        X, Y = [[]] * len(X), X
    elif has_one_axis(Y):
        Y = [Y]
    if len(X) != len(Y):
        # Reuse the same x-values for every y-series.
        X = X * len(Y)
    axes.cla()
    for x, y, fmt in zip(X, Y, fmts):
        if len(x):
            axes.plot(x, y, fmt)
        else:
            axes.plot(y, fmt)
    set_axes(axes, xlabel, ylabel, xlim, ylim, xscale, yscale, legend)


# Defined in file: ./chapter_linear-networks/linear-regression.md
class Timer:  #@save
    """Record multiple running times."""
    def __init__(self):
        self.times = []
        self.start()

    def start(self):
        """Start the timer."""
        self.tik = time.time()

    def stop(self):
        """Stop the timer and record the time in a list."""
        self.times.append(time.time() - self.tik)
        return self.times[-1]

    def avg(self):
        """Return the average time."""
        return sum(self.times) / len(self.times)

    def sum(self):
        """Return the sum of time."""
        return sum(self.times)

    def cumsum(self):
        """Return the accumulated time."""
        return np.array(self.times).cumsum().tolist()


# Defined in file: ./chapter_linear-networks/linear-regression-scratch.md
def synthetic_data(w, b, num_examples):  #@save
    """Generate y = Xw + b + noise.

    Returns the features `X` and the labels reshaped to a single column.
    """
    X = d2l.normal(0, 1, (num_examples, len(w)))
    y = d2l.matmul(X, w) + b
    y += d2l.normal(0, 0.01, y.shape)
    return X, d2l.reshape(y, (-1, 1))


# Defined in file: ./chapter_linear-networks/linear-regression-scratch.md
def linreg(X, w, b):  #@save
    """The linear regression model."""
    return d2l.matmul(X, w) + b


# Defined in file: ./chapter_linear-networks/linear-regression-scratch.md
def squared_loss(y_hat, y):  #@save
    """Squared loss.

    `y` is reshaped to the shape of `y_hat` before subtracting.
    """
    return (y_hat - d2l.reshape(y, y_hat.shape)) ** 2 / 2


# Defined in file: ./chapter_linear-networks/linear-regression-scratch.md
def sgd(params, lr, batch_size):  #@save
    """Minibatch stochastic gradient descent.

    Updates every parameter in place and then zeroes its gradient.
    """
    for param in params:
        param.data.sub_(lr*param.grad/batch_size)
        param.grad.data.zero_()


# Defined in file:
# ./chapter_linear-networks/linear-regression-concise.md
def load_array(data_arrays, batch_size, is_train=True):  #@save
    """Construct a PyTorch data iterator.

    The data is shuffled only when `is_train` is true.
    """
    dataset = data.TensorDataset(*data_arrays)
    return data.DataLoader(dataset, batch_size, shuffle=is_train)


# Defined in file: ./chapter_linear-networks/image-classification-dataset.md
def get_fashion_mnist_labels(labels):  #@save
    """Return text labels for the Fashion-MNIST dataset."""
    text_labels = ['t-shirt', 'trouser', 'pullover', 'dress', 'coat',
                   'sandal', 'shirt', 'sneaker', 'bag', 'ankle boot']
    return [text_labels[int(i)] for i in labels]


# Defined in file: ./chapter_linear-networks/image-classification-dataset.md
def show_images(imgs, num_rows, num_cols, titles=None, scale=1.5):  #@save
    """Plot a list of images.

    Returns the flattened array of axes that were drawn on.
    """
    figsize = (num_cols * scale, num_rows * scale)
    _, axes = d2l.plt.subplots(num_rows, num_cols, figsize=figsize)
    axes = axes.flatten()
    for i, (ax, img) in enumerate(zip(axes, imgs)):
        ax.imshow(d2l.numpy(img))
        ax.axes.get_xaxis().set_visible(False)
        ax.axes.get_yaxis().set_visible(False)
        if titles:
            ax.set_title(titles[i])
    return axes


# Defined in file: ./chapter_linear-networks/image-classification-dataset.md
def get_dataloader_workers():  #@save
    """Use 4 processes to read the data."""
    return 4


# Defined in file: ./chapter_linear-networks/image-classification-dataset.md
def load_data_fashion_mnist(batch_size, resize=None):  #@save
    """Download the Fashion-MNIST dataset and then load it into memory.

    Returns a `(train_iter, test_iter)` pair of data loaders. When `resize`
    is given, images are resized before being converted to tensors.
    """
    trans = [transforms.ToTensor()]
    if resize:
        trans.insert(0, transforms.Resize(resize))
    trans = transforms.Compose(trans)
    mnist_train = torchvision.datasets.FashionMNIST(
        root="../data", train=True, transform=trans, download=True)
    mnist_test = torchvision.datasets.FashionMNIST(
        root="../data", train=False, transform=trans, download=True)
    return (data.DataLoader(mnist_train, batch_size, shuffle=True,
                            num_workers=get_dataloader_workers()),
            data.DataLoader(mnist_test, batch_size, shuffle=False,
                            num_workers=get_dataloader_workers()))


# Defined in file: ./chapter_linear-networks/softmax-regression-scratch.md
def accuracy(y_hat, y):  #@save
    """Compute the number of correct predictions."""
    if len(y_hat.shape) > 1 and y_hat.shape[1] > 1:
        y_hat = d2l.argmax(y_hat, axis=1)
    cmp = d2l.astype(y_hat, y.dtype) == y
    return float(d2l.reduce_sum(d2l.astype(cmp, y.dtype)))


# Defined in file: ./chapter_linear-networks/softmax-regression-scratch.md
def evaluate_accuracy(net, data_iter):  #@save
    """Compute the accuracy for a model on a dataset."""
    if isinstance(net, torch.nn.Module):
        net.eval()  # Set the model to evaluation mode
    metric = Accumulator(2)  # No. of correct predictions, no. of predictions
    # Evaluation only needs forward passes, so skip building autograd graphs.
    with torch.no_grad():
        for X, y in data_iter:
            metric.add(accuracy(net(X), y), d2l.size(y))
    return metric[0] / metric[1]


# Defined in file: ./chapter_linear-networks/softmax-regression-scratch.md
class Accumulator:  #@save
    """For accumulating sums over `n` variables."""
    def __init__(self, n):
        self.data = [0.0] * n

    def add(self, *args):
        """Add one value to each running sum, position by position."""
        self.data = [a + float(b) for a, b in zip(self.data, args)]

    def reset(self):
        """Set every running sum back to zero."""
        self.data = [0.0] * len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]


# Defined in file: ./chapter_linear-networks/softmax-regression-scratch.md
def train_epoch_ch3(net, train_iter, loss, updater):  #@save
    """The training loop defined in Chapter 3.

    `updater` is either a `torch.optim.Optimizer` or a callable that takes
    the batch size. Returns `(training loss, training accuracy)` averaged
    over the examples seen.
    """
    # Set the model to training mode
    if isinstance(net, torch.nn.Module):
        net.train()
    # Sum of training loss, sum of training accuracy, no. of examples
    metric = Accumulator(3)
    for X, y in train_iter:
        # Compute gradients and update parameters
        y_hat = net(X)
        l = loss(y_hat, y)
        if isinstance(updater, torch.optim.Optimizer):
            updater.zero_grad()
            l.backward()
            updater.step()
            metric.add(float(l) * len(y), accuracy(y_hat, y),
                       y.size().numel())
        else:
            l.sum().backward()
            updater(X.shape[0])
            metric.add(float(l.sum()), accuracy(y_hat, y), y.numel())
    # Return training loss and training accuracy
    return metric[0] / metric[2], metric[1] / metric[2]


# Defined in file: ./chapter_linear-networks/softmax-regression-scratch.md
class Animator:  #@save
    """For plotting data in animation."""
    def __init__(self, xlabel=None, ylabel=None, legend=None, xlim=None,
                 ylim=None, xscale='linear', yscale='linear',
                 fmts=('-', 'm--', 'g-.', 'r:'), nrows=1, ncols=1,
                 figsize=(3.5, 2.5)):
        # Incrementally plot multiple lines
        if legend is None:
            legend = []
        d2l.use_svg_display()
        self.fig, self.axes = d2l.plt.subplots(nrows, ncols, figsize=figsize)
        if nrows * ncols == 1:
            self.axes = [self.axes, ]
        # Use a lambda function to capture arguments
        self.config_axes = lambda: d2l.set_axes(
            self.axes[0], xlabel, ylabel, xlim, ylim, xscale, yscale, legend)
        self.X, self.Y, self.fmts = None, None, fmts

    def add(self, x, y):
        """Append one point per line and redraw the first axes.

        A scalar `x` is shared by all lines; `None` entries are skipped.
        """
        # Add multiple data points into the figure
        if not hasattr(y, "__len__"):
            y = [y]
        n = len(y)
        if not hasattr(x, "__len__"):
            x = [x] * n
        if not self.X:
            self.X = [[] for _ in range(n)]
        if not self.Y:
            self.Y = [[] for _ in range(n)]
        for i, (a, b) in enumerate(zip(x, y)):
            if a is not None and b is not None:
                self.X[i].append(a)
                self.Y[i].append(b)
        self.axes[0].cla()
        for x, y, fmt in zip(self.X, self.Y, self.fmts):
            self.axes[0].plot(x, y, fmt)
        self.config_axes()
        display.display(self.fig)
        display.clear_output(wait=True)


# Defined in file: ./chapter_linear-networks/softmax-regression-scratch.md
def train_ch3(net, train_iter, test_iter, loss, num_epochs, updater):  #@save
    """Train a model (defined in Chapter 3).

    After training, asserts that the final loss and accuracies fall in the
    ranges checked below.
    """
    animator = Animator(xlabel='epoch', xlim=[1, num_epochs], ylim=[0.3, 0.9],
                        legend=['train loss', 'train acc', 'test acc'])
    for epoch in range(num_epochs):
        train_metrics = train_epoch_ch3(net, train_iter, loss, updater)
        test_acc = evaluate_accuracy(net, test_iter)
        animator.add(epoch + 1, train_metrics + (test_acc,))
    train_loss, train_acc = train_metrics
    assert train_loss < 0.5, train_loss
    assert train_acc <= 1 and train_acc > 0.7, train_acc
    assert test_acc <= 1 and test_acc > 0.7, test_acc


# Defined in file: ./chapter_linear-networks/softmax-regression-scratch.md
def predict_ch3(net, test_iter, n=6):  #@save
    """Predict labels (defined in Chapter 3).

    Shows the first `n` images of the first batch, titled with the true
    label followed by the predicted label.
    """
    for X, y in test_iter:
        break  # Only the first batch is used
    trues = d2l.get_fashion_mnist_labels(y)
    preds = d2l.get_fashion_mnist_labels(d2l.argmax(net(X), axis=1))
    titles = [true +'\n' + pred for true, pred in zip(trues, preds)]
    d2l.show_images(d2l.reshape(X[0:n], (n, 28, 28)), 1, n, titles=titles[0:n])


# Defined in file: ./chapter_multilayer-perceptrons/underfit-overfit.md
def evaluate_loss(net, data_iter, loss):  #@save
    """Evaluate the loss of a model on the given dataset."""
    metric = d2l.Accumulator(2)  # Sum of losses, no. of examples
    for X, y in data_iter:
        l = loss(net(X), y)
        metric.add(d2l.reduce_sum(l), d2l.size(l))
    return metric[0] / metric[1]


# Defined in file: ./chapter_multilayer-perceptrons/kaggle-house-price.md
DATA_HUB = dict()  #@save
DATA_URL = 'http://d2l-data.s3-accelerate.amazonaws.com/'  #@save


# Defined in file: ./chapter_multilayer-perceptrons/kaggle-house-price.md
# (The generated file repeated the identical `DATA_URL` assignment here.)


# Defined in file: ./chapter_multilayer-perceptrons/kaggle-house-price.md
def download(name, cache_dir=os.path.join('..', 'data')):  #@save
    """Download a file inserted into DATA_HUB, return the local filename.

    A cached copy is reused when its SHA-1 digest matches the one recorded
    in `DATA_HUB`.

    Raises:
        AssertionError: If `name` is not a key of `DATA_HUB`.
        requests.HTTPError: If the server answers with an error status.
    """
    assert name in DATA_HUB, f"{name} does not exist in {DATA_HUB}."
    url, sha1_hash = DATA_HUB[name]
    d2l.mkdir_if_not_exist(cache_dir)
    fname = os.path.join(cache_dir, url.split('/')[-1])
    if os.path.exists(fname):
        sha1 = hashlib.sha1()
        with open(fname, 'rb') as f:
            while True:
                chunk = f.read(1048576)
                if not chunk:
                    break
                sha1.update(chunk)
        if sha1.hexdigest() == sha1_hash:
            return fname  # Hit cache
    print(f'Downloading {fname} from {url}...')
    r = requests.get(url, stream=True, verify=True)
    # Fail loudly rather than caching an HTTP error page as the dataset.
    r.raise_for_status()
    with open(fname, 'wb') as f:
        # Stream to disk so a large file is never held in memory at once.
        for chunk in r.iter_content(chunk_size=1048576):
            f.write(chunk)
    return fname


# Defined in file: ./chapter_multilayer-perceptrons/kaggle-house-price.md
def download_extract(name, folder=None):  #@save
    """Download and extract a zip/tar file.

    Returns `folder` inside the download directory when given, otherwise
    the downloaded filename without its extension.

    Raises:
        AssertionError: If the file extension is not .zip, .tar or .gz.
    """
    fname = download(name)
    base_dir = os.path.dirname(fname)
    data_dir, ext = os.path.splitext(fname)
    if ext == '.zip':
        fp = zipfile.ZipFile(fname, 'r')
    elif ext in ('.tar', '.gz'):
        fp = tarfile.open(fname, 'r')
    else:
        # Raised explicitly so the check survives `python -O`.
        raise AssertionError('Only zip/tar files can be extracted.')
    # NOTE(review): `extractall` trusts member paths inside the archive;
    # only use this with archives from a trusted source.
    with fp:
        fp.extractall(base_dir)
    return os.path.join(base_dir, folder) if folder else data_dir


# Defined in file: ./chapter_multilayer-perceptrons/kaggle-house-price.md
def download_all():  #@save
    """Download all files in the DATA_HUB."""
    for name in DATA_HUB:
        download(name)


# Defined in file: ./chapter_multilayer-perceptrons/kaggle-house-price.md
DATA_HUB['kaggle_house_train'] = (  #@save
    DATA_URL + 'kaggle_house_pred_train.csv',
    '585e9cc93e70b39160e7921475f9bcd7d31219ce')


# Defined in file: ./chapter_multilayer-perceptrons/kaggle-house-price.md
DATA_HUB['kaggle_house_test'] = (  #@save
    DATA_URL + 'kaggle_house_pred_test.csv',
    'fa19780a7b011d9b009e8bff8e99922a8ee2eb90')


# Defined in file: ./chapter_deep-learning-computation/use-gpu.md
def try_gpu(i=0):  #@save
    """Return gpu(i) if exists, otherwise return cpu()."""
    if torch.cuda.device_count() >= i + 1:
        return torch.device(f'cuda:{i}')
    return torch.device('cpu')


# Defined in file: ./chapter_deep-learning-computation/use-gpu.md
def try_all_gpus():  #@save
    """Return all available GPUs, or [cpu(),] if no GPU exists."""
    ctxes = [torch.device(f'cuda:{i}')
             for i in range(torch.cuda.device_count())]
    return ctxes if ctxes else [torch.device('cpu')]


# Defined in file: ./chapter_convolutional-neural-networks/conv-layer.md
def corr2d(X, K):  #@save
    """Compute 2D cross-correlation."""
    h, w = K.shape
    Y = d2l.zeros((X.shape[0] - h + 1, X.shape[1] - w + 1))
    for i in range(Y.shape[0]):
        for j in range(Y.shape[1]):
            Y[i, j] = d2l.reduce_sum((X[i: i + h, j: j + w] * K))
    return Y


# Defined in file: ./chapter_convolutional-neural-networks/lenet.md
def evaluate_accuracy_gpu(net, data_iter, device=None):  #@save
    """Compute the accuracy of `net` on `data_iter`.

    When `device` is not given, the device of the first model parameter is
    used; each batch is moved there before the forward pass.
    """
    net.eval()  # Set the model to evaluation mode
    if not device:
        device = next(iter(net.parameters())).device
    metric = d2l.Accumulator(2)  # num_corrected_examples, num_examples
    # Evaluation only needs forward passes, so skip building autograd graphs.
    with torch.no_grad():
        for X, y in data_iter:
            X, y = X.to(device), y.to(device)
            metric.add(d2l.accuracy(net(X), y), d2l.size(y))
    return metric[0] / metric[1]


# Defined in file: ./chapter_convolutional-neural-networks/lenet.md
def train_ch6(net, train_iter, test_iter, num_epochs, lr,
              device=try_gpu()):
    """Train and evaluate a model with CPU or GPU.

    Note that the default `device` is evaluated once, when this function
    is defined.
    """
    def init_weights(m):
        if type(m) == nn.Linear or type(m) == nn.Conv2d:
            torch.nn.init.xavier_uniform_(m.weight)
    net.apply(init_weights)
    print('training on', device)
    net.to(device)
    optimizer = torch.optim.SGD(net.parameters(), lr=lr)
    loss = nn.CrossEntropyLoss()
    animator = d2l.Animator(xlabel='epoch', xlim=[0, num_epochs],
                            legend=['train loss', 'train acc', 'test acc'])
    timer = d2l.Timer()
    for epoch in range(num_epochs):
        metric = d2l.Accumulator(3)  # train_loss, train_acc, num_examples
        for i, (X, y) in enumerate(train_iter):
            timer.start()
            net.train()
            optimizer.zero_grad()
            X, y = X.to(device), y.to(device)
            y_hat = net(X)
            l = loss(y_hat, y)
            l.backward()
            optimizer.step()
            with torch.no_grad():
                metric.add(l*X.shape[0], d2l.accuracy(y_hat, y), X.shape[0])
            timer.stop()
            train_loss, train_acc = metric[0]/metric[2], metric[1]/metric[2]
            if (i+1) % 50 == 0:
                animator.add(epoch + i/len(train_iter),
                             (train_loss, train_acc, None))
        test_acc = evaluate_accuracy_gpu(net, test_iter)
        animator.add(epoch+1, (None, None, test_acc))
    print(f'loss {train_loss:.3f}, train acc {train_acc:.3f}, '
          f'test acc {test_acc:.3f}')
    print(f'{metric[2] * num_epochs / timer.sum():.1f} examples/sec '
          f'on {str(device)}')


# Defined in file: ./chapter_convolutional-modern/resnet.md
class Residual(nn.Module):  #@save
    """Two 3x3 convolutions with batch norm, plus a skip connection.

    With `use_1x1conv=True` the input goes through a 1x1 convolution
    (using `strides`) before it is added to the output.
    """
    def __init__(self, input_channels, num_channels,
                 use_1x1conv=False, strides=1):
        super().__init__()
        self.conv1 = nn.Conv2d(input_channels, num_channels,
                               kernel_size=3, padding=1, stride=strides)
        self.conv2 = nn.Conv2d(num_channels, num_channels,
                               kernel_size=3, padding=1)
        if use_1x1conv:
            self.conv3 = nn.Conv2d(input_channels, num_channels,
                                   kernel_size=1, stride=strides)
        else:
            self.conv3 = None
        self.bn1 = nn.BatchNorm2d(num_channels)
        self.bn2 = nn.BatchNorm2d(num_channels)
        self.relu = nn.ReLU(inplace=True)

    def forward(self, X):
        Y = F.relu(self.bn1(self.conv1(X)))
        Y = self.bn2(self.conv2(Y))
        if self.conv3:
            X = self.conv3(X)
        Y += X
        return F.relu(Y)


# Defined in file: ./chapter_recurrent-neural-networks/text-preprocessing.md
DATA_HUB['time_machine'] = (DATA_URL + 'timemachine.txt',
                            '090b5e7e70c295757f55df93cb0a180b9691891a')


# Defined in file: ./chapter_recurrent-neural-networks/text-preprocessing.md
def read_time_machine():  #@save
    """Load the time machine book into a list of sentences.

    Each line is stripped and lower-cased, and every run of non-letter
    characters is replaced by a single space.
    """
    with open(d2l.download('time_machine'), 'r') as f:
        lines = f.readlines()
    return [re.sub('[^A-Za-z]+', ' ', line.strip().lower())
            for line in lines]


# Defined in file: ./chapter_recurrent-neural-networks/text-preprocessing.md
def tokenize(lines, token='word'):  #@save
    """Split sentences into word or char tokens.

    For an unknown `token` type an error is printed and `None` is returned.
    """
    if token == 'word':
        return [line.split(' ') for line in lines]
    elif token == 'char':
        return [list(line) for line in lines]
    else:
        print('ERROR: unknown token type '+token)


# Defined in file: ./chapter_recurrent-neural-networks/text-preprocessing.md
class Vocab:  #@save
    """Map tokens to integer indices and back.

    Index 0 is the unknown-token slot; reserved tokens follow, then corpus
    tokens by descending frequency (ties broken by token order). Tokens
    with a frequency below `min_freq` are left out.
    """
    def __init__(self, tokens, min_freq=0, reserved_tokens=None):
        if reserved_tokens is None:
            reserved_tokens = []
        # Sort according to frequencies
        counter = count_corpus(tokens)
        self.token_freqs = sorted(counter.items(), key=lambda x: x[0])
        self.token_freqs.sort(key=lambda x: x[1], reverse=True)
        self.unk, uniq_tokens = 0, [''] + reserved_tokens
        # A set gives O(1) membership tests; the original scanned the list
        # once per corpus token.
        already_listed = set(uniq_tokens)
        uniq_tokens += [token for token, freq in self.token_freqs
                        if freq >= min_freq and token not in already_listed]
        self.idx_to_token = list(uniq_tokens)
        # If a token occurs more than once, its last position wins.
        self.token_to_idx = {token: idx
                             for idx, token in enumerate(uniq_tokens)}

    def __len__(self):
        return len(self.idx_to_token)

    def __getitem__(self, tokens):
        """Return the index of a token, or a list of indices for a list."""
        if not isinstance(tokens, (list, tuple)):
            return self.token_to_idx.get(tokens, self.unk)
        return [self.__getitem__(token) for token in tokens]

    def to_tokens(self, indices):
        """Return the token for an index, or a list of tokens for a list."""
        if not isinstance(indices, (list, tuple)):
            return self.idx_to_token[indices]
        return [self.idx_to_token[index] for index in indices]


# Defined in file: ./chapter_recurrent-neural-networks/text-preprocessing.md
def count_corpus(sentences):  #@save
    """Count how often each token occurs across all token lists."""
    # Flatten a list of token lists into a list of tokens
    tokens = [tk for line in sentences for tk in line]
    return collections.Counter(tokens)


# Defined in file: ./chapter_recurrent-neural-networks/text-preprocessing.md
def load_corpus_time_machine(max_tokens=-1):  #@save
    """Return the character-level index corpus and vocabulary of the book.

    The corpus is truncated to `max_tokens` entries when that is positive.
    """
    lines = read_time_machine()
    tokens = tokenize(lines, 'char')
    vocab = Vocab(tokens)
    corpus = [vocab[tk] for line in tokens for tk in line]
    if max_tokens > 0:
        corpus = corpus[:max_tokens]
    return corpus, vocab


# Defined in file: ./chapter_recurrent-neural-networks/language-models-and-dataset.md
def seq_data_iter_random(corpus, batch_size, num_steps):  #@save
    """Yield `(X, Y)` minibatches of subsequences in random order.

    `Y` holds the same subsequences as `X`, shifted by one position.
    """
    # Offset the iterator over the data for uniform starts
    corpus = corpus[random.randint(0, num_steps):]
    # Subtract 1 extra since we need to account for label
    num_examples = ((len(corpus) - 1) // num_steps)
    example_indices = list(range(0, num_examples * num_steps, num_steps))
    random.shuffle(example_indices)

    def subseq(pos):
        # This returns a sequence of length `num_steps` starting from `pos`
        return corpus[pos: pos + num_steps]

    # Discard half empty batches
    num_batches = num_examples // batch_size
    for i in range(0, batch_size * num_batches, batch_size):
        # `batch_size` indicates the random examples read each time
        batch_indices = example_indices[i:(i+batch_size)]
        X = [subseq(j) for j in batch_indices]
        Y = [subseq(j + 1) for j in batch_indices]
        yield d2l.tensor(X), d2l.tensor(Y)


# Defined in file: ./chapter_recurrent-neural-networks/language-models-and-dataset.md
def seq_data_iter_consecutive(corpus, batch_size, num_steps):  #@save
    """Yield `(X, Y)` minibatches whose rows continue across batches.

    `Y` holds the same positions as `X`, shifted by one.
    """
    # Offset for the iterator over the data for uniform starts
    offset = random.randint(0, num_steps)
    # Slice out data: ignore `num_steps` and just wrap around
    num_indices = ((len(corpus) - offset - 1) // batch_size) * batch_size
    Xs = d2l.tensor(corpus[offset:offset+num_indices])
    Ys = d2l.tensor(corpus[offset+1:offset+1+num_indices])
    Xs, Ys = Xs.reshape(batch_size, -1), Ys.reshape(batch_size, -1)
    num_batches = Xs.shape[1] // num_steps
    for i in range(0, num_batches * num_steps, num_steps):
        X = Xs[:, i:(i+num_steps)]
        Y = Ys[:, i:(i+num_steps)]
        yield X, Y


# Defined in file: ./chapter_recurrent-neural-networks/language-models-and-dataset.md
class SeqDataLoader:  #@save
    """A iterator to load sequence data."""
    def __init__(self, batch_size, num_steps, use_random_iter, max_tokens):
        if use_random_iter:
            self.data_iter_fn = d2l.seq_data_iter_random
        else:
            self.data_iter_fn = d2l.seq_data_iter_consecutive
        self.corpus, self.vocab = d2l.load_corpus_time_machine(max_tokens)
        self.batch_size, self.num_steps = batch_size, num_steps

    def __iter__(self):
        return self.data_iter_fn(self.corpus, self.batch_size, self.num_steps)


# Defined in file: ./chapter_recurrent-neural-networks/language-models-and-dataset.md
def load_data_time_machine(batch_size, num_steps,  #@save
                           use_random_iter=False, max_tokens=10000):
    """Return the time machine data iterator together with its vocabulary."""
    data_iter = SeqDataLoader(
        batch_size, num_steps, use_random_iter, max_tokens)
    return data_iter, data_iter.vocab


# Defined in file: ./chapter_recurrent-neural-networks/rnn-scratch.md
class RNNModelScratch:  #@save
    """A RNN Model based on scratch implementations."""
    def __init__(self, vocab_size, num_hiddens, device,
                 get_params, init_state, forward):
        self.vocab_size, self.num_hiddens = vocab_size, num_hiddens
        self.params = get_params(vocab_size, num_hiddens, device)
        self.init_state, self.forward_fn = init_state, forward

    def __call__(self, X, state):
        # One-hot encode the transposed input before the forward function.
        X = F.one_hot(X.T.long(), self.vocab_size).type(torch.float32)
        return self.forward_fn(X, state, self.params)

    def begin_state(self, batch_size, device):
        return self.init_state(batch_size, self.num_hiddens, device)


# Defined in file: ./chapter_recurrent-neural-networks/rnn-scratch.md
def predict_ch8(prefix, num_predicts, model, vocab, device):  #@save
    """Generate `num_predicts` tokens following `prefix`.

    Returns the prefix and the generated tokens joined into one string.
    """
    state = model.begin_state(batch_size=1, device=device)
    outputs = [vocab[prefix[0]]]
    get_input = lambda: torch.tensor(
        [outputs[-1]], device=device).reshape(1, 1)
    for y in prefix[1:]:  # Warmup state with prefix
        _, state = model(get_input(), state)
        outputs.append(vocab[y])
    for _ in range(num_predicts):  # Predict num_predicts steps
        Y, state = model(get_input(), state)
        outputs.append(int(Y.argmax(dim=1).reshape(1)))
    return ''.join([vocab.idx_to_token[i] for i in outputs])


# Defined in file: ./chapter_recurrent-neural-networks/rnn-scratch.md
def grad_clipping(model, theta):  #@save
    """Rescale gradients in place so their overall L2 norm is at most theta."""
    if isinstance(model, nn.Module):
        params = [p for p in model.parameters() if p.requires_grad]
    else:
        params = model.params
    norm = torch.sqrt(sum(torch.sum((p.grad ** 2)) for p in params))
    if norm > theta:
        for param in params:
            param.grad[:] *= theta / norm


# Defined in file: ./chapter_recurrent-neural-networks/rnn-scratch.md
def train_epoch_ch8(model, train_iter, loss, updater, device,  #@save
                    use_random_iter):
    """Train `model` for one epoch.

    `updater` is either a `torch.optim.Optimizer` or a callable accepting
    `batch_size`. Returns `(perplexity, tokens per second)`.
    """
    state, timer = None, d2l.Timer()
    metric = d2l.Accumulator(2)  # loss_sum, num_examples
    for X, Y in train_iter:
        if state is None or use_random_iter:
            # Initialize state when either it is the first iteration or
            # using random sampling.
            state = model.begin_state(batch_size=X.shape[0], device=device)
        else:
            for s in state:
                s.detach_()
        y = Y.T.reshape(-1)
        X, y = X.to(device), y.to(device)
        py, state = model(X, state)
        l = loss(py, y.long()).mean()
        if isinstance(updater, torch.optim.Optimizer):
            updater.zero_grad()
            l.backward()
            grad_clipping(model, 1)
            updater.step()
        else:
            l.backward()
            grad_clipping(model, 1)
            updater(batch_size=1)  # Since used mean already
        metric.add(l * d2l.size(y), d2l.size(y))
    return math.exp(metric[0]/metric[1]), metric[1]/timer.stop()


# Defined in file: ./chapter_recurrent-neural-networks/rnn-scratch.md
def train_ch8(model, train_iter, vocab, lr, num_epochs, device,
              use_random_iter=False):
    """Train a language model and print sample predictions."""
    # Initialize
    loss = nn.CrossEntropyLoss()
    animator = d2l.Animator(xlabel='epoch', ylabel='perplexity',
                            legend=['train'], xlim=[1, num_epochs])
    if isinstance(model, nn.Module):
        # Hand over the optimizer itself so that `train_epoch_ch8` takes its
        # Optimizer branch, which zeroes the gradients before every backward
        # pass. Wrapping `step()` in a lambda skipped that and let gradients
        # accumulate across minibatches.
        updater = torch.optim.SGD(model.parameters(), lr)
    else:
        updater = lambda batch_size: d2l.sgd(model.params, lr, batch_size)
    predict = lambda prefix: predict_ch8(prefix, 50, model, vocab, device)
    # Train and check the progress.
    for epoch in range(num_epochs):
        ppl, speed = train_epoch_ch8(
            model, train_iter, loss, updater, device, use_random_iter)
        if epoch % 10 == 0:
            print(predict('time traveller'))
            animator.add(epoch+1, [ppl])
    print(f'perplexity {ppl:.1f}, {speed:.1f} tokens/sec on {str(device)}')
    print(predict('time traveller'))
    print(predict('traveller'))


# Defined in file: ./chapter_recurrent-modern/machine-translation-and-dataset.md
DATA_HUB['fra-eng'] = (DATA_URL + 'fra-eng.zip',
                       '94646ad1522d915e7b0f9296181140edcf86a4f5')


# Defined in file: ./chapter_recurrent-modern/machine-translation-and-dataset.md
def read_data_nmt():
    """Download the 'fra-eng' archive and return the text of `fra.txt`."""
    data_dir = d2l.download_extract('fra-eng')
    # Decode explicitly: `preprocess_nmt` looks for '\u202f', which only
    # shows up when the file is read as UTF-8, and the platform default
    # encoding is not UTF-8 everywhere.
    with open(os.path.join(data_dir, 'fra.txt'), 'r', encoding='utf-8') as f:
        return f.read()


# Defined in file: ./chapter_recurrent-modern/machine-translation-and-dataset.md
def preprocess_nmt(text):
    """Lower-case the text and put a space before ',', '.' and '!'.

    Non-breaking spaces ('\\u202f', '\\xa0') are replaced by plain spaces
    first; no space is inserted when one already precedes the mark.
    """
    punctuation = set(',.!')

    def no_space(char, prev_char):
        return char in punctuation and prev_char != ' '

    text = text.replace('\u202f', ' ').replace('\xa0', ' ').lower()
    out = [' ' + char if i > 0 and no_space(char, text[i-1]) else char
           for i, char in enumerate(text)]
    return ''.join(out)


# Defined in file: ./chapter_recurrent-modern/machine-translation-and-dataset.md
def tokenize_nmt(text, num_examples=None):
    """Split tab-separated sentence pairs into source and target tokens.

    At most `num_examples` lines are read when it is given. Lines that do
    not consist of exactly two tab-separated parts are skipped.
    """
    source, target = [], []
    for i, line in enumerate(text.split('\n')):
        # `>=` stops after `num_examples` lines; `>` read one line too many.
        if num_examples and i >= num_examples:
            break
        parts = line.split('\t')
        if len(parts) == 2:
            source.append(parts[0].split(' '))
            target.append(parts[1].split(' '))
    return source, target


# Defined in file: ./chapter_recurrent-modern/machine-translation-and-dataset.md
def truncate_pad(line, num_steps, padding_token):
    """Trim or pad `line` to exactly `num_steps` items."""
    if len(line) > num_steps:
        return line[:num_steps]  # Trim
    return line + [padding_token] * (num_steps - len(line))  # Pad


# Defined in file: ./chapter_recurrent-modern/machine-translation-and-dataset.md
def build_array(lines, vocab, num_steps, is_source):
    """Convert token lists to an index tensor plus per-row valid lengths.

    Target lines (`is_source` false) get a marker index added at both ends
    before being trimmed or padded to `num_steps`.
    """
    # NOTE(review): the start, end and padding markers below are all the
    # same empty-string key, so they resolve to one index. Presumably
    # distinct marker tokens were intended -- confirm against upstream.
    lines = [vocab[l] for l in lines]
    if not is_source:
        lines = [[vocab['']] + l + [vocab['']] for l in lines]
    array = torch.tensor([truncate_pad(
        l, num_steps, vocab['']) for l in lines])
    valid_len = (array != vocab['']).sum(dim=1)
    return array, valid_len


# Defined in file: ./chapter_recurrent-modern/machine-translation-and-dataset.md
def load_data_nmt(batch_size, num_steps, num_examples=1000):
    """Return `(src_vocab, tgt_vocab, data_iter)` for the 'fra-eng' data."""
    text = preprocess_nmt(read_data_nmt())
    source, target = tokenize_nmt(text, num_examples)
    # NOTE(review): the three reserved tokens are identical empty strings,
    # so they share a single vocabulary index -- confirm against upstream.
    src_vocab = d2l.Vocab(source, min_freq=3,
                          reserved_tokens=['', '', ''])
    tgt_vocab = d2l.Vocab(target, min_freq=3,
                          reserved_tokens=['', '', ''])
    src_array, src_valid_len = build_array(
        source, src_vocab, num_steps, True)
    tgt_array, tgt_valid_len = build_array(
        target, tgt_vocab, num_steps, False)
    data_arrays = (src_array, src_valid_len, tgt_array, tgt_valid_len)
    data_iter = d2l.load_array(data_arrays, batch_size)
    return src_vocab, tgt_vocab, data_iter


# Defined in file: ./chapter_recurrent-modern/encoder-decoder.md
class Encoder(nn.Module):
    """The base encoder interface for the encoder-decoder architecture."""
    def __init__(self, **kwargs):
        super(Encoder, self).__init__(**kwargs)

    def forward(self, X, *args):
        raise NotImplementedError


# Defined in file: ./chapter_recurrent-modern/encoder-decoder.md
class Decoder(nn.Module):
    """The base decoder interface for the encoder-decoder architecture."""
    def __init__(self, **kwargs):
        super(Decoder, self).__init__(**kwargs)

    def init_state(self, enc_outputs, *args):
        raise NotImplementedError

    def forward(self, X, state):
        raise NotImplementedError


# Defined in file: ./chapter_recurrent-modern/encoder-decoder.md
class EncoderDecoder(nn.Module):
    """The base class for the encoder-decoder architecture."""
    def __init__(self, encoder, decoder, **kwargs):
        super(EncoderDecoder, self).__init__(**kwargs)
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, enc_X, dec_X, *args):
        enc_outputs = self.encoder(enc_X, *args)
        dec_state = self.decoder.init_state(enc_outputs, *args)
        return self.decoder(dec_X, dec_state)


# Defined in file: ./chapter_recurrent-modern/seq2seq.md
class Seq2SeqEncoder(d2l.Encoder): def __init__(self, vocab_size, embed_size, num_hiddens, num_layers, dropout=0, **kwargs): super(Seq2SeqEncoder, self).__init__(**kwargs) self.embedding = nn.Embedding(vocab_size, embed_size) self.rnn = nn.LSTM(embed_size, num_hiddens, num_layers, dropout=dropout) def forward(self, X, *args): X = self.embedding(X) # X shape: (batch_size, seq_len, embed_size) # RNN needs first axes to be timestep, i.e., seq_len X = X.permute(1, 0, 2) out, state = self.rnn(X) # When state is not mentioned, it defaults to zeros # out shape: (seq_len, batch_size, num_hiddens) # state shape: (num_layers, batch_size, num_hiddens), # where "state" contains the hidden state and the memory cell return out, state # Defined in file: ./chapter_recurrent-modern/seq2seq.md class Seq2SeqDecoder(d2l.Decoder): def __init__(self, vocab_size, embed_size, num_hiddens, num_layers, dropout=0, **kwargs): super(Seq2SeqDecoder, self).__init__(**kwargs) self.embedding = nn.Embedding(vocab_size, embed_size) self.rnn = nn.LSTM(embed_size, num_hiddens, num_layers, dropout=dropout) self.dense = nn.Linear(num_hiddens, vocab_size) def init_state(self, enc_outputs, *args): return enc_outputs[1] def forward(self, X, state): X = self.embedding(X).permute(1, 0, 2) out, state = self.rnn(X, state) # Make the batch to be the first dimension to simplify loss computation out = self.dense(out).permute(1, 0, 2) return out, state # Defined in file: ./chapter_recurrent-modern/seq2seq.md def sequence_mask(X, valid_len, value=0): output = X.clone() for count, matrix in enumerate(output): matrix[int(valid_len[count]):]=value return output # Defined in file: ./chapter_recurrent-modern/seq2seq.md class MaskedSoftmaxCELoss(nn.CrossEntropyLoss): # pred shape: (batch_size, seq_len, vocab_size) # label shape: (batch_size, seq_len) # valid_len shape: (batch_size, ) def forward(self, pred, label, valid_len): weights = torch.ones_like(label) weights = sequence_mask(weights, valid_len) 
self.reduction='none' unweighted_loss = super(MaskedSoftmaxCELoss, self).forward(pred.permute(0,2,1), label) weighted_loss = (unweighted_loss*weights).mean(dim=1) return weighted_loss # Defined in file: ./chapter_recurrent-modern/seq2seq.md def train_s2s_ch9(model, data_iter, lr, num_epochs, device): def xavier_init_weights(m): if type(m) == nn.Linear: torch.nn.init.xavier_uniform_(m.weight) if type(m) == nn.LSTM: for param in m._flat_weights_names: if "weight" in param: torch.nn.init.xavier_uniform_(m._parameters[param]) model.apply(xavier_init_weights) model.to(device) optimizer = torch.optim.Adam(model.parameters(), lr=lr) loss = MaskedSoftmaxCELoss() model.train() animator = d2l.Animator(xlabel='epoch', ylabel='loss', xlim=[1, num_epochs], ylim=[0, 0.25]) for epoch in range(1, num_epochs + 1): timer = d2l.Timer() metric = d2l.Accumulator(2) # loss_sum, num_tokens for batch in data_iter: X, X_vlen, Y, Y_vlen = [x.to(device) for x in batch] Y_input, Y_label, Y_vlen = Y[:, :-1], Y[:, 1:], Y_vlen-1 Y_hat, _ = model(X, Y_input, X_vlen, Y_vlen) l = loss(Y_hat, Y_label, Y_vlen) l.sum().backward() # Making the loss scalar for backward() d2l.grad_clipping(model, 1) num_tokens = Y_vlen.sum() optimizer.step() with torch.no_grad(): metric.add(l.sum(), num_tokens) if epoch % 10 == 0: animator.add(epoch, (metric[0]/metric[1],)) print(f'loss {metric[0] / metric[1]:.3f}, {metric[1] / timer.stop():.1f} ' f'tokens/sec on {str(device)}') # Defined in file: ./chapter_recurrent-modern/seq2seq.md def predict_s2s_ch9(model, src_sentence, src_vocab, tgt_vocab, num_steps, device): src_tokens = src_vocab[src_sentence.lower().split(' ')] enc_valid_len = torch.tensor([len(src_tokens)], device=device) src_tokens = d2l.truncate_pad(src_tokens, num_steps, src_vocab['']) enc_X = torch.tensor(src_tokens, dtype=torch.long, device=device) # Add the batch size dimension enc_outputs = model.encoder(torch.unsqueeze(enc_X, dim=0), enc_valid_len) dec_state = model.decoder.init_state(enc_outputs, 
enc_valid_len) dec_X = torch.unsqueeze(torch.tensor([tgt_vocab['']], dtype=torch.long, device=device), dim=0) predict_tokens = [] for _ in range(num_steps): Y, dec_state = model.decoder(dec_X, dec_state) # The token with highest score is used as the next timestep input dec_X = Y.argmax(dim=2) py = dec_X.squeeze(dim=0).type(torch.int32).item() if py == tgt_vocab['']: break predict_tokens.append(py) return ' '.join(tgt_vocab.to_tokens(predict_tokens)) # Defined in file: ./chapter_attention-mechanisms/attention.md def masked_softmax(X, valid_len): """Perform softmax by filtering out some elements.""" # X: 3-D tensor, valid_len: 1-D or 2-D tensor if valid_len is None: return nn.functional.softmax(X, dim=-1) else: shape = X.shape if valid_len.dim() == 1: valid_len = torch.repeat_interleave(valid_len, repeats=shape[1], dim=0) else: valid_len = valid_len.reshape(-1) # Fill masked elements with a large negative, whose exp is 0 X = d2l.sequence_mask(X.reshape(-1, shape[-1]), valid_len, value=-1e6) return nn.functional.softmax(X.reshape(shape), dim=-1) # Defined in file: ./chapter_attention-mechanisms/attention.md class DotProductAttention(nn.Module): def __init__(self, dropout, **kwargs): super(DotProductAttention, self).__init__(**kwargs) self.dropout = nn.Dropout(dropout) # `query`: (`batch_size`, #queries, `d`) # `key`: (`batch_size`, #kv_pairs, `d`) # `value`: (`batch_size`, #kv_pairs, `dim_v`) # `valid_len`: either (`batch_size`, ) or (`batch_size`, xx) def forward(self, query, key, value, valid_len=None): d = query.shape[-1] # Set transpose_b=True to swap the last two dimensions of key scores = torch.bmm(query, key.transpose(1,2)) / math.sqrt(d) attention_weights = self.dropout(masked_softmax(scores, valid_len)) return torch.bmm(attention_weights, value) # Defined in file: ./chapter_attention-mechanisms/attention.md class MLPAttention(nn.Module): def __init__(self, key_size, query_size, units, dropout, **kwargs): super(MLPAttention, self).__init__(**kwargs) self.W_k = 
nn.Linear(key_size, units, bias=False) self.W_q = nn.Linear(query_size, units, bias=False) self.v = nn.Linear(units, 1, bias=False) self.dropout = nn.Dropout(dropout) def forward(self, query, key, value, valid_len): query, key = self.W_k(query), self.W_q(key) # Expand query to (`batch_size`, #queries, 1, units), and key to # (`batch_size`, 1, #kv_pairs, units). Then plus them with broadcast features = query.unsqueeze(2) + key.unsqueeze(1) scores = self.v(features).squeeze(-1) attention_weights = self.dropout(masked_softmax(scores, valid_len)) return torch.bmm(attention_weights, value) # Defined in file: ./chapter_optimization/optimization-intro.md def annotate(text, xy, xytext): #@save d2l.plt.gca().annotate(text, xy=xy, xytext=xytext, arrowprops=dict(arrowstyle='->')) # Defined in file: ./chapter_optimization/gd.md def train_2d(trainer, steps=20): #@save """Optimize a 2-dim objective function with a customized trainer.""" # s1 and s2 are internal state variables and will # be used later in the chapter x1, x2, s1, s2 = -5, -2, 0, 0 results = [(x1, x2)] for i in range(steps): x1, x2, s1, s2 = trainer(x1, x2, s1, s2) results.append((x1, x2)) return results # Defined in file: ./chapter_optimization/gd.md def show_trace_2d(f, results): #@save """Show the trace of 2D variables during optimization.""" d2l.set_figsize() d2l.plt.plot(*zip(*results), '-o', color='#ff7f0e') x1, x2 = d2l.meshgrid(d2l.arange(-5.5, 1.0, 0.1), d2l.arange(-3.0, 1.0, 0.1)) d2l.plt.contour(x1, x2, f(x1, x2), colors='#1f77b4') d2l.plt.xlabel('x1') d2l.plt.ylabel('x2') # Alias defined in config.ini ones = torch.ones zeros = torch.zeros tensor = torch.tensor arange = torch.arange meshgrid = torch.meshgrid sin = torch.sin sinh = torch.sinh cos = torch.cos cosh = torch.cosh tanh = torch.tanh linspace = torch.linspace exp = torch.exp log = torch.log normal = torch.normal matmul = torch.matmul int32 = torch.int32 float32 = torch.float32 concat = torch.cat stack = torch.stack abs = torch.abs numpy = lambda 
x, *args, **kwargs: x.detach().numpy(*args, **kwargs) size = lambda x, *args, **kwargs: x.numel(*args, **kwargs) reshape = lambda x, *args, **kwargs: x.reshape(*args, **kwargs) to = lambda x, *args, **kwargs: x.to(*args, **kwargs) reduce_sum = lambda x, *args, **kwargs: x.sum(*args, **kwargs) argmax = lambda x, *args, **kwargs: x.argmax(*args, **kwargs) astype = lambda x, *args, **kwargs: x.type(*args, **kwargs) transpose = lambda x, *args, **kwargs: x.t(*args, **kwargs)