# DIY bigram-based name generator

## Setup and preprocessing

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


First, we load the raw data:

In [126]:
df = pd.read_csv("datasets/names/yob1999.csv")
df.columns = ["name", "gender", "freq"]
df.head()

Unnamed: 0,name,gender,freq
0,Hannah,F,21677
1,Alexis,F,19234
2,Sarah,F,19112
3,Samantha,F,19040
4,Ashley,F,18136


We get the top 1000 lower-cased female names:

In [127]:
# Clean data
df["name"] = df["name"].apply(lambda n: n.lower())
df = df[df["gender"] == "F"]
#df = df.iloc[:1000,:]
df 

Unnamed: 0,name,gender,freq
0,hannah,F,21677
1,alexis,F,19234
2,sarah,F,19112
3,samantha,F,19040
4,ashley,F,18136
...,...,...,...
16938,zohal,F,5
16939,zophia,F,5
16940,zuha,F,5
16941,zuhal,F,5


Now we write a function to convert names to character-level token arrays:

In [203]:
def char2idx(c):
    # Assumes valid input
    return ord(c) - ord("a") + 1

def idx2char(i):
    if i == 0:
        return ""
    elif i < 28:
        return(chr(i+ord("a")-1))
    else:
        return("ERR")

def name2ints(name):
    vals = [char2idx(c) for c in name]
    return [0, *vals]

def ints2name(arr):
    return ''.join(list(map(lambda c: idx2char(c), arr)))

ints2name(name2ints("hannah"))

'hannah'

In [204]:
char_ints = np.concatenate(df["name"].apply(name2ints).to_numpy())
#torch.tensor(char_ints)
char_ints = torch.tensor(char_ints)
char_ints


tensor([ 0,  8,  1,  ..., 21, 26, 21])

Finally, we one-hot encode the values:

In [205]:
chars_encoded = torch.nn.functional.one_hot(char_ints, num_classes=27).float()
chars_encoded.shape

torch.Size([124355, 27])

## Training loop

First, we init the weights:

In [110]:
W = torch.randn(27,27, requires_grad=True, dtype=torch.float)

Here are our hyperparameters:

In [172]:
LR = 0.1
BATCH_SIZE = 16

We split our data into batches:

Now we define our forward pass:

In [153]:
def forward(batch_idx):
    batch = chars_encoded[(BATCH_SIZE * batch_idx):(BATCH_SIZE * (batch_idx + 1))].float()
    # Matrix multiply to get preds!
    logits = batch @ W

    # Take softmax; dim=batches
    #preds = F.log_softmax(logits, dim=1)
    #preds = F.log_softmax(logits,dim=0)
    #print(preds.shape)
    print(logits.shape)
    return logits

Next, we define our eval function:

In [144]:
def calc_loss(preds, batch_idx):
    """Returns cross-entropy loss: Mean of negative log losses per label"""
    criterion = nn.CrossEntropyLoss()
    # True labels
    target_onehot = chars_encoded[(BATCH_SIZE * batch_idx + 1):(BATCH_SIZE * (batch_idx + 1) + 1)]
    #target_onehot = chars_encoded[(BATCH_SIZE * batch_idx):(BATCH_SIZE * (batch_idx + 1))].float()
    target = torch.argmax(target_onehot,dim=1)
    print(target)
    output = criterion(preds, target)
    print(f"Loss: {output}")
    W.grad = None
    output.backward()


Finally, we define the update function:

In [154]:
def update_weights():
    # WRONG BAD CODE just getting it out of my head
    global W
    with torch.no_grad():
        W -= LR * W.grad

We are now ready to drive our training loop:

In [206]:
N_EPOCHS = 3000
W = torch.randn(27,27, requires_grad=True, dtype=torch.float)
for batch_idx in range(0, N_EPOCHS):
    print(f"Batch: {batch_idx}")
    preds = forward(batch_idx)
    calc_loss(preds, batch_idx)
    with torch.no_grad():
        W -= LR * W.grad

Batch: 0
torch.Size([16, 27])
tensor([ 8,  1, 14, 14,  1,  8,  0,  1, 12,  5, 24,  9, 19,  0, 19,  1])
Loss: 4.044116020202637
Batch: 1
torch.Size([16, 27])
tensor([18,  1,  8,  0, 19,  1, 13,  1, 14, 20,  8,  1,  0,  1, 19,  8])
Loss: 4.120112895965576
Batch: 2
torch.Size([16, 27])
tensor([12,  5, 25,  0, 13,  1,  4,  9, 19, 15, 14,  0, 20,  1, 25, 12])
Loss: 3.7091774940490723
Batch: 3
torch.Size([16, 27])
tensor([15, 18,  0, 10,  5, 19, 19,  9,  3,  1,  0,  5, 12,  9, 26,  1])
Loss: 3.621640920639038
Batch: 4
torch.Size([16, 27])
tensor([ 2,  5, 20,  8,  0,  1, 12, 25, 19, 19,  1,  0, 12,  1, 21, 18])
Loss: 3.467242479324341
Batch: 5
torch.Size([16, 27])
tensor([ 5, 14,  0, 11,  1, 25, 12,  1,  0,  2, 18,  9,  1, 14, 14,  1])
Loss: 4.21565055847168
Batch: 6
torch.Size([16, 27])
tensor([ 0, 13,  5,  7,  1, 14,  0, 22,  9,  3, 20, 15, 18,  9,  1,  0])
Loss: 4.343563079833984
Batch: 7
torch.Size([16, 27])
tensor([ 5, 13, 13,  1,  0,  1,  2,  9,  7,  1,  9, 12,  0, 18,  1,  3])
Loss: 3.

## Inference

Now let's make some random names using our weights:

In [207]:
g = torch.Generator().manual_seed(100)

for i in range(50):
    out = []
    curr_char_idx = 0
    while True:
        xenc = F.one_hot(torch.tensor([curr_char_idx]), num_classes=27).float()
        # Prediction: "find char in matrix" and multiply to get next char
        logits = xenc @ W
        # Softmax of prob: "field of next"
        preds = F.softmax(logits,dim=1)
        curr_char_idx = torch.multinomial(preds, num_samples=1, replacement=True, generator=g).item()
        out.append(idx2char(curr_char_idx))
        if curr_char_idx == 0 and len(out) > 0:
            # End of name
            break
    print(''.join(out))


tae
mzplysielyxch
le
bma
kjwrgjoldslyrvimkiscjaheoudfimmmpwela
aiaiarkqwroxnlalerchxa
kucjrvwnvioyvswrzuzkjwrifbhsabeokkkfbuly
leoia
fvvlexviscna
ridfiausdamki
ha
dscxzkdply
jia
dba
tsexndbdypygmma
de
kafkmzayifara
chietgmho
e
a
quareckcnlanesra
wryoia
mza
b
aryuptph
ma
a
bafmlie
cha
jonwelerie
ali
egvwvjenzvlalieya
bvria
anioma
bvvyplea
briykjdrnrqyvfozuutkhatenaa
achelelhq

amadtlwrrrovsina
ozqkealerisekanlidochania
h
e
tfiauvjdenelia
usedcfhelwruuwekztfxdiae
ma
rrgcpwplalxamtajbtstpplyuta
bbelzepekig
txareyjdkexkcquwr
ma
xkbivvonvdcbhandri
