import torch
import pickle

# Character which represents the start and end of a word
TOKEN = '.'

# Reading the names into a list
words = open('data/names.txt', 'r').read().splitlines()

# Building a vocabulary out of all of the characters we have
vocab = sorted(list(set(''.join(words)) | {TOKEN}))

# Building a bigram table which will hold the counts for each bigram
n = len(vocab)
N = torch.zeros((n, n), dtype=torch.int32)

# Defining a mapping for characters to and from integers
char_to_int = {char: i for i, char in enumerate(vocab)}
int_to_char = {value: key for key, value in char_to_int.items()}

# Populating the bigram table (N) with counts
for word in words:
    chars = [TOKEN] + list(word) + [TOKEN]
    for ch1, ch2 in zip(chars, chars[1:]):
        ix1 = char_to_int[ch1]
        ix2 = char_to_int[ch2]
        N[ix1, ix2] += 1

# Normalising each row of counts so it represents a probability
# distribution over the next character
P = N.float()
P /= P.sum(1, keepdim=True)

# Serialising the probability table and the two mappings with pickle
with open('model/bigrams.pkl', 'wb') as file:
    pickle.dump([P, char_to_int, int_to_char], file)
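
# --- Usage sketch (not part of the original pipeline; an assumption about
# how the pickled model could be consumed). It reloads P and the mappings
# from model/bigrams.pkl and samples new names by walking the bigram chain:
# start at TOKEN, repeatedly draw the next character from row `ix` of P,
# and stop when TOKEN is drawn again.

with open('model/bigrams.pkl', 'rb') as file:
    P, char_to_int, int_to_char = pickle.load(file)

g = torch.Generator().manual_seed(2147483647)  # fixed seed for reproducibility

for _ in range(5):
    out = []
    ix = char_to_int[TOKEN]  # begin at the start-of-word token
    while True:
        # Row ix of P is the distribution over the next character
        ix = torch.multinomial(P[ix], num_samples=1, replacement=True, generator=g).item()
        if ix == char_to_int[TOKEN]:
            break  # end-of-word token sampled
        out.append(int_to_char[ix])
    print(''.join(out))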