import torch
import pickle

# Character which represents the start and end of a word
TOKEN = '.'

# Reading the names into a list
words = open('data/names.txt', 'r').read().splitlines()

# Building a vocabulary out of all of the characters we have
vocab = sorted(list(set(''.join(words)) | {TOKEN}))

# Building a bigram table which will hold the counts for each bigram
n = len(vocab)
N = torch.zeros((n, n), dtype=torch.int32)

# Defining a mapping for characters to and from integers
char_to_int = {char: i for i, char in enumerate(vocab)}
int_to_char = {value: key for key, value in char_to_int.items()}

# Populating the bigram table (N) with counts
for word in words:
    chars = [TOKEN] + list(word) + [TOKEN]
    for ch1, ch2 in zip(chars, chars[1:]):
        ix1 = char_to_int[ch1]
        ix2 = char_to_int[ch2]
        N[ix1, ix2] += 1

# Normalising each row of counts so it represents a probability
# distribution over the next character
P = N.float()
P /= P.sum(1, keepdim=True)

# Serialising the probability table and the two mappings with pickle
with open('model/bigrams.pkl', 'wb') as file:
    pickle.dump([P, char_to_int, int_to_char], file)
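
# --- Usage sketch (not part of the original pipeline; an assumption about
# how the pickled model could be consumed). It reloads P and the mappings
# from model/bigrams.pkl and samples new names by walking the bigram chain:
# start at TOKEN, repeatedly draw the next character from row `ix` of P,
# and stop when TOKEN is drawn again.

with open('model/bigrams.pkl', 'rb') as file:
    P, char_to_int, int_to_char = pickle.load(file)

g = torch.Generator().manual_seed(2147483647)  # fixed seed for reproducibility

for _ in range(5):
    out = []
    ix = char_to_int[TOKEN]  # begin at the start-of-word token
    while True:
        # Row ix of P is the distribution over the next character
        ix = torch.multinomial(P[ix], num_samples=1, replacement=True, generator=g).item()
        if ix == char_to_int[TOKEN]:
            break  # end-of-word token sampled
        out.append(int_to_char[ix])
    print(''.join(out))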