LuisDarioHinojosa commited on
Commit
b6e7e53
1 Parent(s): 98094a3

initial commit

Browse files
.gitattributes CHANGED
@@ -32,3 +32,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
32
  *.zip filter=lfs diff=lfs merge=lfs -text
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
32
  *.zip filter=lfs diff=lfs merge=lfs -text
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
35
+ sherlock_holmes_lstm_text_generator_state_dict.pth filter=lfs diff=lfs merge=lfs -text
advs.txt ADDED
The diff for this file is too large to render. See raw diff
 
app.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # test in gradio application
2
+ from assets.CharacterDataset import CharacterDataset # change in native
3
+ from assets.PredictionNetwork import PredictionNetwork, generate_text # change in native
4
+ from timeit import default_timer as timer
5
+ import numpy as np
6
+ import torch.nn.functional as F
7
+ import torch
8
+ import gradio as gr
9
+
10
# Locations of the corpus and of the trained weights (change in native).
samples_path = "advs.txt"
pretrained_weights_path = "sherlock_holmes_lstm_text_generator_state_dict.pth"

# Hyperparameters. They must match the checkpoint that is loaded below,
# otherwise load_state_dict() fails on mismatching tensor shapes.
vocab_size = 50
window_size = 40
embedding_dim = 512
linear_units = 64
lstm_units = 32
n_layers = 64
max_norm = 2

# Read the corpus. Every line keeps its own trailing newline and the lines are
# joined with one more newline; CharacterDataset later turns newlines into
# spaces.
with open(samples_path, "r") as samples_file:
    text = "\n".join(samples_file)

# The dataset is only used here for its character <-> index vocabulary.
dataset = CharacterDataset(
    text,
    window_size=window_size,
    vocab_size=vocab_size,
)

# Build the network with the same architecture the checkpoint was saved from.
model = PredictionNetwork(
    vocab_size=vocab_size,
    embedding_dimention=embedding_dim,
    linear_units=linear_units,
    lstm_units=lstm_units,
    max_norm=max_norm,
    n_layers=n_layers,
)

# Load the pretrained weights onto the CPU.
state_dict = torch.load(
    f=pretrained_weights_path,
    map_location=torch.device('cpu'),
)
model.load_state_dict(state_dict)
43
+
44
+ # text generation function
45
def generate_text(n_chars, initial_text):
    """Sample ``n_chars`` characters from the model, continuing ``initial_text``.

    This is the Gradio callback. It relies on the module-level ``model`` and
    ``dataset`` objects and shadows the ``generate_text`` helper imported from
    ``assets.PredictionNetwork`` at the top of the file.

    Parameters
        n_chars : int, float or None
            Number of characters to generate. The value is truncated with
            ``int()``; ``None`` (presumably an emptied number field - confirm
            against the gradio version in use) is treated as 0.
        initial_text : str
            Prompt used to warm up the LSTM state. It is returned unchanged at
            the start of the output.

    Returns
        res : str
            ``initial_text`` followed by the sampled characters.
        total_time : float
            Seconds spent in the sampling loop, measured with ``timer``.
    """
    n_steps = int(n_chars) if n_chars is not None else 0
    initial_text = initial_text or ""

    # An empty prompt would hand the LSTM a zero-length sequence, which it
    # rejects. Fall back to the first vocabulary entry as a neutral seed; the
    # seed is only fed to the model and is not part of the returned text.
    context = initial_text or dataset.vocabulary[0]

    generated = []
    model.eval()
    hidden, cell = None, None
    begin_time = timer()
    with torch.inference_mode():
        for _ in range(n_steps):
            # First step: the whole prompt. Later steps: only the character
            # sampled last, since the LSTM state carries the history.
            features = torch.LongTensor([[dataset.ch2idx[ch] for ch in context]])
            logits, hidden, cell = model(features, hidden, cell)
            probs = F.softmax(logits[0], dim=0).to("cpu").detach().numpy()
            # np.random.choice returns a numpy string; keep the result plain str.
            new_ch = str(np.random.choice(dataset.vocabulary, p=probs))
            generated.append(new_ch)
            context = new_ch
    end_time = timer()
    total_time = end_time - begin_time
    return initial_text + "".join(generated), total_time
61
+
62
# Texts shown around the demo.
title = "Sherlock Holmes text generator"
description = "An LSTM model trained to generate text using the anthology: [The Adventures Of Sherlock Holmes](https://sherlock-holm.es/stories/plain-text/advs.txt)."
article = "The model was created using Pytorch. It incorporates embedding matrices to code the features of the vocabulary extracted from the book, and it is a partial improvement performance wise from my last project based on [Shakespheare's plays](https://huggingface.co/spaces/Q-b1t/shakespeare_text_generation). I plan on developing another version that incorporates a full transformer in the course of the next few weeks."

# Default prompt pre-filled in the "Initial Text" box.
initial_text = "In the year 1878 I took my degree of Doctor of Medicine of the University of London,"

# Input widgets, in the order generate_text() receives them.
sequence_length_input = gr.Number(
    value=200,
    label="Sequence Length",
    info="Length of the sample sequence you wish to generate.",
)
initial_text_input = gr.TextArea(lines=5, label="Initial Text", value=initial_text)

# Output widgets, in the order generate_text() returns its values.
sequence_output = gr.TextArea(lines=5, label="Sequence Output")
execution_time_output = gr.Number(label="Execution Time (seconds)")

# Wire everything together and start the application.
demo = gr.Interface(
    fn=generate_text,
    inputs=[sequence_length_input, initial_text_input],
    outputs=[sequence_output, execution_time_output],
    title=title,
    description=description,
    article=article,
)

demo.launch()
assets/CharacterDataset.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from torch.utils.data import Dataset
3
+ from collections import Counter,defaultdict
4
+
5
class CharacterDataset(Dataset):
    """Character-level dataset of (window, next character) pairs.

    Parameters
        text : str
            Input text used to build the vocabulary and the samples. Newlines
            are replaced by spaces before anything else is computed.
        window_size : int
            Number of consecutive characters used as input features for one
            prediction.
        vocab_size : int
            Number of entries in the vocabulary. The last index
            (``vocab_size - 1``) is reserved for every character that did not
            make it into the vocabulary and is represented by ``"~"``.

    Attributes
        ch2idx : defaultdict
            Maps a character to its vocabulary index. Unknown characters map
            to ``vocab_size - 1`` (looking one up also stores it in the dict).
        idx2ch : dict
            Maps a vocabulary index back to its character.
        vocabulary : list
            The characters ordered by index; ``vocabulary[-1]`` is ``"~"``.
    """

    def __init__(self, text, window_size=1, vocab_size=50):
        super().__init__()
        assert len(set(text)) > vocab_size, "the vocabulary size must be smaller than the number of tokens in the text"

        unknown_idx = vocab_size - 1

        # instance the class attributes
        self.text = text.replace("\n", " ")
        self.window_size = window_size
        self.ch2idx = defaultdict(lambda: unknown_idx)

        # Rank the characters by frequency and keep the most common ones.
        # "~" is the placeholder of the unknown slot, so it must not also take
        # a regular slot: assigning it below would otherwise overwrite that
        # index, leave a hole in idx2ch and break the vocabulary list.
        ranked = [ch for ch, _ in Counter(self.text).most_common() if ch != "~"]
        self.ch2idx.update({ch: i for i, ch in enumerate(ranked[:unknown_idx])})

        # add the unknown-character placeholder
        self.ch2idx["~"] = unknown_idx

        # create the reverse dictionary
        self.idx2ch = {idx: ch for ch, idx in self.ch2idx.items()}

        # create the vocabulary, ordered by index
        self.vocabulary = [self.idx2ch[i] for i in range(vocab_size)]

    def __len__(self):
        """Return the number of windows that still have a following character."""
        return len(self.text) - self.window_size

    def __getitem__(self, ix):
        """Return the encoded window starting at ``ix`` and the next character's index.

        Returns
            X : torch.LongTensor
                Indices of the ``window_size`` characters starting at ``ix``.
            y : int
                Index of the character that follows the window.
        """
        window = self.text[ix : ix + self.window_size]
        X = torch.LongTensor([self.ch2idx[c] for c in window])
        y = self.ch2idx[self.text[ix + self.window_size]]
        return X, y
assets/PredictionNetwork.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import numpy as np
4
+
5
class PredictionNetwork(nn.Module):
    """Embedding -> stacked LSTM -> two linear layers, predicting the next character.

    Parameters
        vocab_size : int
            The number of characters in the vocabulary.

        embedding_dimention : int
            Size of the embedding vector of each token.

        linear_units : int
            Number of hidden units in the first linear layer.

        lstm_units : int
            Number of hidden units in each LSTM layer.

        max_norm : int
            Embedding vectors whose L2 norm exceeds 'max_norm' are rescaled.

        n_layers : int
            Number of stacked LSTM layers.
    """

    def __init__(
        self,
        vocab_size,
        embedding_dimention=32,
        linear_units=64,
        lstm_units=8,
        max_norm=2,
        n_layers=2,
    ):
        super().__init__()

        # Token embeddings; the last index is registered as the padding index.
        self.embedding_matrix = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_dimention,
            padding_idx=vocab_size - 1,
            max_norm=max_norm,
            norm_type=2,
        )

        # Recurrent block operating on (batch, sequence, feature) tensors.
        self.lstm_block = nn.LSTM(
            input_size=embedding_dimention,
            hidden_size=lstm_units,
            num_layers=n_layers,
            batch_first=True,
        )

        # Classification head: two linear layers back to back.
        to_hidden = nn.Linear(in_features=lstm_units, out_features=linear_units)
        to_vocab = nn.Linear(in_features=linear_units, out_features=vocab_size)
        self.classifier = nn.Sequential(to_hidden, to_vocab)

    def forward(self, x, h=None, c=None):
        """Run one forward pass.

        inputs
            x : torch.Tensor
                Input tensor of shape (batch_size,window_size)
            h,c : torch.Tensor or None
                Previous LSTM states. They are used only when both are given;
                otherwise the LSTM starts from its default initial state.

        returns
            logits : torch.Tensor
                Tensor of shape (batch_size,vocab_size)

            h,c : torch.Tensor
                LSTM states after consuming ``x``.
        """
        embedded = self.embedding_matrix(x)

        # Passing None lets nn.LSTM fall back to its default initial state.
        previous_state = None if (h is None or c is None) else (h, c)
        _, (h, c) = self.lstm_block(embedded, previous_state)

        # Average the final hidden state over the layer dimension, then classify.
        logits = self.classifier(h.mean(dim=0))
        return logits, h, c
68
+
69
+
70
def generate_text(n_chars, model, dataset, device, initial_text="W.A.T.S.O.N", random_state=None):
    """Sample ``n_chars`` characters from ``model``, continuing ``initial_text``.

    Parameters
        n_chars : int
            Number of characters to generate.
        model : nn.Module
            Network called as ``model(features, h, c)`` and returning
            ``(logits, h, c)``.
        dataset : object
            Provides ``ch2idx`` (character -> index) and ``vocabulary``
            (list of characters ordered by index).
        device : str or torch.device
            Device the input features are moved to.
        initial_text : str
            Prompt used to warm up the recurrent state.
        random_state : int or None
            When given, seeds numpy's global random generator so that the
            sampling is reproducible.

    Returns
        res : str
            ``initial_text`` followed by the generated characters.
    """
    model.eval()

    if random_state is not None:
        np.random.seed(random_state)

    # An empty prompt would hand the model a zero-length sequence. Fall back to
    # the first vocabulary entry as a seed; the seed is only fed to the model
    # and is not part of the returned text.
    context = initial_text or dataset.vocabulary[0]

    generated = []
    hidden, cell = None, None
    with torch.inference_mode():
        for _ in range(n_chars):
            # First step: the whole prompt. Later steps: only the character
            # sampled last, since the recurrent state carries the history.
            features = torch.LongTensor([[dataset.ch2idx[ch] for ch in context]]).to(device)
            logits, hidden, cell = model(features, hidden, cell)
            # torch.softmax is used because this module never imports
            # torch.nn.functional under the name F.
            probs = torch.softmax(logits[0], dim=0).to("cpu").detach().numpy()
            # np.random.choice returns a numpy string; keep the result plain str.
            new_ch = str(np.random.choice(dataset.vocabulary, p=probs))
            generated.append(new_ch)
            context = new_ch
    return initial_text + "".join(generated)
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ torch==2.0.0
2
+ numpy==1.22.4
3
+ gradio==3.30.0
sherlock_holmes_lstm_text_generator_state_dict.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3de74bb4a4c22283c32ddf4b1691ac6cada188e1178720d16ae3ecedb0712350
3
+ size 2564047