LuisDarioHinojosa
committed on
Commit
•
b6e7e53
1
Parent(s):
98094a3
initial commit
Browse files- .gitattributes +1 -0
- advs.txt +0 -0
- app.py +77 -0
- assets/CharacterDataset.py +52 -0
- assets/PredictionNetwork.py +86 -0
- requirements.txt +3 -0
- sherlock_holmes_lstm_text_generator_state_dict.pth +3 -0
.gitattributes
CHANGED
@@ -32,3 +32,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
32 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
33 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
34 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
32 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
33 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
34 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
35 |
+
sherlock_holmes_lstm_text_generator_state_dict.pth filter=lfs diff=lfs merge=lfs -text
|
advs.txt
ADDED
The diff for this file is too large to render.
See raw diff
|
|
app.py
ADDED
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# test in gradio application
|
2 |
+
from assets.CharacterDataset import CharacterDataset # change in native
|
3 |
+
from assets.PredictionNetwork import PredictionNetwork, generate_text # change in native
|
4 |
+
from timeit import default_timer as timer
|
5 |
+
import numpy as np
|
6 |
+
import torch.nn.functional as F
|
7 |
+
import torch
|
8 |
+
import gradio as gr
|
9 |
+
|
10 |
+
# samples path
|
11 |
+
samples_path = "advs.txt" # change in native
# pretrained weights path
pretrained_weights_path = "sherlock_holmes_lstm_text_generator_state_dict.pth" # change in native

# hyperparameters
# load_state_dict() below needs parameter shapes that match the checkpoint, so
# these values presumably mirror the training configuration -- verify before
# changing any of them.
vocab_size = 50
window_size = 40
embedding_dim = 512
linear_units = 64
lstm_units = 32
n_layers = 64
max_norm = 2

# get the text from the textfile
# The encoding is pinned so the corpus decodes the same way regardless of the
# platform's default locale.
with open(samples_path, "r", encoding="utf-8") as f:
    # NOTE(review): readlines() keeps each line's trailing "\n", so joining on
    # "\n" doubles every line break. This is left untouched on purpose: the
    # vocabulary is derived from this exact string, and the checkpoint was
    # presumably trained with the same preprocessing -- confirm before "fixing".
    text = "\n".join(f.readlines())

# instance the dataset (used at inference time for its ch2idx / vocabulary mappings)
dataset = CharacterDataset(text, window_size=window_size, vocab_size=vocab_size)

# instance the model
model = PredictionNetwork(
    vocab_size=vocab_size,
    embedding_dimention=embedding_dim,
    linear_units=linear_units,
    lstm_units=lstm_units,
    max_norm=max_norm,
    n_layers=n_layers,
)

# load the pretrained weights onto the CPU
model.load_state_dict(torch.load(f=pretrained_weights_path, map_location=torch.device('cpu')))
|
43 |
+
|
44 |
+
# text generation function
|
45 |
+
def generate_text(n_chars,initial_text):
    """
    Sample `n_chars` characters from the module-level model, seeded with `initial_text`.

    Parameters
    n_chars : int or float
        Number of characters to generate. It is truncated with int(); values
        below 1 generate nothing.
    initial_text : str
        Prompt fed to the model on the first step. Every later step feeds only
        the previously sampled character together with the carried LSTM state.

    Returns
    text : str
        `initial_text` followed by the sampled characters.
    total_time : float
        Seconds spent in the sampling loop, measured with timeit.default_timer.

    Raises
    gr.Error
        If `n_chars` is None or `initial_text` is empty.
    """
    # Fail with a readable UI message instead of a TypeError from int(None).
    if n_chars is None:
        raise gr.Error("Please provide the number of characters to generate.")
    # An empty prompt would produce a zero-length input sequence for the model.
    if not initial_text:
        raise gr.Error("Please provide a non-empty initial text.")

    n_steps = int(n_chars)
    sampled = []
    model.eval()
    h, c = None, None
    # The whole prompt primes the hidden state once; afterwards only the last
    # sampled character is fed back.
    prev_chars = initial_text
    begin_time = timer()
    with torch.inference_mode():
        for _ in range(n_steps):
            features = torch.LongTensor([[dataset.ch2idx[ch] for ch in prev_chars]])
            logits, h, c = model(features, h, c)
            probs = F.softmax(logits[0], dim=0).to("cpu").detach().numpy()
            new_ch = np.random.choice(dataset.vocabulary, p=probs)
            sampled.append(new_ch)
            prev_chars = new_ch
    end_time = timer()
    total_time = end_time - begin_time
    # Join once at the end instead of growing the string inside the loop.
    return initial_text + "".join(sampled), total_time
|
61 |
+
|
62 |
+
# create gradio application
|
63 |
+
title = "Sherlock Holmes text generator"
description = "An LSTM model trained to generate text using the anthology: [The Adventures Of Sherlock Holmes](https://sherlock-holm.es/stories/plain-text/advs.txt)."
article = "The model was created using Pytorch. It incorporates embedding matrices to code the features of the vocabulary extracted from the book, and it is a partial improvement performance wise from my last project based on [Shakespheare's plays](https://huggingface.co/spaces/Q-b1t/shakespeare_text_generation). I plan on developing another version that incorporates a full transformer in the course of the next few weeks."

# default prompt shown in the "Initial Text" box
initial_text = "In the year 1878 I took my degree of Doctor of Medicine of the University of London,"

# input widgets, in the positional order expected by generate_text(n_chars, initial_text)
interface_inputs = [
    gr.Number(
        value=200,
        label="Sequence Length",
        info="Length of the sample sequence you wish to generate.",
    ),
    gr.TextArea(lines=5, label="Initial Text", value=initial_text),
]

# output widgets, in the order of generate_text's return values (text, seconds)
interface_outputs = [
    gr.TextArea(lines=5, label="Sequence Output"),
    gr.Number(label="Execution Time (seconds)"),
]

demo = gr.Interface(
    fn=generate_text,
    inputs=interface_inputs,
    outputs=interface_outputs,
    title=title,
    description=description,
    article=article,
)

demo.launch()
|
assets/CharacterDataset.py
ADDED
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
from torch.utils.data import Dataset
|
3 |
+
from collections import Counter,defaultdict
|
4 |
+
|
5 |
+
class CharacterDataset(Dataset):
    """
    Character-level dataset that yields (window, next character) pairs from a text.

    Parameters
    text: str
        Input text that will be used to create the dataset's vocabulary.
        Newlines are replaced by spaces before anything else is computed.
    window size: int
        Number of characters of a sequence to use as input features to make a prediction
    vocab size: int
        Number of characters in the vocabulary. The last index is the default
        for characters absent from the vocabulary

    Attributes
    ch2idx:
        defaultdict mapping a character to its index in the vocabulary.
        Unknown characters resolve to `vocab_size - 1`; note that looking up an
        unknown character also stores it in the mapping (defaultdict behaviour).
    idx2ch:
        Mapping of the index in the vocabulary to the character. The unknown
        index maps to "~".
    vocabulary: list
        Characters ordered by index, i.e. `vocabulary[i] == idx2ch[i]`
    """
    def __init__(self, text, window_size=1, vocab_size=50):
        super().__init__()
        assert len(list(set(text))) > vocab_size, "the vocabulary size must be smaller than the number of tokens in the text"
        # instance the class attributes
        self.text = text.replace("\n", " ")
        self.window_size = window_size
        unknown_idx = vocab_size - 1
        self.ch2idx = defaultdict(lambda: unknown_idx)

        # create the vocabulary based on the character frequency in the text,
        # truncated so that the last index stays free for unknown characters
        most_common_ch2idx = {
            x[0]: i
            for i, x in enumerate(Counter(self.text).most_common()[: (vocab_size - 1)])
        }
        self.ch2idx.update(most_common_ch2idx)

        # register "~" as the printable stand-in for unknown characters.
        # setdefault keeps the real index when "~" is itself one of the frequent
        # characters; overwriting it would leave a hole in idx2ch and make the
        # vocabulary construction below fail with KeyError.
        self.ch2idx.setdefault("~", unknown_idx)

        # create the reverse dictionary from the frequent characters only, then
        # pin the unknown slot explicitly so every index in range(vocab_size)
        # is guaranteed to be present
        self.idx2ch = {idx: ch for ch, idx in most_common_ch2idx.items()}
        self.idx2ch[unknown_idx] = "~"

        # create the vocabulary
        self.vocabulary = [self.idx2ch[i] for i in range(vocab_size)]

    def __len__(self):
        # one sample per position that still has a target character after its window
        return len(self.text) - self.window_size

    def __getitem__(self, ix):
        # features: indices of the window starting at ix; target: index of the next character
        X = torch.LongTensor([self.ch2idx[c] for c in self.text[ix : ix + self.window_size]])
        y = self.ch2idx[self.text[ix + self.window_size]]
        return X, y
|
assets/PredictionNetwork.py
ADDED
@@ -0,0 +1,86 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
import torch.nn as nn
|
3 |
+
import numpy as np
|
4 |
+
|
5 |
+
class PredictionNetwork(nn.Module):
    """
    Embedding -> stacked LSTM -> two linear layers, producing next-character logits.

    Parameters
    vocab_size : int
        The number of characters in a vocabulary. The last index
        (vocab_size - 1) is used as the embedding's padding index.

    embedding_dimention : int
        Dimension of the embedding vectors for each token in the dictionary

    linear_units : int
        Number of hidden units in the linear layer

    lstm_units : int
        Number of hidden units in the lstm layer

    max_norm : int
        If any of the embedding vectors has a higher L2 norm than 'max_norm' it is rescaled.

    n_layers : int
        Number of lstm layers
    """

    def __init__(self,vocab_size,embedding_dimention = 32, linear_units = 64,lstm_units = 8, max_norm = 2,n_layers = 2):
        super().__init__()

        # character index -> dense vector
        self.embedding_matrix = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_dimention,
            padding_idx=vocab_size - 1,
            max_norm=max_norm,
            norm_type=2,
        )

        # recurrent block; inputs are batch-first
        self.lstm_block = nn.LSTM(
            input_size=embedding_dimention,
            hidden_size=lstm_units,
            num_layers=n_layers,
            batch_first=True,
        )

        # two stacked linear layers with no activation in between
        hidden_layer = nn.Linear(in_features=lstm_units, out_features=linear_units)
        output_layer = nn.Linear(in_features=linear_units, out_features=vocab_size)
        self.classifier = nn.Sequential(hidden_layer, output_layer)

    def forward(self,x,h = None,c = None):
        """
        inputs
        x : torch.Tensor
            Input tensor of shape (batch_size,window_size)
        h,c : torch.Tensor or None
            Hidden states of the LSTM. They are only used when both are given.

        returns
        logits : torch.Tensor
            Tensor of shape (batch_size,vocab_size)

        h,c : torch.Tensor
            Updated hidden states of the LSTM
        """
        embedded = self.embedding_matrix(x)

        # resume from the supplied state only when it is complete; passing None
        # lets the LSTM start from its default initial state
        previous_state = (h, c) if (h is not None and c is not None) else None
        _, (h, c) = self.lstm_block(embedded, previous_state)

        # average the final hidden state over the LSTM layers before classifying
        layer_averaged = h.mean(dim=0)
        return self.classifier(layer_averaged), h, c
|
68 |
+
|
69 |
+
|
70 |
+
def generate_text(n_chars,model,dataset,device,initial_text = "W.A.T.S.O.N", random_state = None):
    """
    Sample `n_chars` characters from `model`, seeded with `initial_text`.

    Parameters
    n_chars : int
        Number of characters to generate
    model : callable
        Called as model(features, h, c); must return (logits, h, c) and expose eval()
    dataset : object
        Must provide `ch2idx` (character -> index) and `vocabulary`
        (list of characters ordered by index)
    device : str or torch.device
        Device the input features are moved to
    initial_text : str
        Prompt fed on the first step; later steps feed only the last sampled character
    random_state : int or None
        If given, seeds numpy's global random generator to make sampling reproducible

    Returns
    res : str
        `initial_text` followed by the generated characters
    """
    res = initial_text
    model.eval()
    h, c = None, None

    if random_state is not None:
        np.random.seed(random_state)

    with torch.inference_mode():
        for _ in range(n_chars):
            # the full prompt primes the state once; afterwards the carried
            # (h, c) holds the context and only the newest character is fed
            prev_chars = initial_text if res == initial_text else res[-1]
            features = torch.LongTensor([[dataset.ch2idx[ch] for ch in prev_chars]]).to(device)
            logits, h, c = model(features, h, c)
            # torch.softmax is used because this module never imports
            # torch.nn.functional as F; the previous F.softmax call raised NameError
            probs = torch.softmax(logits[0], dim=0).to("cpu").detach().numpy()
            new_ch = np.random.choice(dataset.vocabulary, p=probs)
            res += new_ch
    return res
|
requirements.txt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
torch==2.0.0
|
2 |
+
numpy==1.22.4
|
3 |
+
gradio==3.30.0
|
sherlock_holmes_lstm_text_generator_state_dict.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3de74bb4a4c22283c32ddf4b1691ac6cada188e1178720d16ae3ecedb0712350
|
3 |
+
size 2564047
|