Spaces:

Q-b1t
/

Sherlock_Holmes_Text_Generation_MKI

Sleeping

App Files Community

LuisDarioHinojosa committed on May 15, 2023

Commit

b6e7e53

•

1 Parent(s): 98094a3

initial commit

Browse files

Files changed (7) hide show

.gitattributes +1 -0
advs.txt +0 -0
app.py +77 -0
assets/CharacterDataset.py +52 -0
assets/PredictionNetwork.py +86 -0
requirements.txt +3 -0
sherlock_holmes_lstm_text_generator_state_dict.pth +3 -0

.gitattributes CHANGED Viewed

@@ -32,3 +32,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+sherlock_holmes_lstm_text_generator_state_dict.pth filter=lfs diff=lfs merge=lfs -text

advs.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

app.py ADDED Viewed

	@@ -0,0 +1,77 @@

+# test in gradio application
+from assets.CharacterDataset import CharacterDataset # change in native
+from assets.PredictionNetwork import PredictionNetwork, generate_text # change in native
+from timeit import default_timer as timer
+import numpy as np
+import torch.nn.functional as F
+import torch
+import gradio as gr
+# samples path
+samples_path = "advs.txt" # change in native
+# pretrained weights path
+pretrained_weights_path = "sherlock_holmes_lstm_text_generator_state_dict.pth" # change in native
+# hyperparameters
+vocab_size = 50
+window_size = 40
+embedding_dim = 512
+linear_units = 64
+lstm_units = 32
+n_layers = 64
+max_norm = 2
+# get the text from the textfile
+with open(samples_path,"r") as f:
+  text = "\n".join(f.readlines())
+# instance the dataset
+dataset = CharacterDataset(text,window_size=window_size,vocab_size=vocab_size)
+# instance the model
+model = PredictionNetwork(
+  vocab_size = vocab_size,
+  embedding_dimention = embedding_dim,
+  linear_units = linear_units,
+  lstm_units = lstm_units,
+  max_norm = max_norm,
+  n_layers = n_layers
+)
+# load the pretrained weights
+model.load_state_dict(torch.load(f=pretrained_weights_path,map_location=torch.device('cpu')))
+# text generation function
+def generate_text(n_chars,initial_text):
+  res = initial_text
+  model.eval()
+  h,c = None,None
+  begin_time = timer()
+  with torch.inference_mode():
+    for _ in range(int(n_chars)):
+      prev_chars = initial_text if res == initial_text else res[-1]
+      features = torch.LongTensor([[dataset.ch2idx[c] for c in prev_chars]])
+      logits, h,c = model(features,h,c)
+      probs = F.softmax(logits[0],dim = 0).to("cpu").detach().numpy()
+      new_ch = np.random.choice(dataset.vocabulary,p = probs)
+      res += new_ch
+  end_time = timer()
+  total_time = end_time - begin_time
+  return res,total_time
+# create gradio application
+title = "Sherlock Holmes text generator"
+description = "An LSTM model trained to generate text using the anthology: [The Adventures Of Sherlock Holmes](https://sherlock-holm.es/stories/plain-text/advs.txt)."
+article = "The model was created using Pytorch. It incorporates embedding matrices to code the features of the vocabulary extracted from the book, and it is a partial improvement performance wise from my last project based on [Shakespheare's plays](https://huggingface.co/spaces/Q-b1t/shakespeare_text_generation). I plan on developing another version that incorporates a full transformer in the course of the next few weeks."
+initial_text = "In the year 1878 I took my degree of Doctor of Medicine of the University of London,"
+demo = gr.Interface(
+    fn = generate_text,
+    inputs = [gr.Number(value = 200,label = "Sequence Length",info = "Length of the sample sequence you wish to generate."),gr.TextArea(lines = 5,label="Initial Text",value = initial_text)],
+    outputs = [gr.TextArea(lines = 5,label="Sequence Output"),gr.Number(label = "Execution Time (seconds)")],    title = title,
+    description = description,
+    article = article
+)
+demo.launch()

assets/CharacterDataset.py ADDED Viewed

	@@ -0,0 +1,52 @@

+import torch
+from torch.utils.data import Dataset
+from collections import Counter,defaultdict
+class CharacterDataset(Dataset):
+  """
+  text: str
+    Input text that will be used to create the dataset's vocabulary
+  window size: int
+    Number of characters of a sequence to use as input features to make a prediction
+  vocab size: int
+    NUmber of characters in the vocabulary. The last one will be a default for characters absent in the vocabulary
+  Attributes
+  ch2idx:
+    Mapping of the character to the index in the vocabulary. The characters that are unknown.
+  idx2ch:
+    Mapping of the index on the vocabulary to the character
+  vocabulary: list
+    List of all the vocabulary
+  """
+  def __init__(self, text, window_size=1, vocab_size=50):
+    super().__init__()
+    assert len(list(set(text))) > vocab_size, "the vocabulary size must be smaller than the number of tokens in the text"
+    # instance the class attributes
+    self.text = text.replace("\n", " ")
+    self.window_size = window_size
+    self.ch2idx = defaultdict(lambda: vocab_size - 1)
+    # create the vocabulary based on the token frequency in the text. Truncate the vocabulary to the specified length
+    most_common_ch2idx = {
+         x[0]: i
+         for i, x in enumerate(Counter(self.text).most_common()[: (vocab_size - 1)])
+    }
+    self.ch2idx.update(most_common_ch2idx)
+    # add the uknown vocaculary
+    self.ch2idx["~"] = vocab_size - 1
+    # create the reverse dictionary
+    self.idx2ch = {v: k for k, v in self.ch2idx.items()}
+    # create the vocabulary
+    self.vocabulary = [self.idx2ch[i] for i in range(vocab_size)]
+  def __len__(self):
+    return len(self.text) - self.window_size
+  def __getitem__(self, ix):
+    X = torch.LongTensor([self.ch2idx[c] for c in self.text[ix : ix + self.window_size]])
+    y = self.ch2idx[self.text[ix + self.window_size]]
+    return X, y

assets/PredictionNetwork.py ADDED Viewed

	@@ -0,0 +1,86 @@

+import torch
+import torch.nn as nn
+import numpy as np
+class PredictionNetwork(nn.Module):
+  """
+  Parameters
+  vocab_size : int
+    THe number of characters in a vocabulary
+  embedding_dimention :  int
+    Dimention of the embedding vectors for each token in the dictionary
+  linear_units : int
+    Number of hidden units in the linear layer
+  lstm_units : int
+    Number of hidden units in the lstm layer
+  max_norm : int
+    If any of the embedding vectors has a higher L2 norm than 'max_norm' it is rescaled.
+  n_layers : int
+    Number of lstm layers
+  """
+  def __init__(self,vocab_size,embedding_dimention = 32, linear_units = 64,lstm_units = 8, max_norm = 2,n_layers = 2):
+    super().__init__()
+    # embedding matrix
+    self.embedding_matrix = nn.Embedding(vocab_size,embedding_dimention,padding_idx=vocab_size -1,norm_type=2,max_norm=max_norm)
+    # lstm block
+    self.lstm_block = nn.LSTM(embedding_dimention,lstm_units,batch_first = True,num_layers = n_layers)
+    # classifier
+    self.classifier = nn.Sequential(
+        nn.Linear(in_features=lstm_units,out_features=linear_units),
+        nn.Linear(in_features=linear_units,out_features=vocab_size)
+    )
+  def forward(self,x,h = None,c = None):
+    """
+    inputs
+    x : torch.Tensor
+      Input tensor of shape (batch_size,window_size)
+    h,c : torch.Tensor or None
+      Hidden states of the LSTM
+    returns
+    logits : torch.Tensor
+      Tensor of shape (batch_size,vocab_size)
+    h,c : torch.Tensor or None
+      Hidden states of the LSTM
+    """
+    emb = self.embedding_matrix(x)
+    if h is not None and c is not None:
+      _,(h,c) = self.lstm_block(emb,(h,c))
+    else:
+      _,(h,c) = self.lstm_block(emb)
+    h_mean = h.mean(dim = 0)
+    logits = self.classifier(h_mean)
+    return logits,h,c
+def generate_text(n_chars,model,dataset,device,initial_text = "W.A.T.S.O.N", random_state = None):
+  res = initial_text
+  model.eval()
+  h,c = None,None
+  if random_state is not None:
+    np.random.seed(random_state)
+  with torch.inference_mode():
+    for _ in range(n_chars):
+      prev_chars = initial_text if res == initial_text else res[-1]
+      features = torch.LongTensor([[dataset.ch2idx[c] for c in prev_chars]]).to(device)
+      logits, h,c = model(features,h,c)
+      probs = F.softmax(logits[0],dim = 0).to("cpu").detach().numpy()
+      new_ch = np.random.choice(dataset.vocabulary,p = probs)
+      res += new_ch
+  return res

requirements.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+torch==2.0.0
+numpy==1.22.4
+gradio==3.30.0

sherlock_holmes_lstm_text_generator_state_dict.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3de74bb4a4c22283c32ddf4b1691ac6cada188e1178720d16ae3ecedb0712350
+size 2564047