mattyhew commited on
Commit
5d3fe93
1 Parent(s): 6ff6612

Upload 4 files

Browse files
Files changed (4) hide show
  1. app.py +47 -0
  2. config.json +10 -0
  3. gpt_dev.py +444 -0
  4. requirements.txt +3 -0
app.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+ from gpt_dev import GPTLanguageModel, encode, decode, generate_text # Assuming these are in gpt_dev.py
4
+
5
+ # Initialize model parameters and load the pre-trained model
6
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
7
+
8
+ # Parameters (adjust based on your model's architecture)
9
+ block_size = 256
10
+ n_embd = 384
11
+ n_head = 6
12
+ n_layer = 6
13
+ vocab_size = 95
14
+
15
+ # Initialize the model
16
+ model = GPTLanguageModel()
17
+ model.to(device)
18
+
19
+ # Load the saved model weights
20
+ checkpoint = torch.load("gpt_language_model.pth", map_location=device)
21
+ model.load_state_dict(checkpoint)
22
+ model.eval() # Set the model to evaluation mode
23
+
24
+ # Define the text generation function
25
def generate_response(prompt, max_length=100, temperature=1.0):
    """Generate a continuation of ``prompt`` with the loaded model.

    Args:
        prompt: Seed text handed to ``generate_text``.
        max_length: Number of new tokens to sample. Coerced to ``int``
            because a UI slider may deliver the value as a float.
        temperature: Sampling temperature, forwarded unchanged.

    Returns:
        The string returned by ``generate_text``.
    """
    # generate_text feeds max_length to range(), which rejects floats.
    return generate_text(
        model,
        prompt,
        max_length=int(max_length),
        temperature=temperature,
    )
28
+
29
+ # Gradio interface
30
def gradio_interface(prompt, max_length=100, temperature=1.0):
    """Gradio callback: pass the UI inputs straight to ``generate_response``."""
    response = generate_response(prompt, max_length, temperature)
    return response
32
+
33
+ # Set up Gradio UI using gr.components (for Gradio 3.x or later)
34
+ interface = gr.Interface(
35
+ fn=gradio_interface,
36
+ inputs=[
37
+ gr.Textbox(label="Prompt", value="Once upon a time"),
38
+ gr.Slider(50, 240, step=1, value=75, label="Max Length"),
39
+ ],
40
+ outputs="text",
41
+ title="Odeyssey Rhyme Generator",
42
+ description="Enter a prompt to generate text."
43
+ )
44
+
45
+ # Launch the Gradio interface
46
+ if __name__ == "__main__":
47
+ interface.launch(share=True) # Set share=False if you don't want to share publicly
config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "block_size": 256,
3
+ "model_type": "gpt",
4
+ "n_embd": 384,
5
+ "n_head": 6,
6
+ "n_layer": 6,
7
+ "transformers_version": "4.44.2",
8
+ "vocab_size": 95
9
+ }
10
+
gpt_dev.py ADDED
@@ -0,0 +1,444 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """gpt-dev.ipynb
3
+
4
+ Automatically generated by Colab.
5
+
6
+ Original file is located at
7
+ https://colab.research.google.com/drive/1wAoJHP666APJNiFpvBVvJRpMwe04P4_1
8
+ """
9
+
10
+ # when you restart a Jupyter Notebook, even if you see the outputs from the previous session, the variables, functions, and states in memory are lost. You need to re-run the cells to reload everything into memory.
11
+ import torch
12
+ import torch.nn as nn
13
+ from torch.nn import functional as F
14
+
15
+ import urllib.request
16
+
17
+ # Function to download the file and read its contents
18
def load_text_file(url):
    """Download a text resource and return its contents as a string.

    Args:
        url: Location to fetch; anything ``urllib.request.urlopen`` accepts.

    Returns:
        The full response body decoded as UTF-8.

    Raises:
        urllib.error.URLError: If the resource cannot be opened.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # The context manager closes the connection even if read/decode fails.
    with urllib.request.urlopen(url) as response:
        return response.read().decode('utf-8')
24
+
25
+ # URL to download the text file
26
+ url = "https://raw.githubusercontent.com/PratyushChaudhary/My-LLM/refs/heads/main/cleaned_text_output.txt"
27
+
28
+ # Load the text into the variable `text`
29
+ text = load_text_file(url)
30
+
31
+
32
+ # Get the unique characters you are dealing with, plus their count
33
+ chars = sorted(list(set(text))) # all content in sorted order
34
+ vocab_size = len(chars) # no. of characters in file
35
+ # print(''.join(chars)) # join of these characters, unique ones
36
+ # print(vocab_size)
37
+
38
+ # hyperparameters
39
+ batch_size = 64 # how many independent sequences will we process in parallel?
40
+ block_size = 256 # what is the maximum content length for predictions?
41
+ max_iters = 5000
42
+ eval_interval = 500
43
+ learning_rate = 3e-4
44
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
45
+ eval_iters = 200
46
+ n_embd = 384
47
+ n_head = 6
48
+ n_layer = 6
49
+ dropout = 0.2
50
+ #----
51
+ torch.manual_seed(1337)
52
+
53
+ # create a mapping from characters to integers
54
+ stoi = { ch:i for i,ch in enumerate(chars) }
55
+ itos = { i:ch for i,ch in enumerate(chars) }
56
+ encode = lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers
57
+ decode = lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string
58
+
59
+ # # This code offers a very basic form of tokeniser, wherein the sequence of integers is long but the integers in the sequence are small; in other words, the vocabulary is small
60
+
61
+ # print(encode("hi there"))
62
+ # print(decode(encode("hi there")))
63
+
64
+ # let's now tokenise/encode our whole set of text
65
+ import torch # import PyTorch library
66
+ data = torch.tensor(encode(text), dtype=torch.long)
67
+
68
+ # data.shape returns tuple representing dimensions of tensor
69
+ '''
70
+ Tensor:
71
+ A fundamental data structure in ML.
72
+ A multi-dimensional array used to store data. It generalizes matrices to higher dimensions and can be thought of as a container for numerical data.
73
+ '''
74
+ # print(data.shape, data.dtype)
75
+
76
+ # print(data[:1000]) # the first 1000 characters; this is how the GPT will see our text
77
+
78
+ # This will be used at the end to check how much our model is overfitting.
79
+ '''
80
+ Overfitting:
81
+ Overfitting is a common problem in machine learning and statistical modeling where a model learns not just the underlying patterns in the training data but also the noise or random fluctuations. This results in a model that performs very well on the training data but poorly on new, unseen data.
82
+ '''
83
+
84
+ # Let's now split up the data into train and validation sets
85
+ n = int(0.9*len(data)) # first 90% will be train data, rest would be validation
86
+ train_data = data[:n]
87
+ val_data = data[n:]
88
+
89
+ # We will train the transformer on chunks of dataset/text so that it's computationally inexpensive
90
+ # block size states the max length of our chunks
91
+ # block_size = 8
92
+ # train_data[:block_size+1]
93
+ # predictions are made on the basis of relative positions of these tokens
94
+
95
+ # x = train_data[:block_size]
96
+ # y = train_data[1:block_size+1]
97
+ # for t in range(block_size):
98
+ # context = x[:t+1]
99
+ # target = y[t]
100
+ # print(f"When input is {context} the target: {target}")
101
+
102
+ # Using the below code you ensure that any random numbers generated by PyTorch are reproducible, which means when you run the code multiple times, you'll get the same random numbers each time.
103
+ # This is useful for debugging or comparing results.
104
+ # The specific value doesn't matter, it's just used to initialise the random number generator in a consistent way.
105
+ # torch.manual_seed(1337)
106
+ # batch_size = 4 # how many independent sequences will we process in parallel?
107
+ # block_size = 8 # what is the maximum context length for predictions?
108
if __name__ == "__main__":
    # Training helpers. They are defined only when gpt_dev.py is executed
    # directly, so importing this module does not create them.

    def get_batch(split):
        """Sample a random batch of input and target sequences.

        Args:
            split: 'train' selects train_data; any other value selects
                val_data.

        Returns:
            Tuple ``(x, y)`` of tensors, each stacked from batch_size windows
            of block_size tokens and moved to ``device``. ``y`` is ``x``
            shifted one position to the right.
        """
        source = train_data if split == 'train' else val_data
        # Random start offsets; the upper bound leaves room for the
        # one-token-shifted target window.
        starts = torch.randint(len(source) - block_size, (batch_size,))
        x = torch.stack([source[i:i + block_size] for i in starts])
        y = torch.stack([source[i + 1:i + block_size + 1] for i in starts])
        return x.to(device), y.to(device)

    @torch.no_grad()
    def estimate_loss():
        """Average the loss over eval_iters random batches for each split.

        Puts the module-level ``model`` in eval mode while measuring and
        switches it back to train mode before returning.

        Returns:
            Dict mapping 'train' and 'val' to the mean loss (0-d tensor).
        """
        out = {}
        model.eval()
        # A tuple, not a set: iteration order of a set of strings can differ
        # between runs, which would change which batches each split draws
        # from the seeded random generator.
        for split in ('train', 'val'):
            losses = torch.zeros(eval_iters)
            for k in range(eval_iters):
                X, Y = get_batch(split)
                _, loss = model(X, Y)
                losses[k] = loss.item()
            out[split] = losses.mean()
        model.train()
        return out
137
+
138
+
139
+ # xb, yb = get_batch('train')
140
+ # print('inputs:')
141
+ # print(xb.shape)
142
+ # print(xb)
143
+ # print('targets:')
144
+ # print(yb.shape)
145
+ # print(yb)
146
+
147
+ # print('----')
148
+
149
+ # for b in range(batch_size): # batch dimension
150
+ # for t in range(block_size): # time dimension
151
+ # context = xb[b, :t+1]
152
+ # target = yb[b, t]
153
+ # print(f"when input is {context.tolist()} the target: {target}")
154
+
155
+ # import torch.nn as nn
156
+ # # below syntax is such because nn is a submodule of torch, and Python needs the full module path (torch.nn) to find the functional module correctly.
157
+ # from torch.nn import functional as F
158
+ # torch.manual_seed(1337)
159
+
160
class Head(nn.Module):
    """A single head of causal (masked) self-attention."""

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        # Lower-triangular mask stored as a buffer (not a parameter).
        causal_mask = torch.tril(torch.ones(block_size, block_size))
        self.register_buffer('tril', causal_mask)

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Attend over ``x`` of shape (B, T, n_embd); returns (B, T, head_size)."""
        _, T, _ = x.shape
        keys = self.key(x)       # (B, T, head_size)
        queries = self.query(x)  # (B, T, head_size)

        # Scaled dot-product affinities: (B, T, hs) @ (B, hs, T) -> (B, T, T)
        scale = keys.shape[-1] ** -0.5
        scores = queries @ keys.transpose(-2, -1) * scale

        # Block attention to future positions, then normalise each row.
        future = self.tril[:T, :T] == 0
        scores = scores.masked_fill(future, float('-inf'))
        weights = self.dropout(F.softmax(scores, dim=-1))

        # Weighted sum of the values: (B, T, T) @ (B, T, hs) -> (B, T, hs)
        return weights @ self.value(x)
184
+
185
class MultiHeadAttention(nn.Module):
    """Several ``Head`` modules applied to the same input in parallel.

    Head outputs are concatenated on the last dimension, projected back to
    n_embd and passed through dropout.
    """

    def __init__(self, num_heads, head_size):
        super().__init__()
        heads = [Head(head_size) for _ in range(num_heads)]
        self.heads = nn.ModuleList(heads)
        self.proj = nn.Linear(head_size * num_heads, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        per_head = [head(x) for head in self.heads]
        merged = torch.cat(per_head, dim=-1)
        projected = self.proj(merged)
        return self.dropout(projected)
198
+
199
class FeedForward(nn.Module):
    """Two-layer MLP: expand to 4 * n_embd, ReLU, project back, dropout."""

    def __init__(self, n_embd):
        super().__init__()
        hidden = 4 * n_embd
        layers = [
            nn.Linear(n_embd, hidden),
            nn.ReLU(),
            nn.Linear(hidden, n_embd),
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        return self.net(x)
213
+
214
class Block(nn.Module):
    """Transformer block: self-attention, then a feed-forward layer.

    Each sub-layer is applied to a layer-normalised input and added back to
    its input as a residual.
    """

    def __init__(self, n_embd, n_head):
        # n_embd: embedding dimension; n_head: number of attention heads
        super().__init__()
        per_head = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, per_head)
        self.ffwd = FeedForward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        attended = x + self.sa(self.ln1(x))
        return attended + self.ffwd(self.ln2(attended))
230
+
231
+
232
+ # A bigram language model is a type of statistical language model that predicts the probability of a word based on the preceding word. It assumes that the occurrence of a word depends only on the previous word.
233
class GPTLanguageModel(nn.Module):
    """Transformer language model over integer token ids.

    Layer sizes come from the module-level hyperparameters vocab_size,
    n_embd, block_size, n_head and n_layer.
    """

    def __init__(self):
        super().__init__()
        # Lookup tables: one n_embd vector per token id and per position.
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)  # final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)
        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Initialise Linear/Embedding weights from N(0, 0.02); zero biases."""
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    # Shape notation used below:
    #   B (batch)    - number of sequences in the batch
    #   T (time)     - length of each sequence
    #   C (channels) - n_embd inside the network, vocab_size for the logits

    def forward(self, idx, targets=None):
        """Run the forward pass.

        Args:
            idx: (B, T) tensor of token ids; T must not exceed block_size
                because positions index the position embedding table.
            targets: Optional (B, T) tensor of next-token ids.

        Returns:
            Tuple ``(logits, loss)``. Without targets, logits is
            (B, T, vocab_size) and loss is None. With targets, logits is
            flattened to (B*T, vocab_size) and loss is the cross-entropy.
        """
        B, T = idx.shape

        tok_emb = self.token_embedding_table(idx)  # (B, T, n_embd)
        # Build positions on the input's own device so the model keeps
        # working wherever it and its inputs have been moved.
        positions = torch.arange(T, device=idx.device)
        pos_emb = self.position_embedding_table(positions)  # (T, n_embd)
        x = tok_emb + pos_emb  # (B, T, n_embd)
        x = self.blocks(x)  # (B, T, n_embd)
        x = self.ln_f(x)  # (B, T, n_embd)
        logits = self.lm_head(x)  # (B, T, vocab_size)

        # If there are no targets, there is no loss to compute.
        if targets is None:
            loss = None
        else:
            # cross_entropy expects (N, C) logits and (N,) targets
            B, T, C = logits.shape
            logits = logits.view(B * T, C)
            targets = targets.view(B * T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        """Extend ``idx`` by sampling ``max_new_tokens`` tokens.

        Args:
            idx: (B, T) tensor of token ids forming the current context.
            max_new_tokens: Number of tokens to append.

        Returns:
            (B, T + max_new_tokens) tensor: the context plus sampled tokens.
        """
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens
            idx_cond = idx[:, -block_size:]
            logits, _ = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :]  # (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1)  # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)
        return idx
315
+ model = GPTLanguageModel()
316
+ m = model.to(device)
317
+ # logits, loss = m(xb, yb)
318
+ # print(logits.shape)
319
+ # print(loss)
320
+
321
+
322
+ # print(decode(m.generate(idx = torch.zeros((1, 1), dtype = torch.long), max_new_tokens = 100)[0].tolist()))
323
+
324
+ # m = model.to(device)
325
+ # create a PyTorch optimiser
326
+ optimiser = torch.optim.AdamW(model.parameters(), lr = learning_rate)
327
+
328
+ # batch_size = 32 # This specifies that 32 samples will be processed in one training step called batch.
329
+ # for steps in range(50000): # This loop will run for 100 steps. Each step is one iteration of training using a batch of data.
330
+ # # sample a batch of data
331
+ # xb, yb = get_batch('train')
332
+ # # evaluate the loss
333
+ # # logits are the raw output of the model before any activation function, representing the predicted probabilities for each class.
334
+ # logits, loss = m(xb, yb)
335
+ # optimiser.zero_grad(set_to_none = True)
336
+ # loss.backward()
337
+ # optimiser.step()
338
+
339
+ # print(loss.item())
340
def train_model(self, max_iters, eval_interval, optimiser):
    """Run the training loop on the module-level ``model``.

    Relies on ``get_batch`` and ``estimate_loss``, which this file defines
    only when it is executed as a script.

    Args:
        self: Accepted for signature compatibility; not used by the loop.
        max_iters: Number of optimisation steps to run.
        eval_interval: Report train/val loss every this many steps (and on
            the final step).
        optimiser: Optimiser whose ``zero_grad``/``step`` are called each step.
    """
    last_step = max_iters - 1
    # `step` rather than `iter`, so the builtin iter() is not shadowed.
    for step in range(max_iters):
        # every once in a while evaluate the loss on train and val sets
        if step % eval_interval == 0 or step == last_step:
            losses = estimate_loss()
            print(f"step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

        # sample a batch of data
        xb, yb = get_batch('train')

        # evaluate the loss and take one optimisation step
        _, loss = model(xb, yb)
        optimiser.zero_grad(set_to_none=True)
        loss.backward()
        optimiser.step()
355
+
356
+ # generate from the model
357
+ context = torch.zeros((1,1), dtype = torch.long, device = device)
358
+
359
+ """## The mathematical trick in self-attention"""
360
+
361
+ # consider the following toy example:
362
+
363
+ torch.manual_seed(1337)
364
+ B, T, C = 4, 8, 2 # batch, time, channels
365
+ x = torch.randn(B, T, C)
366
+ x.shape
367
+
368
+ # We want x[b, t] = mean_{i<=t} x[b, i]
369
+ xbow = torch.zeros((B, T, C))
370
+ for b in range(B):
371
+ for t in range(T):
372
+ xprev = x[b, :t+1] # (t, C)
373
+ xbow[b, t] = torch.mean(xprev, 0)
374
+
375
+ # version 2
376
+ wei = torch.tril(torch.ones(T, T))
377
+ wei = wei / wei.sum(1, keepdim = True)
378
+ xbow2 = wei @ x # (B, T, T) @ (B, T, C) ---> (B, T, C)
379
+ torch.allclose(xbow, xbow2)
380
+
381
+ # version 3: use Softmax
382
+ tril = torch.tril(torch.ones(T, T))
383
+ wei = torch.zeros((T, T))
384
+ wei = wei.masked_fill(tril == 0, float('-inf'))
385
+ wei = F.softmax(wei, dim = -1)
386
+ xbow3 = wei @ x
387
+ torch.allclose(xbow, xbow3)
388
+
389
+ # version 4: self-attention!
390
+ torch.manual_seed(1337)
391
+ B, T, C = 4, 8, 32 # batch, time, channels
392
+ x = torch.randn(B, T, C)
393
+
394
+ # let's see a single Head perform self-attention
395
+ head_size = 16
396
+ key = nn.Linear(C, head_size, bias = False)
397
+ query = nn.Linear(C, head_size, bias = False)
398
+ value = nn.Linear(C, head_size, bias = False)
399
+ k = key(x) # (B, T, 16)
400
+ q = query(x) # (B, T, 16)
401
+ wei = q @ k.transpose(-2, -1) # (B, T, 16) @ (B, 16, T) ---> (B, T, T)
402
+
403
+ tril = torch.tril(torch.ones(T, T))
404
+ # wei = torch.zeros((T, T))
405
+ wei = wei.masked_fill(tril == 0, float('-inf'))
406
+ wei = F.softmax(wei, dim = -1)
407
+
408
+ v = value(x)
409
+ out = wei @ v
410
+
411
+ k = torch.randn(B, T, head_size)
412
+ q = torch.randn(B, T, head_size)
413
+ wei = q @ k.transpose(-2, -1) * head_size**(-0.5)
414
+
415
+ torch.softmax(torch.tensor([0.1, -0.2, 0.3, -0.2, 0.5]), dim = -1)
416
+
417
+ # returns the lower triangular part of the given matrix
418
+ torch.tril(torch.ones(3, 3))
419
+
420
+ # we can be very efficient doing the above thing using matrix multiplication
421
+ torch.manual_seed(42)
422
+ a = torch.tril(torch.ones(3, 3))
423
+ # using the below syntax would get us the sum for every row in a as 1
424
+ a = a / torch.sum(a, 1, keepdim = True)
425
+ b = torch.randint(0, 10, (3, 2)).float()
426
+ c = a @ b
427
+
428
def generate_text(model, start_prompt, max_length=256, temperature=1.0):
    """Sample a continuation of ``start_prompt`` from ``model``.

    Args:
        model: Callable returning ``(logits, loss)`` for a (1, T) tensor of
            token ids; it is switched to eval mode.
        start_prompt: Non-empty seed text; every character must be known to
            ``encode``.
        max_length: Number of new tokens to sample.
        temperature: Positive divisor applied to the logits before softmax.

    Returns:
        The prompt followed by the sampled text, as decoded by ``decode``.

    Raises:
        ValueError: If ``start_prompt`` is empty or ``temperature`` is not
            positive.
    """
    if not start_prompt:
        raise ValueError("start_prompt must contain at least one character")
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    input_ids = torch.tensor(encode(start_prompt), dtype=torch.long).unsqueeze(0).to(device)
    model.eval()
    generated_ids = input_ids.tolist()[0]
    with torch.no_grad():
        for _ in range(max_length):
            # The position embedding table only has block_size rows, so feed
            # at most the last block_size tokens (as GPTLanguageModel.generate
            # does); longer contexts would raise an index error.
            logits, _ = model(input_ids[:, -block_size:])
            logits = logits[:, -1, :] / temperature
            probs = torch.nn.functional.softmax(logits, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)
            generated_ids.append(next_token.item())
            input_ids = torch.cat((input_ids, next_token), dim=1)
    return decode(generated_ids)
441
+
442
+
443
if __name__ == "__main__":
    # train_model has no default arguments, so the module-level model,
    # schedule and optimiser must be passed explicitly.
    train_model(model, max_iters, eval_interval, optimiser)
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ torch
2
+ gradio
3
+ huggingface-hub