# -*- coding: utf-8 -*-
"""S22.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1pq0UO46D0emoqF8rPuD4cUznmYVSMESO
"""

# Commented out IPython magic to ensure Python compatibility.
# %pip install lightning -q



import torch
import glob
import math
import sys
import time
from pathlib import Path
from typing import Optional, Tuple, Union

import lightning as L
from lightning.fabric.loggers import CSVLogger
from lightning.fabric.strategies import FSDPStrategy

from tsai_gpt.model import GPT, Block, Config
from tsai_gpt.packed_dataset import CombinedDataset, PackedDataset
from tsai_gpt.speed_monitor import SpeedMonitorBase, estimate_flops, measure_flops
from tsai_gpt.speed_monitor import SpeedMonitorFabric as SpeedMonitor
from tsai_gpt.utils import chunked_cross_entropy, get_default_supported_precision, num_parameters, load_checkpoint
import os
import pickle
from contextlib import nullcontext
from torch.utils.data import DataLoader
import torch.nn.functional as F
from tsai_gpt.tokenizer import Tokenizer
import gradio as gr

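# Run configuration: the CSV logger below writes its logs under out/<name>.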
model_name = "pythia-160m"
name = "redpajama"
out_dir = Path("out") / name
log_interval = 100

hparams = {k: v for k, v in locals().items() if isinstance(v, (int, float, str)) and not k.startswith("_")}
logger = CSVLogger("out", name, flush_logs_every_n_steps=log_interval)

fabric = L.Fabric(devices=1, strategy='auto', precision=None, loggers=logger)



#print(model.transformer.h[0].mlp.fc.weight)

def generate(model, config, idx, max_new_tokens, temperature=1.0, top_k=None):
    """
    Take a conditioning sequence of indices idx (LongTensor of shape (t,)) and complete
    it max_new_tokens times, feeding the predictions back into the model each time.
    Most likely you'll want to make sure the model is in model.eval() mode for this.
    """
    idx = idx.unsqueeze(dim=0)
    for _ in range(max_new_tokens):
        # if the sequence context is growing too long we must crop it at block_size
        idx_cond = idx if idx.size(1) <= config.block_size else idx[:, -config.block_size:]
        # forward the model to get the logits for the index in the sequence
        logits = model(idx_cond)
        # pluck the logits at the final step and scale by desired temperature
        logits = logits[:, -1, :] / temperature
        # optionally crop the logits to only the top k options
        if top_k is not None:
            v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
            logits[logits < v[:, [-1]]] = -float('Inf')
        # apply softmax to convert logits to (normalized) probabilities
        probs = F.softmax(logits, dim=-1)
        # sample from the distribution
        idx_next = torch.multinomial(probs, num_samples=1)
        # append sampled index to the running sequence and continue
        idx = torch.cat((idx, idx_next), dim=1)

    return idx
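
# A minimal sketch of how `generate` is used (commented out so the script's behavior
# is unchanged). The prompt is encoded to a 1-D LongTensor, sampled autoregressively,
# and decoded back to text; the variable names here are illustrative only.
#
# prompt_ids = token.encode("Once upon a time")   # 1-D LongTensor of token ids
# out_ids = generate(model, config, prompt_ids, max_new_tokens=50,
#                    temperature=0.8, top_k=200)
# print(token.decode(out_ids[0]))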



checkpoint_dir = Path('./checkpoints/meta-llama/Llama-2-7b-chat-hf')
token = Tokenizer(checkpoint_dir = checkpoint_dir)

def tsaigpt(start: str, max_new_tokens=300, num_samples=2, tokeniser=token):
    # -----------------------------------------------------------------------------
    temperature = 0.8 # 1.0 = no change, < 1.0 = less random, > 1.0 = more random, in predictions
    top_k = 200 # retain only the top_k most likely tokens, clamp others to have 0 probability
    seed = 1337
    device = 'cpu' # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1', etc.
    dtype = 'bfloat16' if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else 'float16' # 'float32' or 'bfloat16' or 'float16'
    compile = False # use PyTorch 2.0 to compile the model to be faster
    #exec(open('configurator.py').read()) # overrides from command line or config file
    # -----------------------------------------------------------------------------

    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cuda.matmul.allow_tf32 = True # allow tf32 on matmul
    torch.backends.cudnn.allow_tf32 = True # allow tf32 on cudnn
    device_type = 'cuda' if 'cuda' in device else 'cpu' # for later use in torch.autocast
    ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
    ctx = nullcontext() if device_type == 'cpu' else torch.amp.autocast(device_type=device_type, dtype=ptdtype)
    checkpoint_path = Path("out/redpajama/iter-031997-ckpt.pth")
    config = Config.from_name(model_name)
    model = GPT(config)

    load_checkpoint(fabric, model, checkpoint_path)
    #print(model)
    model.eval()
    model.to(device)
    if compile:
        model = torch.compile(model) # requires PyTorch 2.0 (optional)



    start_ids = tokeniser.encode(start).to(device)
    #x = torch.tensor(start_ids, dtype=torch.long, device=device).clone().detach()

    # run generation
    with torch.no_grad():
        with ctx:
            y = generate(model=model, config=config, idx=start_ids,
                         max_new_tokens=max_new_tokens, temperature=temperature, top_k=top_k)
            #print(decode(y[0].tolist()))
            output = tokeniser.decode(y[0])
    return output
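
# To sanity-check the pipeline without launching the Gradio UI, the function can be
# called directly (a sketch, commented out so only the interface below runs):
#
# sample = tsaigpt("We know what we are, but know not what we may be", max_new_tokens=300)
# print(sample)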

INTERFACE = gr.Interface(
    fn=tsaigpt,
    inputs=[
        gr.Textbox(label="Prompt", value='We know what we are, but know not what we may be'),
        gr.Slider(minimum=300, maximum=500, value=300, label="Maximum number of tokens to be generated"),
    ],
    outputs=gr.Text(label="Generated Text"),
    title="TSAI_GPT",
    description="TSAIGPT is a transformer-based language model with only 0.16 billion parameters, trained on RedPajama 1T Sample.",
    examples=[['We know what we are, but know not what we may be', 300]],
).launch(debug=True)