My GPT
This is a GPT-2 model trained from scratch on Baidu Encyclopedia (Baidu Baike) data.
parameter size: 124M
batch size: 100
context size: 1024
n_layers: 12
n_head: 12
n_embed: 768
vocab_size: 50257 (GPT-2's default; the character vocabulary built from the Baidu corpus below has 17,543 entries)
wte: 50257 × 768 (token embedding)
wpe: 1024 × 768 (position embedding)
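For reference, these sizes are consistent with the quoted 124M parameters. A quick sanity check of the arithmetic (a sketch assuming the standard GPT-2 layout: tied input/output embeddings, 4× MLP expansion, two LayerNorms per block plus a final one):

n_layer, n_embd, vocab_size, block_size = 12, 768, 50257, 1024

wte = vocab_size * n_embd              # token embeddings: 38,597,376
wpe = block_size * n_embd              # position embeddings: 786,432
attn = (n_embd * 3 * n_embd + 3 * n_embd) + (n_embd * n_embd + n_embd)      # qkv + output projection
mlp = (n_embd * 4 * n_embd + 4 * n_embd) + (4 * n_embd * n_embd + n_embd)   # 4x expansion and back
block = attn + mlp + 2 * 2 * n_embd    # plus two LayerNorms (weight + bias each)
total = wte + wpe + n_layer * block + 2 * n_embd   # final LayerNorm; lm_head shares wte
print(f"{total:,}")                    # 124,439,808 ≈ 124M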
Generator (Inference)
import torch

# GPT, MyTokenizer, and get_vocab_and_token come from the project's modules
# (imports omitted in the original)

def generate(prompt='', num_samples=10, steps=20, do_sample=True):
    # tokenize the input prompt into an integer id sequence
    vocab2id, id2vocab = get_vocab_and_token()
    tokenizer = MyTokenizer(vocab2id, id2vocab)
    x = tokenizer(prompt)
    x = torch.tensor(x, dtype=torch.long).unsqueeze(0).to(device)
    # we'll process all desired num_samples in one batch, so expand out the batch dim
    x = x.expand(num_samples, -1)
    # autoregressively forward the model `steps` times, sampling one token per step
    y = model.generate(x, max_new_tokens=steps, do_sample=do_sample, top_k=40)
    for i in range(num_samples):
        pred_idxs = y[i].cpu().squeeze().tolist()
        out = tokenizer.convert_id_2_token(pred_idxs)
        print('-' * 80)
        print(out)
if __name__ == '__main__':
    # set_seed(3407)
    model_config = GPT.get_default_config()
    model_config.model_type = 'gpt2'
    model_config.vocab_size = 17543  # character vocabulary built from the Baidu corpus (OpenAI's GPT-2 uses 50257)
    # block_size must match training; prompt_length and info_length are defined
    # elsewhere in the project (lengths of the prompt and info fields)
    model_config.block_size = (prompt_length + info_length) - 1
    model = GPT(model_config)
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.load_state_dict(torch.load('model.bin', map_location=device))
    model.to(device)
    model.eval()
    generate(prompt='什么是唐诗宋词?', num_samples=5, steps=200)  # "What are Tang poems and Song lyrics?"
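model.generate is minGPT's autoregressive sampling loop: each of the `steps` iterations feeds the sequence through the model, keeps only the top_k=40 highest-scoring logits at the last position, and samples the next id from the renormalized distribution. A minimal sketch of that per-step rule (my paraphrase of minGPT's logic, not code from this project):

import torch
import torch.nn.functional as F

@torch.no_grad()
def sample_next_token(logits, top_k=40, temperature=1.0):
    # logits: (batch, seq_len, vocab_size); only the final position matters
    logits = logits[:, -1, :] / temperature
    # mask out everything below the k-th largest logit
    v, _ = torch.topk(logits, top_k)
    logits[logits < v[:, [-1]]] = -float('inf')
    probs = F.softmax(logits, dim=-1)
    return torch.multinomial(probs, num_samples=1)  # (batch, 1) next-token ids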
Trainer
import time
import json
from collections import defaultdict

import torch
from torch.utils.data import DataLoader

# CN (minGPT's CfgNode config object), MyTokenizer, and CRAWLERLOGGER come from
# the project's modules (imports omitted in the original)

class Trainer:

    @staticmethod
    def get_default_config():
        C = CN()
        # device to train on
        C.device = 'auto'
        # dataloader parameters
        C.num_workers = 4
        # optimizer parameters
        C.max_iters = None
        C.batch_size = 64
        C.learning_rate = 3e-4
        C.betas = (0.9, 0.95)
        C.weight_decay = 0.1  # only applied on matmul weights
        C.grad_norm_clip = 1.0
        return C
    def __init__(self, config, model, train_dataset):
        self.config = config
        self.model = model
        self.optimizer = None
        self.train_dataset = train_dataset
        self.callbacks = defaultdict(list)
        # determine the device we'll train on
        if config.device == 'auto':
            self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        else:
            self.device = config.device
        self.model = self.model.to(self.device)
        print("running on device", self.device)
        # variables assigned to the trainer instance later, for logging etc.
        self.iter_num = 0
        self.iter_time = 0.0
        self.iter_dt = 0.0
    def add_callback(self, onevent: str, callback):
        self.callbacks[onevent].append(callback)

    def set_callback(self, onevent: str, callback):
        self.callbacks[onevent] = [callback]

    def trigger_callbacks(self, onevent: str):
        # invoke every callback registered for this event
        for callback in self.callbacks.get(onevent, []):
            callback(self)
    def run(self):
        model, config = self.model, self.config

        # set up the optimizer
        self.optimizer = model.configure_optimizers(config)

        # set up the dataloader; sampling with replacement gives an effectively infinite stream
        train_loader = DataLoader(
            self.train_dataset,
            sampler=torch.utils.data.RandomSampler(self.train_dataset,
                                                   replacement=True,
                                                   num_samples=int(1e10)),
            shuffle=False,
            pin_memory=True,
            batch_size=config.batch_size,
            num_workers=config.num_workers,
        )

        model.train()
        self.iter_num = 0
        self.iter_time = time.time()

        # Chinese probe prompts used to spot-check generation quality during training,
        # e.g. "Explain the meaning of laws and regulations", "What is Genshin Impact",
        # "What is speech recognition", ...
        question_lst = ['解释一下法律法规的意思', '介绍一下原神是什么', '解释一下核心价值的意思',
                        '解释一下发达国家的意思', '解释一下汽车电子的意思', '解释一下技术革命的意思',
                        '什么是资源整合', '啥是慈善活动', '能给我讲讲青年一代吗',
                        '给我介绍下中华民族', '语音识别的含义是什么', '请说明一下奥林匹克运动会的含义',
                        '能给我讲讲科技成果是什么']
        with open('data/vocab2id.json', 'r') as f:
            vocab2id = json.load(f)
        with open('data/id2vocab.json', 'r') as f:
            id2vocab = json.load(f)
        # JSON keys are strings; convert them back to integer ids
        id2vocab = {int(k): v for k, v in id2vocab.items()}
        tokenizer = MyTokenizer(vocab2id, id2vocab)
        question_token = [tokenizer(q) for q in question_lst]

        data_iter = iter(train_loader)
        while True:
            # fetch the next batch (x, y) and re-init the iterator if needed
            try:
                batch = next(data_iter)
            except StopIteration:
                data_iter = iter(train_loader)
                batch = next(data_iter)
            batch = [t.to(self.device) for t in batch]
            x, y = batch

            # forward the model
            logits, self.loss = model(x, y)

            # backprop and update the parameters
            model.zero_grad(set_to_none=True)
            self.loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), config.grad_norm_clip)
            self.optimizer.step()

            self.trigger_callbacks('on_batch_end')
            self.iter_num += 1

            # periodically sample from the model on the probe prompts
            if self.iter_num % 10000 == 0:
                model.eval()  # dropout off while sampling; restored to train mode below
                with torch.no_grad():
                    for question in question_token:
                        my_question = torch.tensor(question, dtype=torch.long).unsqueeze(0).to(self.device)
                        pred = model.generate(my_question, max_new_tokens=100, do_sample=True, top_k=40)
                        my_ids = pred[0].cpu().tolist()
                        CRAWLERLOGGER.debug(f"the prediction ids: {my_ids}")
                        out = tokenizer.convert_id_2_token(ids=my_ids)
                        CRAWLERLOGGER.debug(f"the prediction text: {out}")
                model.train()

            tnow = time.time()
            self.iter_dt = tnow - self.iter_time
            self.iter_time = tnow

            # termination condition
            if config.max_iters is not None and self.iter_num >= config.max_iters:
                break
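Note that run() itself never prints the loss; logging is expected to attach through the callback hooks (trigger_callbacks('on_batch_end') fires once per iteration). A minimal driver sketch, assuming model and a train_dataset of (x, y) tensor pairs already exist; the iteration budget here is illustrative:

config = Trainer.get_default_config()
config.batch_size = 100        # the batch size quoted at the top
config.max_iters = 500000      # hypothetical budget; the default None runs forever

trainer = Trainer(config, model, train_dataset)

def batch_end_callback(trainer):
    # fires via trigger_callbacks('on_batch_end') once per iteration
    if trainer.iter_num % 100 == 0:
        print(f"iter {trainer.iter_num}: loss {trainer.loss.item():.4f} "
              f"({trainer.iter_dt * 1000:.0f} ms/iter)")

trainer.set_callback('on_batch_end', batch_end_callback)
trainer.run()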
Tokenizer
import json

import pandas as pd
from tqdm import tqdm

class MyTokenizer:
    """Character-level tokenizer backed by vocab2id / id2vocab dictionaries."""

    def __init__(self, vocab, token, max_len=180):
        self.vocab = vocab      # char -> id
        self.token = token      # id -> char
        self.max_len = max_len

    def __call__(self, text):
        # map each character to its id; characters outside the vocabulary are silently dropped
        ids = []
        for ch in text:
            if ch in self.vocab:
                ids.append(self.vocab[ch])
        return ids

    def convert_id_2_token(self, ids):
        token = []
        for idx in ids:
            try:
                token.append(self.token[idx])
            except KeyError:
                print('the error:', idx)
                exit()
        return "".join(token)

    def get_vocab_size(self):
        # total number of ids, including the three reserved tokens (ids 0-2)
        return len(self.token)
def load_resource(df, vocabsize=50257):
    """Build the character vocabulary from the (prompt, info) corpus and dump it to JSON."""
    word_f = {}
    vocab = {}
    token = {}
    # count character frequencies over the whole corpus
    for index, row in tqdm(df.iterrows(), desc='building the vocab library'):
        line = row['prompt'] + row['info']
        for ch in line:
            word_f[ch] = word_f.get(ch, 0) + 1
    # most frequent characters get the smallest ids
    word_f = sorted(word_f.items(), key=lambda kv: (kv[1], kv[0]), reverse=True)
    # ids 0-2 are reserved for special tokens
    token[0] = ""
    token[1] = "<|endoftext|>"
    token[2] = "<pad>"
    idx = 3
    for ch, _freq in word_f:
        vocab[ch] = idx
        token[idx] = ch
        idx += 1
        if idx >= vocabsize:
            break
    with open("vocab2id.json", "w") as f:
        json.dump(vocab, f)
    with open("id2vocab.json", "w") as f:
        json.dump(token, f)
def get_vocab_and_token():
    # load the JSON files produced by load_resource (paths are machine-specific)
    with open('/AI_TEAM/colinyang/minGPT/data/vocab2id.json', 'r') as f:
        v2id = json.load(f)
    with open('/AI_TEAM/colinyang/minGPT/data/id2vocab.json', 'r') as f:
        id2v = json.load(f)
    # JSON keys are strings; convert them back to integer ids
    id2v = {int(k): v for k, v in id2v.items()}
    return v2id, id2v
if __name__ == "__main__":
my_df = pd.read_pickle('baidu_wrangling.pkl')
load_resource(my_df)
vocab2id, id2vocab = get_vocab_and_token()
print(len(id2vocab))
print(id2vocab[2])
Tokenizer = MyTokenizer(vocab2id, id2vocab)
text = "科大讯飞"
ids = Tokenizer(text)
ids.append(1)
print(ids)
print(Tokenizer.convert_id_2_token(ids))
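One caveat: MyTokenizer.__call__ silently drops characters that are missing from the vocabulary, so rare characters disappear from the input without any warning. A sketch of a stricter variant (my own addition, not part of the original code, assuming reusing the <pad> id for unknowns is acceptable):

class MyTokenizerWithUnk(MyTokenizer):
    UNK_ID = 2  # reuse the <pad> id for unknown characters (an assumption, not the original convention)

    def __call__(self, text):
        # keep one id per input character instead of dropping out-of-vocabulary ones
        return [self.vocab.get(ch, self.UNK_ID) for ch in text]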