My GPT
This is a GPT-2 model trained from scratch on Baidu Encyclopedia (Baidu Baike) data.
parameter size: 124M
batch size: 100
context size: 1024
n_layers: 12
n_head: 12
n_embed: 768
vocab_size: 50257 (GPT-2's default; the character vocabulary built from the Baidu corpus below has 17,543 entries)
wte: 50257 × 768 (token embedding)
wpe: 1024 × 768 (position embedding)
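For reference, these sizes are consistent with the quoted 124M parameters. A quick sanity check of the arithmetic (a sketch assuming the standard GPT-2 layout: tied input/output embeddings, 4× MLP expansion, two LayerNorms per block plus a final one):

n_layer, n_embd, vocab_size, block_size = 12, 768, 50257, 1024

wte = vocab_size * n_embd              # token embeddings: 38,597,376
wpe = block_size * n_embd              # position embeddings: 786,432
attn = (n_embd * 3 * n_embd + 3 * n_embd) + (n_embd * n_embd + n_embd)      # qkv + output projection
mlp = (n_embd * 4 * n_embd + 4 * n_embd) + (4 * n_embd * n_embd + n_embd)   # 4x expansion and back
block = attn + mlp + 2 * 2 * n_embd    # plus two LayerNorms (weight + bias each)
total = wte + wpe + n_layer * block + 2 * n_embd   # final LayerNorm; lm_head shares wte
print(f"{total:,}")                    # 124,439,808 ≈ 124M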
Generator (Inference)
import torch

# GPT, MyTokenizer, and get_vocab_and_token come from the project's modules
# (imports omitted in the original)

def generate(prompt='', num_samples=10, steps=20, do_sample=True):
    # tokenize the input prompt into an integer id sequence
    vocab2id, id2vocab = get_vocab_and_token()
    tokenizer = MyTokenizer(vocab2id, id2vocab)
    x = tokenizer(prompt)
    x = torch.tensor(x, dtype=torch.long).unsqueeze(0).to(device)
    # we'll process all desired num_samples in one batch, so expand out the batch dim
    x = x.expand(num_samples, -1)
    # autoregressively forward the model `steps` times, sampling one token per step
    y = model.generate(x, max_new_tokens=steps, do_sample=do_sample, top_k=40)
    for i in range(num_samples):
        pred_idxs = y[i].cpu().squeeze().tolist()
        out = tokenizer.convert_id_2_token(pred_idxs)
        print('-' * 80)
        print(out)
if __name__ == '__main__':
    # set_seed(3407)
    model_config = GPT.get_default_config()
    model_config.model_type = 'gpt2'
    model_config.vocab_size = 17543  # character vocabulary built from the Baidu corpus (OpenAI's GPT-2 uses 50257)
    # block_size must match training; prompt_length and info_length are defined
    # elsewhere in the project (lengths of the prompt and info fields)
    model_config.block_size = (prompt_length + info_length) - 1
    model = GPT(model_config)
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.load_state_dict(torch.load('model.bin', map_location=device))
    model.to(device)
    model.eval()
    generate(prompt='什么是唐诗宋词?', num_samples=5, steps=200)  # "What are Tang poems and Song lyrics?"
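model.generate is minGPT's autoregressive sampling loop: each of the `steps` iterations feeds the sequence through the model, keeps only the top_k=40 highest-scoring logits at the last position, and samples the next id from the renormalized distribution. A minimal sketch of that per-step rule (my paraphrase of minGPT's logic, not code from this project):

import torch
import torch.nn.functional as F

@torch.no_grad()
def sample_next_token(logits, top_k=40, temperature=1.0):
    # logits: (batch, seq_len, vocab_size); only the final position matters
    logits = logits[:, -1, :] / temperature
    # mask out everything below the k-th largest logit
    v, _ = torch.topk(logits, top_k)
    logits[logits < v[:, [-1]]] = -float('inf')
    probs = F.softmax(logits, dim=-1)
    return torch.multinomial(probs, num_samples=1)  # (batch, 1) next-token ids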
Trainer
import time
import json
from collections import defaultdict

import torch
from torch.utils.data import DataLoader

# CN (minGPT's CfgNode config object), MyTokenizer, and CRAWLERLOGGER come from
# the project's modules (imports omitted in the original)

class Trainer:

    @staticmethod
    def get_default_config():
        C = CN()
        # device to train on
        C.device = 'auto'
        # dataloader parameters
        C.num_workers = 4
        # optimizer parameters
        C.max_iters = None
        C.batch_size = 64
        C.learning_rate = 3e-4
        C.betas = (0.9, 0.95)
        C.weight_decay = 0.1  # only applied on matmul weights
        C.grad_norm_clip = 1.0
        return C
    def __init__(self, config, model, train_dataset):
        self.config = config
        self.model = model
        self.optimizer = None
        self.train_dataset = train_dataset
        self.callbacks = defaultdict(list)
        # determine the device we'll train on
        if config.device == 'auto':
            self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        else:
            self.device = config.device
        self.model = self.model.to(self.device)
        print("running on device", self.device)
        # variables assigned to the trainer instance later, for logging etc.
        self.iter_num = 0
        self.iter_time = 0.0
        self.iter_dt = 0.0
    def add_callback(self, onevent: str, callback):
        self.callbacks[onevent].append(callback)

    def set_callback(self, onevent: str, callback):
        self.callbacks[onevent] = [callback]

    def trigger_callbacks(self, onevent: str):
        # invoke every callback registered for this event
        for callback in self.callbacks.get(onevent, []):
            callback(self)
    def run(self):
        model, config = self.model, self.config

        # set up the optimizer
        self.optimizer = model.configure_optimizers(config)

        # set up the dataloader; sampling with replacement gives an effectively infinite stream
        train_loader = DataLoader(
            self.train_dataset,
            sampler=torch.utils.data.RandomSampler(self.train_dataset,
                                                   replacement=True,
                                                   num_samples=int(1e10)),
            shuffle=False,
            pin_memory=True,
            batch_size=config.batch_size,
            num_workers=config.num_workers,
        )

        model.train()
        self.iter_num = 0
        self.iter_time = time.time()

        # Chinese probe prompts used to spot-check generation quality during training,
        # e.g. "Explain the meaning of laws and regulations", "What is Genshin Impact",
        # "What is speech recognition", ...
        question_lst = ['解释一下法律法规的意思', '介绍一下原神是什么', '解释一下核心价值的意思',
                        '解释一下发达国家的意思', '解释一下汽车电子的意思', '解释一下技术革命的意思',
                        '什么是资源整合', '啥是慈善活动', '能给我讲讲青年一代吗',
                        '给我介绍下中华民族', '语音识别的含义是什么', '请说明一下奥林匹克运动会的含义',
                        '能给我讲讲科技成果是什么']
        with open('data/vocab2id.json', 'r') as f:
            vocab2id = json.load(f)
        with open('data/id2vocab.json', 'r') as f:
            id2vocab = json.load(f)
        # JSON keys are strings; convert them back to integer ids
        id2vocab = {int(k): v for k, v in id2vocab.items()}
        tokenizer = MyTokenizer(vocab2id, id2vocab)
        question_token = [tokenizer(q) for q in question_lst]

        data_iter = iter(train_loader)
        while True:
            # fetch the next batch (x, y) and re-init the iterator if needed
            try:
                batch = next(data_iter)
            except StopIteration:
                data_iter = iter(train_loader)
                batch = next(data_iter)
            batch = [t.to(self.device) for t in batch]
            x, y = batch

            # forward the model
            logits, self.loss = model(x, y)

            # backprop and update the parameters
            model.zero_grad(set_to_none=True)
            self.loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), config.grad_norm_clip)
            self.optimizer.step()

            self.trigger_callbacks('on_batch_end')
            self.iter_num += 1

            # periodically sample from the model on the probe prompts
            if self.iter_num % 10000 == 0:
                model.eval()  # dropout off while sampling; restored to train mode below
                with torch.no_grad():
                    for question in question_token:
                        my_question = torch.tensor(question, dtype=torch.long).unsqueeze(0).to(self.device)
                        pred = model.generate(my_question, max_new_tokens=100, do_sample=True, top_k=40)
                        my_ids = pred[0].cpu().tolist()
                        CRAWLERLOGGER.debug(f"the prediction ids: {my_ids}")
                        out = tokenizer.convert_id_2_token(ids=my_ids)
                        CRAWLERLOGGER.debug(f"the prediction text: {out}")
                model.train()

            tnow = time.time()
            self.iter_dt = tnow - self.iter_time
            self.iter_time = tnow

            # termination condition
            if config.max_iters is not None and self.iter_num >= config.max_iters:
                break
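Note that run() itself never prints the loss; logging is expected to attach through the callback hooks (trigger_callbacks('on_batch_end') fires once per iteration). A minimal driver sketch, assuming model and a train_dataset of (x, y) tensor pairs already exist; the iteration budget here is illustrative:

config = Trainer.get_default_config()
config.batch_size = 100        # the batch size quoted at the top
config.max_iters = 500000      # hypothetical budget; the default None runs forever

trainer = Trainer(config, model, train_dataset)

def batch_end_callback(trainer):
    # fires via trigger_callbacks('on_batch_end') once per iteration
    if trainer.iter_num % 100 == 0:
        print(f"iter {trainer.iter_num}: loss {trainer.loss.item():.4f} "
              f"({trainer.iter_dt * 1000:.0f} ms/iter)")

trainer.set_callback('on_batch_end', batch_end_callback)
trainer.run()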
Tokenizer
import json

import pandas as pd
from tqdm import tqdm

class MyTokenizer:
    """Character-level tokenizer backed by vocab2id / id2vocab dictionaries."""

    def __init__(self, vocab, token, max_len=180):
        self.vocab = vocab      # char -> id
        self.token = token      # id -> char
        self.max_len = max_len

    def __call__(self, text):
        # map each character to its id; characters outside the vocabulary are silently dropped
        ids = []
        for ch in text:
            if ch in self.vocab:
                ids.append(self.vocab[ch])
        return ids

    def convert_id_2_token(self, ids):
        token = []
        for idx in ids:
            try:
                token.append(self.token[idx])
            except KeyError:
                print('the error:', idx)
                exit()
        return "".join(token)

    def get_vocab_size(self):
        # total number of ids, including the three reserved tokens (ids 0-2)
        return len(self.token)
def load_resource(df, vocabsize=50257):
    """Build the character vocabulary from the (prompt, info) corpus and dump it to JSON."""
    word_f = {}
    vocab = {}
    token = {}
    # count character frequencies over the whole corpus
    for index, row in tqdm(df.iterrows(), desc='building the vocab library'):
        line = row['prompt'] + row['info']
        for ch in line:
            word_f[ch] = word_f.get(ch, 0) + 1
    # most frequent characters get the smallest ids
    word_f = sorted(word_f.items(), key=lambda kv: (kv[1], kv[0]), reverse=True)
    # ids 0-2 are reserved for special tokens
    token[0] = ""
    token[1] = "<|endoftext|>"
    token[2] = "<pad>"
    idx = 3
    for ch, _freq in word_f:
        vocab[ch] = idx
        token[idx] = ch
        idx += 1
        if idx >= vocabsize:
            break
    with open("vocab2id.json", "w") as f:
        json.dump(vocab, f)
    with open("id2vocab.json", "w") as f:
        json.dump(token, f)
def get_vocab_and_token():
    # load the JSON files produced by load_resource (paths are machine-specific)
    with open('/AI_TEAM/colinyang/minGPT/data/vocab2id.json', 'r') as f:
        v2id = json.load(f)
    with open('/AI_TEAM/colinyang/minGPT/data/id2vocab.json', 'r') as f:
        id2v = json.load(f)
    # JSON keys are strings; convert them back to integer ids
    id2v = {int(k): v for k, v in id2v.items()}
    return v2id, id2v
if __name__ == "__main__":
my_df = pd.read_pickle('baidu_wrangling.pkl')
load_resource(my_df)
vocab2id, id2vocab = get_vocab_and_token()
print(len(id2vocab))
print(id2vocab[2])
Tokenizer = MyTokenizer(vocab2id, id2vocab)
text = "科大讯飞"
ids = Tokenizer(text)
ids.append(1)
print(ids)
print(Tokenizer.convert_id_2_token(ids))
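One caveat: MyTokenizer.__call__ silently drops characters that are missing from the vocabulary, so rare characters disappear from the input without any warning. A sketch of a stricter variant (my own addition, not part of the original code, assuming reusing the <pad> id for unknowns is acceptable):

class MyTokenizerWithUnk(MyTokenizer):
    UNK_ID = 2  # reuse the <pad> id for unknown characters (an assumption, not the original convention)

    def __call__(self, text):
        # keep one id per input character instead of dropping out-of-vocabulary ones
        return [self.vocab.get(ch, self.UNK_ID) for ch in text]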