gpt-neo-125M-dutch / replace_token_script.py
'''This script was used to replace the token at the final index of tokenizer.json and
vocab.json with the "<|endoftext|>" token, and to drop the corresponding final merge
from tokenizer.json and merges.txt so they stay consistent with the new vocabulary.'''
import json
tokenizer_path = 'tokenizer.json'
model_config_path = 'config.json'
vocab_path = 'vocab.json'
with open(vocab_path, "r") as f:
    vocab_data = json.load(f)
with open(tokenizer_path, "r") as f:
    tokenizer_data = json.load(f)
with open(model_config_path, "r") as f:
    model_config = json.load(f)
model_vocab_size = model_config['vocab_size']
tokenizer_vocab = tokenizer_data['model']['vocab']
mergeslength = len(tokenizer_data['model']['merges'])
# set the 'id' of the last added_tokens entry to model_vocab_size - 1
# (assumption: the last added_tokens entry in tokenizer.json is the <|endoftext|> special token)
tokenizer_data['added_tokens'][-1]['id'] = model_vocab_size - 1
final_index = model_vocab_size - 1
eos = '<|endoftext|>'
# retrieve the keys currently sitting at the final index (and the last merge) for inspection
old_key_final_index_tokenizer = list(tokenizer_data['model']['vocab'].keys())[final_index]
old_key_final_index_vocab = list(vocab_data.keys())[final_index]
old_key_final_index_vocab_min2 = list(vocab_data.keys())[final_index - 1]
old_key_final_index_tokenizer_merges = tokenizer_data['model']['merges'][mergeslength - 1]
print(f"old_key_final_index_tokenizer = {old_key_final_index_tokenizer}")
print(f"old_key_final_index_vocab = {old_key_final_index_vocab}")
print(f"old_key_final_index_vocab_min2 = {old_key_final_index_vocab_min2}")
print(f"old_key_final_index_tokenizer_merges = {old_key_final_index_tokenizer_merges}")
# map the eos token to the value of the old final-index key (the old key is deleted below)
tokenizer_data['model']['vocab'][eos] = tokenizer_data['model']['vocab'][old_key_final_index_tokenizer]
vocab_data[eos] = vocab_data[old_key_final_index_vocab]
# drop the final merge rule, which corresponds to the replaced final-index token
tokenizer_data['model']['merges'] = tokenizer_data['model']['merges'][: mergeslength - 1]
#delete old key
del tokenizer_data['model']['vocab'][old_key_final_index_tokenizer]
del vocab_data[old_key_final_index_vocab]
# check the updated keys and the new final merge
old_key_final_index_tokenizer = list(tokenizer_data['model']['vocab'].keys())[final_index]
old_key_final_index_vocab = list(vocab_data.keys())[final_index]
old_key_final_index_tokenizer_merges = tokenizer_data['model']['merges'][mergeslength - 2]
print(len(tokenizer_data['model']['merges']))
print()
print(f"updated old_key_final_index_tokenizer = {old_key_final_index_tokenizer}")
print(f"updated old_key_final_index_vocab = {old_key_final_index_vocab}")
print(f"updated old_key_final_index_tokenizer_merges = {old_key_final_index_tokenizer_merges}")
with open(tokenizer_path, "w") as f:
    json.dump(tokenizer_data, f)
with open(vocab_path, "w") as f:
    json.dump(vocab_data, f)
# rewrite merges.txt without its final line, mirroring the merge dropped from tokenizer.json
with open('merges.txt') as f:
    lines = f.readlines()
with open("merges.txt", "w") as f:
    for i in range(len(lines) - 1):
        f.write(lines[i])
with open('merges.txt') as f:
    newlines = f.readlines()
print(f"newlines[len(newlines) - 1] = {newlines[len(newlines) - 1]}")