yhavinga commited on
Commit
abb7aba
1 Parent(s): 806921f

Changes after running replace_token_script.py

Browse files
Files changed (4) hide show
  1. merges.txt +0 -1
  2. replace_token_script.py +80 -0
  3. tokenizer.json +0 -0
  4. vocab.json +0 -0
merges.txt CHANGED
@@ -49994,4 +49994,3 @@ ar it
49994
  ov na
49995
  ĠLe f
49996
  Ġmoed willig
49997
- Ġgeïn teresseerde
 
49994
  ov na
49995
  ĠLe f
49996
  Ġmoed willig
 
replace_token_script.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ''''This script was used to replace the final index of tokenizer.json and vocab.json
2
+ with "<|endoftext|>" token. Also reassociate the corresponding merges'''
3
+
4
+ import json
5
+
6
+ tokenizer_path = 'tokenizer.json'
7
+ model_config_path = 'config.json'
8
+ vocab_path = 'vocab.json'
9
+
10
+ with open(vocab_path, "r") as f:
11
+ vocab_data = json.load(f)
12
+
13
+ with open(tokenizer_path, "r") as f:
14
+ tokenizer_data = json.load(f)
15
+
16
+ with open(model_config_path, "r") as f:
17
+ model_config = json.load(f)
18
+
19
+ model_vocab_size = model_config['vocab_size']
20
+ tokenizer_vocab = tokenizer_data['model']['vocab']
21
+
22
+ mergeslength = len(tokenizer_data['model']['merges'])
23
+
24
+ #readjust added_tokens 'id' to model_vocab_size - 1
25
+ tokenizer_data['added_tokens'][-1]['id'] = model_vocab_size - 1
26
+
27
+ final_index = model_vocab_size - 1
28
+ eos = '<|endoftext|>'
29
+
30
+ #retrieve the key of final index
31
+ old_key_final_index_tokenizer = list(tokenizer_data['model']['vocab'].keys())[final_index]
32
+ old_key_final_index_vocab = list(vocab_data.keys())[final_index]
33
+ old_key_final_index_vocab_min2 = list(vocab_data.keys())[final_index - 1]
34
+ old_key_final_index_tokenizer_merges = tokenizer_data['model']['merges'][mergeslength - 1]
35
+
36
+ print(f"old_key_final_index_tokenizer = {old_key_final_index_tokenizer}")
37
+ print(f"old_key_final_index_vocab = {old_key_final_index_vocab}")
38
+ print(f"old_key_final_index_vocab_min2 = {old_key_final_index_vocab_min2}")
39
+ print(f"old_key_final_index_tokenizer_merges = {old_key_final_index_tokenizer_merges}")
40
+
41
+ #replace old key with new key
42
+ tokenizer_data['model']['vocab']['<|endoftext|>'] = tokenizer_data['model']['vocab'][old_key_final_index_tokenizer]
43
+ vocab_data[eos] = vocab_data[old_key_final_index_vocab]
44
+
45
+ #replace the final merges idx with vocab_data - 1
46
+ tokenizer_data['model']['merges'] = tokenizer_data['model']['merges'][: mergeslength - 1]
47
+
48
+
49
+ #delete old key
50
+ del tokenizer_data['model']['vocab'][old_key_final_index_tokenizer]
51
+ del vocab_data[old_key_final_index_vocab]
52
+
53
+ #check updated key
54
+ old_key_final_index_tokenizer = list(tokenizer_data['model']['vocab'].keys())[final_index]
55
+ old_key_final_index_vocab = list(vocab_data.keys())[final_index]
56
+ old_key_final_index_tokenizer_merges = tokenizer_data['model']['merges'][mergeslength - 2]
57
+
58
+ print(len(tokenizer_data['model']['merges']))
59
+ print()
60
+ print(f"updated old_key_final_index_tokenizer = {old_key_final_index_tokenizer}")
61
+ print(f"updated old_key_final_index_vocab = {old_key_final_index_vocab}")
62
+ print(f"updated old_key_final_index_tokenizer_merges = {old_key_final_index_tokenizer_merges}")
63
+
64
+ with open(tokenizer_path, "w")as f:
65
+ json.dump(tokenizer_data, f)
66
+
67
+ with open(vocab_path, "w")as f:
68
+ json.dump(vocab_data, f)
69
+
70
+ with open('merges.txt') as f:
71
+ lines = f.readlines()
72
+
73
+ with open("merges.txt", "w") as f:
74
+ for i in range(len(lines) - 1):
75
+ f.write(lines[i])
76
+
77
+ with open('merges.txt') as f:
78
+ newlines = f.readlines()
79
+
80
+ print(f"newlines[len(newlines) - 1] = {newlines[len(newlines) - 1]}")
tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
 
vocab.json CHANGED
The diff for this file is too large to render. See raw diff