turboderp committed on
Commit
3c2fd60
·
verified ·
1 Parent(s): 83e48c2

Upload vocab_transplant.py

Browse files
Files changed (1) hide show
  1. vocab_transplant.py +53 -0
vocab_transplant.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Transplant the vocabulary of one model's tokenizer onto another model.

Loads the causal LM in ``source_dir``, then builds new embedding and LM-head
matrices indexed by the token ids of the tokenizer in ``target_dir``. Each
target token is decoded to text, re-encoded with the source tokenizer, and
initialized to the mean of the source rows for the resulting token ids. The
modified model and the target tokenizer are written to ``output_dir``.
"""

import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModelForCausalLM

source_dir = "/mnt/str/models/qwen2-0.5b-instruct"
target_dir = "/mnt/str/models/llama3-70b-instruct"
output_dir = "/mnt/str/temp/transplant"

# Load model and tokenizers
model = AutoModelForCausalLM.from_pretrained(source_dir, device_map = "auto")
tokenizer_source = AutoTokenizer.from_pretrained(source_dir)
tokenizer_target = AutoTokenizer.from_pretrained(target_dir)
tied = model.config.tie_word_embeddings
target_vocab_size = max(tokenizer_target.vocab.values()) + 1  # vocab_size member seems to be unreliable

# Embedding tensor
old_emb = model.model.embed_tokens.weight
new_emb = torch.empty((target_vocab_size, model.config.hidden_size),
                      dtype = old_emb.dtype, device = old_emb.device)

# Head tensor
old_head = model.lm_head.weight
new_head = torch.empty((target_vocab_size, model.config.hidden_size),
                       dtype = old_head.dtype, device = old_head.device)

# Initialize new tensors. The source weights require grad, so run under no_grad to avoid recording an
# autograd node for every row that is written.
with torch.no_grad():

    # Rows used for target ids that map to no source tokens at all (e.g. unused ids that decode to an
    # empty string); the mean over an empty selection would otherwise be NaN.
    fallback_emb = old_emb.mean(dim = 0)
    fallback_head = old_head.mean(dim = 0)

    for idx in range(target_vocab_size):
        # Keep special tokens in the decoded text so they are re-tokenized by the source tokenizer too
        text = tokenizer_target.decode([idx], skip_special_tokens = False)
        source_ids = tokenizer_source.encode(text, add_special_tokens = False)

        if not source_ids:
            new_emb[idx] = fallback_emb
            new_head[idx] = fallback_head
            continue

        emb_rows = torch.tensor(source_ids, dtype = torch.long, device = old_emb.device)
        head_rows = torch.tensor(source_ids, dtype = torch.long, device = old_head.device)
        new_emb[idx] = old_emb[emb_rows].mean(dim = 0)
        new_head[idx] = old_head[head_rows].mean(dim = 0)

# Replace embedding tensor
model.model.embed_tokens.weight = nn.Parameter(new_emb, requires_grad = False)
model.model.embed_tokens.num_embeddings = target_vocab_size

# Replace head tensor (sized from target_vocab_size, like everything else, so it matches new_head)
model.lm_head.weight = nn.Parameter(new_head, requires_grad = False)
model.lm_head.out_features = target_vocab_size

# Update model
model.vocab_size = target_vocab_size
model.config.vocab_size = target_vocab_size
model.config.bos_token_id = tokenizer_target.bos_token_id
model.config.eos_token_id = tokenizer_target.eos_token_id

# Save
model.save_pretrained(output_dir, tie_word_embeddings = tied)
tokenizer_target.save_pretrained(output_dir)

# Saving the state dict directly is more reliable, since save_pretrained seems to give you a messed up
# model with some architectures, but it requires manually copying and modifying config.json etc.
# (needs `import os` and presumably `from safetensors.torch import save_file`):
#
# save_file(model.state_dict(), os.path.join(output_dir, "model.safetensors"), metadata = {'format': 'pt'})