Fix model file path to match repo structure
Browse files
training_files/convert-hf-to-pth-16b.py
CHANGED
@@ -1,14 +1,14 @@
|
|
1 |
-
#Convert hf to pth
|
2 |
import os
|
3 |
import json
|
4 |
|
5 |
import torch
|
6 |
from transformers import LlamaTokenizer, LlamaForCausalLM
|
7 |
|
8 |
-
tokenizer = LlamaTokenizer.from_pretrained("
|
9 |
|
10 |
base_model = LlamaForCausalLM.from_pretrained(
|
11 |
-
"
|
12 |
load_in_8bit=False,
|
13 |
torch_dtype=torch.float16,
|
14 |
device_map={"": "cpu"},
|
@@ -29,18 +29,21 @@ n_heads = params["n_heads"]
|
|
29 |
dim = params["dim"]
|
30 |
dims_per_head = dim // n_heads
|
31 |
base = 10000.0
|
32 |
-
inv_freq = 1.0 / (base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head))
|
|
|
33 |
|
34 |
|
35 |
def permute(w):
|
36 |
return (
|
37 |
-
w.view(n_heads, dim // n_heads // 2, 2, dim).transpose(1, 2).reshape(dim, dim)
|
|
|
38 |
)
|
39 |
|
40 |
|
41 |
def unpermute(w):
|
42 |
return (
|
43 |
-
w.view(n_heads, 2, dim // n_heads // 2, dim).transpose(1, 2).reshape(dim, dim)
|
|
|
44 |
)
|
45 |
|
46 |
|
@@ -96,7 +99,7 @@ torch.save(new_state_dict, "consolidated.00.pth")
|
|
96 |
with open("params.json", "w") as f:
|
97 |
json.dump(params, f)
|
98 |
|
99 |
-
#Resize tensors
|
100 |
model = torch.load("consolidated.00.pth", map_location=torch.device('cpu'))
|
101 |
x = model["tok_embeddings.weight"]
|
102 |
y = model["output.weight"]
|
@@ -106,4 +109,4 @@ y = y[:row_exclude]
|
|
106 |
model["tok_embeddings.weight"] = x
|
107 |
model["output.weight"] = y
|
108 |
torch.save(model, "consolidated.01.pth")
|
109 |
-
#Delete consolidated.00.pth and rename consolidated.01.pth into consolidated.00.pth
|
|
|
1 |
+
# Convert hf to pth
|
2 |
import os
|
3 |
import json
|
4 |
|
5 |
import torch
|
6 |
from transformers import LlamaTokenizer, LlamaForCausalLM
|
7 |
|
8 |
+
tokenizer = LlamaTokenizer.from_pretrained("../7B-2nd-train")
|
9 |
|
10 |
base_model = LlamaForCausalLM.from_pretrained(
|
11 |
+
"../7B-2nd-train",
|
12 |
load_in_8bit=False,
|
13 |
torch_dtype=torch.float16,
|
14 |
device_map={"": "cpu"},
|
|
|
29 |
dim = params["dim"]
|
30 |
dims_per_head = dim // n_heads
|
31 |
base = 10000.0
|
32 |
+
inv_freq = 1.0 / \
|
33 |
+
(base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head))
|
34 |
|
35 |
|
36 |
def permute(w):
|
37 |
return (
|
38 |
+
w.view(n_heads, dim // n_heads // 2, 2,
|
39 |
+
dim).transpose(1, 2).reshape(dim, dim)
|
40 |
)
|
41 |
|
42 |
|
43 |
def unpermute(w):
|
44 |
return (
|
45 |
+
w.view(n_heads, 2, dim // n_heads // 2,
|
46 |
+
dim).transpose(1, 2).reshape(dim, dim)
|
47 |
)
|
48 |
|
49 |
|
|
|
99 |
with open("params.json", "w") as f:
|
100 |
json.dump(params, f)
|
101 |
|
102 |
+
# Resize tensors
|
103 |
model = torch.load("consolidated.00.pth", map_location=torch.device('cpu'))
|
104 |
x = model["tok_embeddings.weight"]
|
105 |
y = model["output.weight"]
|
|
|
109 |
model["tok_embeddings.weight"] = x
|
110 |
model["output.weight"] = y
|
111 |
torch.save(model, "consolidated.01.pth")
|
112 |
+
# Delete consolidated.00.pth and rename consolidated.01.pth into consolidated.00.pth
|
training_files/convert-hf-to-pth-32b.py
CHANGED
@@ -1,14 +1,14 @@
|
|
1 |
-
#Convert hf to pth
|
2 |
import os
|
3 |
import json
|
4 |
|
5 |
import torch
|
6 |
from transformers import LlamaTokenizer, LlamaForCausalLM
|
7 |
|
8 |
-
tokenizer = LlamaTokenizer.from_pretrained("
|
9 |
|
10 |
base_model = LlamaForCausalLM.from_pretrained(
|
11 |
-
"
|
12 |
load_in_8bit=False,
|
13 |
torch_dtype=torch.float16,
|
14 |
device_map={"": "cpu"},
|
@@ -29,18 +29,21 @@ n_heads = params["n_heads"]
|
|
29 |
dim = params["dim"]
|
30 |
dims_per_head = dim // n_heads
|
31 |
base = 10000.0
|
32 |
-
inv_freq = 1.0 / (base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head))
|
|
|
33 |
|
34 |
|
35 |
def permute(w):
|
36 |
return (
|
37 |
-
w.view(n_heads, dim // n_heads // 2, 2, dim).transpose(1, 2).reshape(dim, dim)
|
|
|
38 |
)
|
39 |
|
40 |
|
41 |
def unpermute(w):
|
42 |
return (
|
43 |
-
w.view(n_heads, 2, dim // n_heads // 2, dim).transpose(1, 2).reshape(dim, dim)
|
|
|
44 |
)
|
45 |
|
46 |
|
|
|
1 |
+
# Convert hf to pth
|
2 |
import os
|
3 |
import json
|
4 |
|
5 |
import torch
|
6 |
from transformers import LlamaTokenizer, LlamaForCausalLM
|
7 |
|
8 |
+
tokenizer = LlamaTokenizer.from_pretrained("../7B-2nd-train")
|
9 |
|
10 |
base_model = LlamaForCausalLM.from_pretrained(
|
11 |
+
"../7B-2nd-train",
|
12 |
load_in_8bit=False,
|
13 |
torch_dtype=torch.float16,
|
14 |
device_map={"": "cpu"},
|
|
|
29 |
dim = params["dim"]
|
30 |
dims_per_head = dim // n_heads
|
31 |
base = 10000.0
|
32 |
+
inv_freq = 1.0 / \
|
33 |
+
(base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head))
|
34 |
|
35 |
|
36 |
def permute(w):
|
37 |
return (
|
38 |
+
w.view(n_heads, dim // n_heads // 2, 2,
|
39 |
+
dim).transpose(1, 2).reshape(dim, dim)
|
40 |
)
|
41 |
|
42 |
|
43 |
def unpermute(w):
|
44 |
return (
|
45 |
+
w.view(n_heads, 2, dim // n_heads // 2,
|
46 |
+
dim).transpose(1, 2).reshape(dim, dim)
|
47 |
)
|
48 |
|
49 |
|